Skip to content

Commit

Permalink
Merge pull request #95 from udomobi/develop
Browse files Browse the repository at this point in the history
Version 1.0.0
  • Loading branch information
Douglas Paz authored Oct 10, 2018
2 parents 4084d99 + b835dc5 commit 50bbf7d
Show file tree
Hide file tree
Showing 15 changed files with 540 additions and 187 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ omit =
*env*,
*__main__*,
bothub_nlp/cli/*.py
bothub_nlp/core/pipeline_components/intent_entity_featurizer_regex.py
setup.py

[report]
fail_under = 90
Expand Down
12 changes: 5 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
FROM ilha/scipy:python3.6-alpine3.7
FROM python:3.6.6

ENV WORKDIR /home/app
ENV IS_PRODUCTION true
ENV PORT 2657

WORKDIR $WORKDIR

RUN apk update && apk upgrade && \
apk add alpine-sdk postgresql-dev libpng openblas-dev freetype-dev libpng-dev

RUN pip install pipenv psycopg2-binary
RUN pip install pipenv
RUN pip install psycopg2-binary

COPY Pipfile .
COPY Pipfile.lock .
COPY Makefile .

RUN make check_environment
RUN make -s check_environment

COPY . .

Expand All @@ -25,7 +23,7 @@ RUN if [ ${DOWNLOAD_LANGUAGES_ON_DOCKER_IMAGE_BUILD} ]; \
fi
ENV DOWNLOADED_LANGUAGES ${DOWNLOAD_LANGUAGES_ON_DOCKER_IMAGE_BUILD}

RUN make import_ilha_spacy_langs CHECK_ENVIRONMENT=false
RUN make -s import_ilha_spacy_langs CHECK_ENVIRONMENT=false

RUN chmod +x ./entrypoint.sh
ENTRYPOINT $WORKDIR/entrypoint.sh
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ install_development_requirements:

install_production_requirements:
@echo "${INFO}Installing production requirements...${NC}"
@pipenv install --system
@pipenv install --system -v
@echo "${SUCCESS}${NC} Requirements installed"

development_mode_guard:
Expand Down
17 changes: 8 additions & 9 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,20 @@ verify_ssl = true
name = "pypi"

[packages]
tornado = "*"
bothub-engine = {editable = true, ref = "1.16.0b1", git = "https://github.com/Ilhasoft/bothub-engine"}
tornado = "==5.1.1"
contextvars = "==2.3"
raven = "==6.9.0"
spacy = "==2.0.12"
rasa-nlu = "==0.13.1"
scikit-learn = "*"
sklearn-crfsuite = "*"
scipy = "==1.1.0"
numpy = "==1.14.5"
bothub = {editable = true, ref = "1.15.3", git = "https://github.com/Ilhasoft/bothub-engine"}
tensorflow = "==1.11.0"
scikit-learn = "==0.20.0"
sklearn-crfsuite = "==0.3.6"
plac = "==0.9.6"
raven = "*"
pytz = "==2018.5"

[dev-packages]
"flake8" = "*"
coverage = "*"
"flake8" = "*"

[requires]
python_version = "3.6"
380 changes: 286 additions & 94 deletions Pipfile.lock

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions bothub_nlp/core/tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ..train import train_update
from ..parse import parse_text
from ..parse import format_parse_output
from ..parse import position_match
from ...tests.utils import fill_examples
from ...tests.utils import EXAMPLES_MOCKUP
from ...tests.utils import EXAMPLES_WITH_LABEL_MOCKUP
Expand Down Expand Up @@ -136,3 +137,57 @@ def test_entity_priority(self):
self.assertEqual(
len(out.get('entities')),
1)


class PositionMatchTestCase(TestCase):
    """Unit tests for the position_match() helper.

    position_match() is expected to report True only when two entity
    spans share both their start and their end offsets.
    """

    def test_match(self):
        # Identical spans must match.
        span = {'start': 0, 'end': 4}
        self.assertTrue(position_match(span, {'start': 0, 'end': 4}))

    def test_diff_start(self):
        # Same end, different start -> no match.
        left = {'start': 0, 'end': 4}
        right = {'start': 1, 'end': 4}
        self.assertFalse(position_match(left, right))

    def test_diff_end(self):
        # Same start, different end -> no match.
        left = {'start': 0, 'end': 4}
        right = {'start': 0, 'end': 3}
        self.assertFalse(position_match(left, right))

    def test_diff_twice(self):
        # Both offsets differ -> no match.
        left = {'start': 1, 'end': 4}
        right = {'start': 2, 'end': 3}
        self.assertFalse(position_match(left, right))
28 changes: 28 additions & 0 deletions bothub_nlp/core/tests/test_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,34 @@ def test_train(self):
self.assertIsNotNone(update.training_started_at)
self.assertIsNotNone(update.trained_at)

def test_train_without_language_model(self):
    """Training must still succeed with the language-model featurizer off."""
    self.repository.use_language_model_featurizer = False
    self.repository.save()
    fill_examples(EXAMPLES_MOCKUP, self.repository)

    update = self.repository.current_update()
    train_update(update, self.user)

    # The update records who triggered the training ...
    self.assertEqual(update.by.id, self.user.id)
    # ... and both training timestamps are set once training finishes.
    self.assertIsNotNone(update.training_started_at)
    self.assertIsNotNone(update.trained_at)

def test_train_competing_intents(self):
    """Training must succeed when competing intents are enabled."""
    self.repository.use_competing_intents = True
    self.repository.save()
    fill_examples(EXAMPLES_MOCKUP, self.repository)

    update = self.repository.current_update()
    train_update(update, self.user)

    # The update records who triggered the training ...
    self.assertEqual(update.by.id, self.user.id)
    # ... and both training timestamps are set once training finishes.
    self.assertIsNotNone(update.training_started_at)
    self.assertIsNotNone(update.trained_at)

def test_train_with_labels(self):
fill_examples(EXAMPLES_WITH_LABEL_MOCKUP, self.repository)
update = self.repository.current_update()
Expand Down
92 changes: 53 additions & 39 deletions bothub_nlp/core/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from django.db import models

from .utils import get_rasa_nlu_config_from_update
from .utils import PokeLogging
from .persistor import BothubPersistor
from . import logger


class BothubWriter(TrainingDataWriter):
Expand Down Expand Up @@ -56,42 +58,54 @@ def as_json(self, **kwargs):

def train_update(update, by):
    """Train an NLU model for *update*, capturing logs and failure state.

    NOTE(review): this span was scraped from a unified diff and contained
    BOTH the pre-change body and the new PokeLogging-wrapped body
    concatenated, so the training work would have run twice.  Only the
    coherent current version is kept here.

    :param update: repository update to train (provides examples,
        language and repository metadata).
    :param by: the user that triggered the training; recorded on the
        update via ``start_training``.
    :raises Exception: re-raises anything raised during training after
        marking the update as failed.
    """
    update.start_training(by)

    with PokeLogging() as pl:
        try:
            # Plain intent/entity training messages for every example.
            examples = [
                Message.build(
                    text=example.get_text(update.language),
                    intent=example.intent,
                    entities=[
                        example_entity.rasa_nlu_data
                        for example_entity in example.get_entities(
                            update.language)])
                for example in update.examples]

            # Only examples that actually carry labelled entities.
            label_examples_query = update.examples \
                .filter(entities__entity__label__isnull=False) \
                .annotate(entities_count=models.Count('entities')) \
                .filter(entities_count__gt=0)

            # Same examples re-encoded with the label acting as the entity.
            label_examples = [
                Message.build(
                    text=example.get_text(update.language),
                    entities=[
                        example_entity.get_rasa_nlu_data(
                            label_as_entity=True)
                        for example_entity in filter(
                            lambda ee: ee.entity.label,
                            example.get_entities(update.language))])
                for example in label_examples_query]

            rasa_nlu_config = get_rasa_nlu_config_from_update(update)
            trainer = Trainer(
                rasa_nlu_config,
                ComponentBuilder(use_cache=False))
            training_data = BothubTrainingData(
                label_training_examples=label_examples,
                training_examples=examples)

            trainer.train(training_data)

            persistor = BothubPersistor(update)
            trainer.persist(
                mkdtemp(),
                persistor=persistor,
                project_name=str(update.repository.uuid),
                fixed_model_name=str(update.id))
        except Exception as e:
            logger.exception(e)
            update.train_fail()
            raise e
        finally:
            # Always persist whatever was logged during the attempt,
            # even when training failed.
            update.training_log = pl.getvalue()
            update.save(update_fields=['training_log'])
90 changes: 75 additions & 15 deletions bothub_nlp/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import contextvars
import logging
import io
import spacy

from tempfile import mkdtemp
Expand All @@ -11,23 +14,47 @@


def get_rasa_nlu_config_from_update(update):
    """Build the RasaNLUModelConfig for a repository update.

    The pipeline is assembled dynamically from the update's settings:
    spaCy components when a spaCy tokenizer/featurizer is wanted, a
    TensorFlow-embedding intent classifier (with similarity type driven
    by ``use_competing_intents``), a CRF entity extractor and the custom
    label-as-entity extractor.

    NOTE(review): this span was scraped from a unified diff and
    contained a stale hard-coded ``'pipeline'`` list (the deleted side
    of the hunk) followed by ``'pipeline': pipeline`` — a duplicate dict
    key whose first value was silently discarded.  The dead list has
    been removed.
    """
    pipeline = []
    use_spacy_tokenizer = True  # TODO: future, check if has lang spacy model
    use_spacy_featurizer = update.use_language_model_featurizer
    use_spacy = use_spacy_tokenizer or use_spacy_featurizer

    # load spacy
    if use_spacy:
        pipeline.append({'name': 'bothub_nlp.core.pipeline_components.' +
                                 'spacy_nlp.SpacyNLP'})

    # tokenizer
    if use_spacy_tokenizer:
        pipeline.append({'name': 'bothub_nlp.core.pipeline_components.' +
                                 'tokenizer_spacy.SpacyTokenizer'})
    else:
        pipeline.append({'name': 'tokenizer_whitespace'})

    # featurizer
    if use_spacy_featurizer:
        pipeline.append({'name': 'intent_featurizer_spacy'})
    else:
        pipeline.append({'name': 'intent_featurizer_count_vectors'})

    # intent classifier: 'inner' similarity pushes competing intents
    # apart, 'cosine' is the default otherwise
    pipeline.append({
        'name': 'intent_classifier_tensorflow_embedding',
        'similarity_type': 'inner' if update.use_competing_intents else
        'cosine'
    })

    # entity extractor
    pipeline.append({'name': 'ner_crf'})

    # label extractor
    pipeline.append({'name': 'bothub_nlp.core.pipeline_components.' +
                             'crf_label_as_entity_extractor.' +
                             'CRFLabelAsEntityExtractor'})

    return RasaNLUModelConfig({
        'language': update.language,
        'pipeline': pipeline,
    })


Expand Down Expand Up @@ -104,3 +131,36 @@ def get(self, lang):
logger.info(f'loading {lang} spacy lang model...')
self.nlps[lang] = spacy.load(lang, parser=False)
return self.nlps[lang]


class PokeLoggingHandler(logging.StreamHandler):
def __init__(self, pl, *args, **kwargs):
super().__init__(*args, **kwargs)
self.pl = pl

def emit(self, record):
if self.pl.cxt.get(default=None) is self.pl:
super().emit(record)


class PokeLogging:
def __init__(self, loggingLevel=logging.DEBUG):
self.loggingLevel = loggingLevel

def __enter__(self):
self.cxt = contextvars.ContextVar(self.__class__.__name__)
self.cxt.set(self)
logging.captureWarnings(True)
self.logger = logging.getLogger()
self.logger.setLevel(self.loggingLevel)
self.stream = io.StringIO()
self.handler = PokeLoggingHandler(self, self.stream)
self.formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
self.handler.setLevel(self.loggingLevel)
self.handler.setFormatter(self.formatter)
self.logger.addHandler(self.handler)
return self.stream

def __exit__(self, *args):
self.logger.removeHandler(self.logger)
Loading

0 comments on commit 50bbf7d

Please sign in to comment.