Skip to content

Commit

Permalink
Merge pull request #95 from udomobi/develop
Browse files Browse the repository at this point in the history
Version 1.0.0
  • Loading branch information
Douglas Paz authored Oct 10, 2018
2 parents 4084d99 + b835dc5 commit 50bbf7d
Show file tree
Hide file tree
Showing 15 changed files with 540 additions and 187 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ omit =
*env*,
*__main__*,
bothub_nlp/cli/*.py
bothub_nlp/core/pipeline_components/intent_entity_featurizer_regex.py
setup.py

[report]
fail_under = 90
Expand Down
12 changes: 5 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
FROM ilha/scipy:python3.6-alpine3.7
FROM python:3.6.6

ENV WORKDIR /home/app
ENV IS_PRODUCTION true
ENV PORT 2657

WORKDIR $WORKDIR

RUN apk update && apk upgrade && \
apk add alpine-sdk postgresql-dev libpng openblas-dev freetype-dev libpng-dev

RUN pip install pipenv psycopg2-binary
RUN pip install pipenv
RUN pip install psycopg2-binary

COPY Pipfile .
COPY Pipfile.lock .
COPY Makefile .

RUN make check_environment
RUN make -s check_environment

COPY . .

Expand All @@ -25,7 +23,7 @@ RUN if [ ${DOWNLOAD_LANGUAGES_ON_DOCKER_IMAGE_BUILD} ]; \
fi
ENV DOWNLOADED_LANGUAGES ${DOWNLOAD_LANGUAGES_ON_DOCKER_IMAGE_BUILD}

RUN make import_ilha_spacy_langs CHECK_ENVIRONMENT=false
RUN make -s import_ilha_spacy_langs CHECK_ENVIRONMENT=false

RUN chmod +x ./entrypoint.sh
ENTRYPOINT $WORKDIR/entrypoint.sh
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ install_development_requirements:

install_production_requirements:
@echo "${INFO}Installing production requirements...${NC}"
@pipenv install --system
@pipenv install --system -v
@echo "${SUCCESS}${NC} Requirements installed"

development_mode_guard:
Expand Down
17 changes: 8 additions & 9 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,20 @@ verify_ssl = true
name = "pypi"

[packages]
tornado = "*"
bothub-engine = {editable = true, ref = "1.16.0b1", git = "https://github.com/Ilhasoft/bothub-engine"}
tornado = "==5.1.1"
contextvars = "==2.3"
raven = "==6.9.0"
spacy = "==2.0.12"
rasa-nlu = "==0.13.1"
scikit-learn = "*"
sklearn-crfsuite = "*"
scipy = "==1.1.0"
numpy = "==1.14.5"
bothub = {editable = true, ref = "1.15.3", git = "https://github.com/Ilhasoft/bothub-engine"}
tensorflow = "==1.11.0"
scikit-learn = "==0.20.0"
sklearn-crfsuite = "==0.3.6"
plac = "==0.9.6"
raven = "*"
pytz = "==2018.5"

[dev-packages]
"flake8" = "*"
coverage = "*"
"flake8" = "*"

[requires]
python_version = "3.6"
380 changes: 286 additions & 94 deletions Pipfile.lock

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions bothub_nlp/core/tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ..train import train_update
from ..parse import parse_text
from ..parse import format_parse_output
from ..parse import position_match
from ...tests.utils import fill_examples
from ...tests.utils import EXAMPLES_MOCKUP
from ...tests.utils import EXAMPLES_WITH_LABEL_MOCKUP
Expand Down Expand Up @@ -136,3 +137,57 @@ def test_entity_priority(self):
self.assertEqual(
len(out.get('entities')),
1)


class PositionMatchTestCase(TestCase):
    """Unit tests for the position_match() helper.

    position_match() is expected to report True only when two entity
    spans share both their start and their end offsets.
    """

    def test_match(self):
        # Identical spans must match.
        span = {'start': 0, 'end': 4}
        self.assertTrue(position_match(span, {'start': 0, 'end': 4}))

    def test_diff_start(self):
        # Same end, different start -> no match.
        left = {'start': 0, 'end': 4}
        right = {'start': 1, 'end': 4}
        self.assertFalse(position_match(left, right))

    def test_diff_end(self):
        # Same start, different end -> no match.
        left = {'start': 0, 'end': 4}
        right = {'start': 0, 'end': 3}
        self.assertFalse(position_match(left, right))

    def test_diff_twice(self):
        # Both offsets differ -> no match.
        left = {'start': 1, 'end': 4}
        right = {'start': 2, 'end': 3}
        self.assertFalse(position_match(left, right))
28 changes: 28 additions & 0 deletions bothub_nlp/core/tests/test_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,34 @@ def test_train(self):
self.assertIsNotNone(update.training_started_at)
self.assertIsNotNone(update.trained_at)

def test_train_without_language_model(self):
    """Training must still succeed with the language-model featurizer off."""
    self.repository.use_language_model_featurizer = False
    self.repository.save()
    fill_examples(EXAMPLES_MOCKUP, self.repository)

    update = self.repository.current_update()
    train_update(update, self.user)

    # The update records who triggered the training ...
    self.assertEqual(update.by.id, self.user.id)
    # ... and both training timestamps are set once training finishes.
    self.assertIsNotNone(update.training_started_at)
    self.assertIsNotNone(update.trained_at)

def test_train_competing_intents(self):
    """Training must succeed when competing intents are enabled."""
    self.repository.use_competing_intents = True
    self.repository.save()
    fill_examples(EXAMPLES_MOCKUP, self.repository)

    update = self.repository.current_update()
    train_update(update, self.user)

    # The update records who triggered the training ...
    self.assertEqual(update.by.id, self.user.id)
    # ... and both training timestamps are set once training finishes.
    self.assertIsNotNone(update.training_started_at)
    self.assertIsNotNone(update.trained_at)

def test_train_with_labels(self):
fill_examples(EXAMPLES_WITH_LABEL_MOCKUP, self.repository)
update = self.repository.current_update()
Expand Down
92 changes: 53 additions & 39 deletions bothub_nlp/core/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from django.db import models

from .utils import get_rasa_nlu_config_from_update
from .utils import PokeLogging
from .persistor import BothubPersistor
from . import logger


class BothubWriter(TrainingDataWriter):
Expand Down Expand Up @@ -56,42 +58,54 @@ def as_json(self, **kwargs):

def train_update(update, by):
    """Train an NLU model for *update*, capturing logs and failure state.

    NOTE(review): this span was scraped from a unified diff and contained
    BOTH the pre-change body and the new PokeLogging-wrapped body
    concatenated, so the training work would have run twice.  Only the
    coherent current version is kept here.

    :param update: repository update to train (provides examples,
        language and repository metadata).
    :param by: the user that triggered the training; recorded on the
        update via ``start_training``.
    :raises Exception: re-raises anything raised during training after
        marking the update as failed.
    """
    update.start_training(by)

    with PokeLogging() as pl:
        try:
            # Plain intent/entity training messages for every example.
            examples = [
                Message.build(
                    text=example.get_text(update.language),
                    intent=example.intent,
                    entities=[
                        example_entity.rasa_nlu_data
                        for example_entity in example.get_entities(
                            update.language)])
                for example in update.examples]

            # Only examples that actually carry labelled entities.
            label_examples_query = update.examples \
                .filter(entities__entity__label__isnull=False) \
                .annotate(entities_count=models.Count('entities')) \
                .filter(entities_count__gt=0)

            # Same examples re-encoded with the label acting as the entity.
            label_examples = [
                Message.build(
                    text=example.get_text(update.language),
                    entities=[
                        example_entity.get_rasa_nlu_data(
                            label_as_entity=True)
                        for example_entity in filter(
                            lambda ee: ee.entity.label,
                            example.get_entities(update.language))])
                for example in label_examples_query]

            rasa_nlu_config = get_rasa_nlu_config_from_update(update)
            trainer = Trainer(
                rasa_nlu_config,
                ComponentBuilder(use_cache=False))
            training_data = BothubTrainingData(
                label_training_examples=label_examples,
                training_examples=examples)

            trainer.train(training_data)

            persistor = BothubPersistor(update)
            trainer.persist(
                mkdtemp(),
                persistor=persistor,
                project_name=str(update.repository.uuid),
                fixed_model_name=str(update.id))
        except Exception as e:
            logger.exception(e)
            update.train_fail()
            raise e
        finally:
            # Always persist whatever was logged during the attempt,
            # even when training failed.
            update.training_log = pl.getvalue()
            update.save(update_fields=['training_log'])
90 changes: 75 additions & 15 deletions bothub_nlp/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import contextvars
import logging
import io
import spacy

from tempfile import mkdtemp
Expand All @@ -11,23 +14,47 @@


def get_rasa_nlu_config_from_update(update):
    """Build the RasaNLUModelConfig for a repository update.

    The pipeline is assembled dynamically from the update's settings:
    spaCy components when a spaCy tokenizer/featurizer is wanted, a
    TensorFlow-embedding intent classifier (with similarity type driven
    by ``use_competing_intents``), a CRF entity extractor and the custom
    label-as-entity extractor.

    NOTE(review): this span was scraped from a unified diff and
    contained a stale hard-coded ``'pipeline'`` list (the deleted side
    of the hunk) followed by ``'pipeline': pipeline`` — a duplicate dict
    key whose first value was silently discarded.  The dead list has
    been removed.
    """
    pipeline = []
    use_spacy_tokenizer = True  # TODO: future, check if has lang spacy model
    use_spacy_featurizer = update.use_language_model_featurizer
    use_spacy = use_spacy_tokenizer or use_spacy_featurizer

    # load spacy
    if use_spacy:
        pipeline.append({'name': 'bothub_nlp.core.pipeline_components.' +
                                 'spacy_nlp.SpacyNLP'})

    # tokenizer
    if use_spacy_tokenizer:
        pipeline.append({'name': 'bothub_nlp.core.pipeline_components.' +
                                 'tokenizer_spacy.SpacyTokenizer'})
    else:
        pipeline.append({'name': 'tokenizer_whitespace'})

    # featurizer
    if use_spacy_featurizer:
        pipeline.append({'name': 'intent_featurizer_spacy'})
    else:
        pipeline.append({'name': 'intent_featurizer_count_vectors'})

    # intent classifier: 'inner' similarity pushes competing intents
    # apart, 'cosine' is the default otherwise
    pipeline.append({
        'name': 'intent_classifier_tensorflow_embedding',
        'similarity_type': 'inner' if update.use_competing_intents else
        'cosine'
    })

    # entity extractor
    pipeline.append({'name': 'ner_crf'})

    # label extractor
    pipeline.append({'name': 'bothub_nlp.core.pipeline_components.' +
                             'crf_label_as_entity_extractor.' +
                             'CRFLabelAsEntityExtractor'})

    return RasaNLUModelConfig({
        'language': update.language,
        'pipeline': pipeline,
    })


Expand Down Expand Up @@ -104,3 +131,36 @@ def get(self, lang):
logger.info(f'loading {lang} spacy lang model...')
self.nlps[lang] = spacy.load(lang, parser=False)
return self.nlps[lang]


class PokeLoggingHandler(logging.StreamHandler):
def __init__(self, pl, *args, **kwargs):
super().__init__(*args, **kwargs)
self.pl = pl

def emit(self, record):
if self.pl.cxt.get(default=None) is self.pl:
super().emit(record)


class PokeLogging:
def __init__(self, loggingLevel=logging.DEBUG):
self.loggingLevel = loggingLevel

def __enter__(self):
self.cxt = contextvars.ContextVar(self.__class__.__name__)
self.cxt.set(self)
logging.captureWarnings(True)
self.logger = logging.getLogger()
self.logger.setLevel(self.loggingLevel)
self.stream = io.StringIO()
self.handler = PokeLoggingHandler(self, self.stream)
self.formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
self.handler.setLevel(self.loggingLevel)
self.handler.setFormatter(self.formatter)
self.logger.addHandler(self.handler)
return self.stream

def __exit__(self, *args):
self.logger.removeHandler(self.logger)
Loading

0 comments on commit 50bbf7d

Please sign in to comment.