This repository has been archived by the owner on May 25, 2020. It is now read-only.

Commit: Big update: Keras, Tensorflow, Horovod, CuDNN (#57)
nicolas-ivanov authored May 23, 2019
1 parent a3ad43d commit 0a33a3a

Showing 83 changed files with 3,090 additions and 2,392 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -8,6 +8,6 @@ dist/*
 build/*
 tags
 venv
-data
+results
 !data/corpora_processed
 !data/quality
533 changes: 343 additions & 190 deletions README.md

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions bin/cakechat_server.py
@@ -3,9 +3,10 @@

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from cakechat.utils.env import init_theano_env
+from cakechat.utils.env import set_keras_tf_session

-init_theano_env()
+gpu_memory_fraction = os.environ.get('GPU_MEMORY_FRACTION', 0.1)
+set_keras_tf_session(gpu_memory_fraction)

 from cakechat.api.v1.server import app
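For reference, a minimal sketch of what `set_keras_tf_session` presumably does (an assumption: TF 1.x with standalone Keras; the real implementation lives in cakechat/utils/env.py, which this page does not render). Note that GPU_MEMORY_FRACTION read from the environment arrives as a string, hence the float() cast:

    # Hedged sketch only: the actual set_keras_tf_session is in cakechat/utils/env.py,
    # not shown in this diff. Assumes TF 1.x and standalone Keras.
    import tensorflow as tf
    from keras import backend as K

    def set_keras_tf_session(gpu_memory_fraction):
        config = tf.ConfigProto()
        # Cap the share of GPU memory TF may allocate; env var values come in as strings
        config.gpu_options.per_process_gpu_memory_fraction = float(gpu_memory_fraction)
        K.set_session(tf.Session(config=config))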
8 changes: 4 additions & 4 deletions cakechat/api/config.py
@@ -1,15 +1,15 @@
 from cakechat.config import PREDICTION_MODES

 # Prediction mode used in API
-PREDICTION_MODE = PREDICTION_MODES.sampling
+PREDICTION_MODE = PREDICTION_MODES.sampling_reranking

 # In case of PREDICTION_MODES.{beamsearch, beamsearch-reranking, sampling-reranking} choose a random non-offensive
 # response out of the K best candidates proposed by the algorithm.
-NUM_BEST_CANDIDATES_TO_PICK_FROM = 5
+NUM_BEST_CANDIDATES_TO_PICK_FROM = 3

 # In case of PREDICTION_MODES.sampling generate samples one-by-one until a non-offensive sample occurs. This parameter
 # defines the max number of samples to generate until we succeed.
-SAMPLING_ATTEMPTS_NUM = 5
+SAMPLING_ATTEMPTS_NUM = 10

 # Default response text in case we weren't able to produce an appropriate response.
-DEFAULT_RESPONSE = ''
+DEFAULT_RESPONSE = '🙊'
37 changes: 16 additions & 21 deletions cakechat/api/response.py
@@ -1,30 +1,31 @@
 import random

-from six.moves import xrange, map
-
 from cakechat.api.config import PREDICTION_MODE, NUM_BEST_CANDIDATES_TO_PICK_FROM, SAMPLING_ATTEMPTS_NUM, \
     DEFAULT_RESPONSE
 from cakechat.config import INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH, PREDICTION_MODES
-from cakechat.dialog_model.factory import get_trained_model
+from cakechat.dialog_model.factory import get_trained_model, get_reverse_model
 from cakechat.dialog_model.inference import get_nn_responses, warmup_predictor
 from cakechat.dialog_model.model_utils import transform_contexts_to_token_ids, transform_conditions_to_ids
-from cakechat.utils.offense_detector.config import OFFENSIVE_PHRASES_PATH
 from cakechat.utils.offense_detector import OffenseDetector
-from cakechat.utils.text_processing import get_tokens_sequence, get_pretty_str_from_tokens_sequence
+from cakechat.utils.offense_detector.config import OFFENSIVE_PHRASES_PATH
+from cakechat.utils.text_processing import get_tokens_sequence, prettify_response

 _offense_detector = OffenseDetector(OFFENSIVE_PHRASES_PATH)
-_cakechat_model = get_trained_model(fetch_from_s3=False)
+_cakechat_model = get_trained_model(reverse_model=get_reverse_model(PREDICTION_MODE))
 warmup_predictor(_cakechat_model, PREDICTION_MODE)


+def _is_appropriate_response(response):
+    return response != '' and not _offense_detector.has_offensive_ngrams(response)
+
+
 def _get_non_offensive_response_using_fast_sampling(context_tokens_ids, condition_id):
-    for _ in xrange(SAMPLING_ATTEMPTS_NUM):
+    for _ in range(SAMPLING_ATTEMPTS_NUM):
         response = get_nn_responses(
             context_tokens_ids, _cakechat_model, PREDICTION_MODES.sampling, condition_ids=condition_id)[0][0]

-        tokenized_response = get_tokens_sequence(response)
-        if not _offense_detector.has_offensive_ngrams(tokenized_response):
-            return get_pretty_str_from_tokens_sequence(tokenized_response)
+        if _is_appropriate_response(response):
+            return prettify_response(response)

     return DEFAULT_RESPONSE

@@ -37,14 +38,10 @@ def _get_non_offensive_response(context_tokens_ids, condition_id):
         output_candidates_num=NUM_BEST_CANDIDATES_TO_PICK_FROM,
         condition_ids=condition_id)[0]

-    tokenized_responses = [get_tokens_sequence(response) for response in responses]
-    non_offensive_tokenized_responses = [
-        r for r in tokenized_responses if not _offense_detector.has_offensive_ngrams(r)
-    ]
-
-    if non_offensive_tokenized_responses:
-        tokenized_response = random.choice(non_offensive_tokenized_responses)
-        return get_pretty_str_from_tokens_sequence(tokenized_response)
+    responses = list(filter(_is_appropriate_response, responses))
+    if responses:
+        selected_response = random.choice(responses)
+        return prettify_response(selected_response)

     return DEFAULT_RESPONSE

@@ -60,9 +57,7 @@ def get_response(dialog_context, emotion):
     context_tokens_ids = transform_contexts_to_token_ids(tokenized_dialog_contexts, _cakechat_model.token_to_index,
                                                          INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE)

-    condition_ids_num = len(context_tokens_ids)
-    condition_ids = transform_conditions_to_ids([emotion] * condition_ids_num, _cakechat_model.condition_to_index,
-                                                condition_ids_num)
+    condition_ids = transform_conditions_to_ids([emotion], _cakechat_model.condition_to_index, n_dialogs=1)

     if PREDICTION_MODE == PREDICTION_MODES.sampling:  # Different strategy here for better performance.
         return _get_non_offensive_response_using_fast_sampling(context_tokens_ids, condition_ids)
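To illustrate the reworked entry point, a hypothetical call (values invented for illustration; assumes a trained model is available locally and that 'joy' is one of EMOTIONS_TYPES):

    from cakechat.api.response import get_response

    # Dialog context is a list of utterances, oldest first
    reply = get_response(dialog_context=['hi!', 'how are you?'], emotion='joy')
    print(reply)  # a non-offensive candidate, or DEFAULT_RESPONSE if none was found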
8 changes: 3 additions & 5 deletions cakechat/api/utils.py
@@ -1,5 +1,4 @@
 from flask import jsonify
-from six import text_type


 def get_api_error_response(message, code, logger):
@@ -8,8 +7,7 @@ def get_api_error_response(message, code, logger):


 def _is_list_of_unicode_strings(data):
-    return bool(data and isinstance(data, (list, tuple)) and
-                all(isinstance(s, text_type) for s in data))
+    return data and isinstance(data, (list, tuple)) and all(isinstance(s, str) for s in data)


 def parse_dataset_param(params, param_name, required=True):
@@ -18,8 +16,8 @@ def parse_dataset_param(params, param_name, required=True):

     dataset = params[param_name]
     if not _is_list_of_unicode_strings(dataset):
-        raise ValueError('`%s` should be non-empty list of unicode strings' % param_name)
+        raise ValueError('`{}` should be non-empty list of unicode strings'.format(param_name))
     if not all(dataset):
-        raise ValueError('`%s` should not contain empty strings' % param_name)
+        raise ValueError('`{}` should not contain empty strings'.format(param_name))

     return dataset
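The validation behavior, as a hypothetical round-trip (invented values; the hidden start of the function handles the required/missing-param case, which judging by the server code below surfaces as a KeyError):

    from cakechat.api.utils import parse_dataset_param

    params = {'context': ['hi!', 'how are you?']}
    parse_dataset_param(params, param_name='context')     # -> ['hi!', 'how are you?']

    parse_dataset_param({'context': 'hi'}, param_name='context')        # ValueError: not a list of strings
    parse_dataset_param({'context': ['hi', '']}, param_name='context')  # ValueError: empty string
    parse_dataset_param({}, param_name='context')  # presumably raises KeyError, caught by the server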
23 changes: 11 additions & 12 deletions cakechat/api/v1/server.py
@@ -4,37 +4,36 @@
 from cakechat.api.utils import get_api_error_response, parse_dataset_param
 from cakechat.config import EMOTIONS_TYPES, DEFAULT_CONDITION
 from cakechat.utils.logger import get_logger
-from cakechat.utils.profile import timer

 _logger = get_logger(__name__)

 app = Flask(__name__)


 @app.route('/cakechat_api/v1/actions/get_response', methods=['POST'])
-@timer
 def get_model_response():
     params = request.get_json()
-    _logger.info('request params: %s' % params)
+    _logger.info('request params: {}'.format(params))

     try:
         dialog_context = parse_dataset_param(params, param_name='context')
     except KeyError as e:
-        return get_api_error_response('Malformed request, no "%s" param was found' % str(e), 400, _logger)
+        return get_api_error_response('Malformed request, no "{}" param was found'.format(e), 400, _logger)
     except ValueError as e:
-        return get_api_error_response('Malformed request: %s' % str(e), 400, _logger)
+        return get_api_error_response('Malformed request: {}'.format(e), 400, _logger)

     emotion = params.get('emotion', DEFAULT_CONDITION)
     if emotion not in EMOTIONS_TYPES:
-        return get_api_error_response('Malformed request, emotion param "%s" is not in emotion list %s' %
-                                      (emotion, list(EMOTIONS_TYPES)), 400, _logger)
+        return get_api_error_response(
+            'Malformed request, emotion param "{}" is not in emotion list {}'.format(emotion, list(EMOTIONS_TYPES)),
+            400, _logger)

     response = get_response(dialog_context, emotion)
+    _logger.info('Given response: "{}" for context: {}; emotion "{}"'.format(response, dialog_context, emotion))

     if not response:
-        _logger.error('No response for context: %s; emotion "%s"' % (dialog_context, emotion))
         return jsonify({}), 200
-
-    _logger.info('Given response: "%s" for context: %s; emotion "%s"' % (response, dialog_context, emotion))
-
-    return jsonify({'response': response}), 200
+    return jsonify({'response': response}), 200
+
+
+@app.errorhandler(Exception)
+def on_exception(exception):
+    return get_api_error_response('Can\'t process request: {}'.format(exception), 500, _logger)
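A client call against the updated endpoint might look like the following (host and port are assumptions; the README documents the exact server address):

    import requests

    r = requests.post(
        'http://localhost:8080/cakechat_api/v1/actions/get_response',  # port is an assumption
        json={'context': ['hi!', 'how are you?'], 'emotion': 'joy'})
    print(r.status_code, r.json())  # 200 {'response': '...'} on success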
129 changes: 78 additions & 51 deletions cakechat/config.py
@@ -3,47 +3,52 @@
 from cakechat.utils.data_structures import create_namedtuple_instance
 from cakechat.utils.env import is_dev_env

+MODEL_NAME = 'cakechat_v2.0_keras_tf'
+
 INTX = 'uint16'  # use unsigned 16-bit int representation for memory efficiency
 RANDOM_SEED = 42  # Fix the random seed to a certain value to make everything reproducible

 # AWS S3 params
-S3_MODELS_BUCKET_NAME = 'cake-chat-data'  # S3 bucket with all the data
+S3_MODELS_BUCKET_NAME = 'cake-chat-data-v2'  # S3 bucket with all the data
 S3_NN_MODEL_REMOTE_DIR = 'nn_models'  # S3 remote directory with the models themselves
 S3_TOKENS_IDX_REMOTE_DIR = 'tokens_index'  # S3 remote directory with tokens index
 S3_CONDITIONS_IDX_REMOTE_DIR = 'conditions_index'  # S3 remote directory with conditions index
 S3_W2V_REMOTE_DIR = 'w2v_models'  # S3 remote directory with pre-trained w2v models

-# data params
-DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data')  # Directory to store all the data,
-# e.g. datasets, models, indices
-NN_MODELS_DIR = os.path.join(DATA_DIR, 'nn_models')  # Path to a directory for saving and restoring dialog models
-PROCESSED_CORPUS_DIR = os.path.join(DATA_DIR, 'corpora_processed')  # Path to the processed corpora datasets
-TOKEN_INDEX_DIR = os.path.join(DATA_DIR, 'tokens_index')  # Path to a prepared tokens index file
-CONDITION_IDS_INDEX_DIR = os.path.join(DATA_DIR, 'conditions_index')  # Path to a prepared conditions index file
+# train datasets
+DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data')
+PROCESSED_CORPUS_DIR = os.path.join(DATA_PATH, 'corpora_processed')
+TOKEN_INDEX_DIR = os.path.join(DATA_PATH, 'tokens_index')  # Path to prepared tokens index directory
+CONDITION_IDS_INDEX_DIR = os.path.join(DATA_PATH, 'conditions_index')  # Path to prepared conditions index directory

+# train & val data params
 BASE_CORPUS_NAME = 'processed_dialogs'  # Basic corpus name prefix
-TRAIN_CORPUS_NAME = 'train_' + BASE_CORPUS_NAME  # Corpus name prefix for the training dataset
-CONTEXT_SENSITIVE_VAL_CORPUS_NAME = 'val_' + BASE_CORPUS_NAME  # Corpus name prefix for the validation dataset
-
+TRAIN_CORPUS_NAME = 'train_' + BASE_CORPUS_NAME  # Training dataset filename prefix
+CONTEXT_SENSITIVE_VAL_CORPUS_NAME = 'val_' + BASE_CORPUS_NAME  # Validation dataset filename prefix for intermediate metrics
+CONTEXT_SENSITIVE_TEST_CORPUS_NAME = 'test_' + BASE_CORPUS_NAME  # Testing dataset for final metrics calculation
 MAX_VAL_LINES_NUM = 10000  # Max number of lines from the validation set to be used for metrics calculation
 VAL_SUBSET_SIZE = 250  # Subset of the validation dataset used to calculate some validation metrics
 TRAIN_SUBSET_SIZE = int(os.environ['SLICE_TRAINSET']) if 'SLICE_TRAINSET' in os.environ else None  # Subset of the
 # training dataset to be used during training. If None, all lines of the train dataset are used (default behavior)

-# test data paths
-TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'quality')
+# test datasets
+TEST_DATA_DIR = os.path.join(DATA_PATH, 'quality')
 CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set'  # Context-free validation set path
 TEST_CORPUS_NAME = 'context_free_test_set'  # Context-free test set path
 QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions only path

+# directory to store model weights and calculated metrics
+RESULTS_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'results')  # Directory to store training results
+TENSORBOARD_LOG_DIR = os.path.join(RESULTS_PATH, 'tensorboard')  # Path to tensorboard logs directory
+
 # word embeddings params
 USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
 TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
-W2V_MODEL_DIR = os.path.join(DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
+W2V_MODEL_DIR = os.path.join(DATA_PATH, 'w2v_models')  # Path to store & load trained word2vec models
 WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
 W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
 USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
 TOKEN_REPRESENTATION_SIZE = 256
 MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation
 VOCABULARY_MAX_SIZE = 50000  # Maximum vocabulary size in tokens
 MAX_CONDITIONS_NUM = 5  # Maximum conditions num

 # condition inputs. We use five major emotions to condition our model's predictions
 EMOTIONS_TYPES = create_namedtuple_instance(
@@ -52,23 +57,25 @@
 CONDITION_EMBEDDING_DIMENSION = 128  # Conditions embedding layer dimension to be trained.

 # NN architecture params
 ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
 DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
-HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layer
+HIDDEN_LAYER_DIMENSION = 768  # Dimension for the recurrent layer
 DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before decoder's output
+USE_CUDNN = bool(os.environ.get('CUDA_VISIBLE_DEVICES'))  # True by default for GPU-enabled machines (provides ~25%
+# inference speed-up) and False on CPU-only machines, since they do not support CuDNN

 # training params
+EPOCHS_NUM = 2  # Total epochs num
+BATCH_SIZE = 196  # Number of samples to be used for gradient estimation on each train step. In case of using multiple
+# GPUs for training, each worker will have this number of samples on each step.
+SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch

 INPUT_SEQUENCE_LENGTH = 30  # Input sequence length for the model during the training
 INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in encoder (at least 1)
 OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH+2 for start/end tokens
-BATCH_SIZE = 192  # Default batch size which fits into 8GB of GPU memory
-SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch
-EPOCHS_NUM = 100  # Total epochs num
-GRAD_CLIP = 5.0  # Gradient clipping passed into theano.gradient.grad_clip()
-LEARNING_RATE = 1.0  # Learning rate for the chosen optimizer (currently using Adadelta, see model.py)
-
-# model params
-NN_MODEL_PREFIX = 'cakechat_v1.3'  # Prefix to be prepended to the model's name
+GRAD_CLIP = 5.0  # Gradient clipping param passed to the optimizer
+LEARNING_RATE = 6.0  # Learning rate for the Adadelta optimizer
+LOG_RUN_METADATA = False  # Set 'True' to profile memory consumption and computation time on tensorboard
+AUTOENCODER_MODE = False  # Set 'True' to switch seq2seq (x -> y) into autoencoder (x -> x). Used for debugging

 # predictions params
 MAX_PREDICTIONS_LENGTH = 40  # Max. number of tokens which can be generated on the prediction step
@@ -90,35 +97,55 @@
 DEFAULT_TEMPERATURE = 0.5  # Default softmax temperature used for sampling

 # Options for beamsearch and sampling-reranking:
-BEAM_SIZE = 20  # Size of the beam (beamsearch only)
-SAMPLES_NUM_FOR_RERANKING = 20  # Number of samples used in reranking (sampling-reranking only)
+BEAM_SIZE = 10  # Size of the beam (beamsearch only)
+SAMPLES_NUM_FOR_RERANKING = 10  # Number of samples used in reranking (sampling-reranking only)
 MMI_REVERSE_MODEL_SCORE_WEIGHT = 1.0  # Weight of the reverse-model score in MMI reranking, see the paper:
 # 0.0 - scoring is performed using only the default model, 1.0 - using only the reverse model

 # Logging params
-LOG_CANDIDATES_NUM = 10  # Number of candidates to be printed to output during the logging
+LOG_CANDIDATES_NUM = 3  # Number of candidates to be printed to output during the logging
 SCREEN_LOG_NUM_TEST_LINES = 10  # Number of first test lines to use when logging outputs on screen
-SCREEN_LOG_FREQUENCY_PER_BATCHES = 500  # How many batches to train until the next logging of output on screen
-LOG_TO_TB_FREQUENCY_PER_BATCHES = 500  # How many batches to train until the next metrics computation for TensorBoard
-LOG_TO_FILE_FREQUENCY_PER_BATCHES = 2500  # How many batches to train until the next logging of all the output to file
-SAVE_MODEL_FREQUENCY_PER_BATCHES = 2500  # How many batches to train until the model is saved next
+AVG_LOSS_DECAY = 0.99  # Decay used for averaging the loss
+EVAL_STATE_PER_BATCHES = 500  # How many batches to train until the next metrics computation for TensorBoard

-# Use reduced sizes for input/output sequences, hidden layers and dataset sizes for the 'Developer Mode'
+# Use reduced params values for development
 if is_dev_env():
-    INPUT_SEQUENCE_LENGTH = 7
-    OUTPUT_SEQUENCE_LENGTH = 9
-    MAX_PREDICTIONS_LENGTH = 5
-    BATCH_SIZE = 128
-    HIDDEN_LAYER_DIMENSION = 7
-    SCREEN_LOG_FREQUENCY_PER_BATCHES = 2
-    LOG_TO_TB_FREQUENCY_PER_BATCHES = 3
-    LOG_TO_FILE_FREQUENCY_PER_BATCHES = 4
-    SAVE_MODEL_FREQUENCY_PER_BATCHES = 4
-    WORD_EMBEDDING_DIMENSION = 15
-    SAMPLES_NUM_FOR_RERANKING = BEAM_SIZE = 5
+    # train & val data params
+    MAX_VAL_LINES_NUM = 10
+
+    # word embeddings params
+    USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True
+    TRAIN_WORD_EMBEDDINGS_LAYER = True
+    WORD_EMBEDDING_DIMENSION = 64
+    VOCABULARY_MAX_SIZE = 1000
+    MAX_CONDITIONS_NUM = 5
+
+    # condition inputs
+    CONDITION_EMBEDDING_DIMENSION = 1
+
+    # NN architecture params
+    HIDDEN_LAYER_DIMENSION = 128
+    DENSE_DROPOUT_RATIO = 0.2
+    USE_CUDNN = False
+
+    # training params
+    INPUT_SEQUENCE_LENGTH = 3
+    INPUT_CONTEXT_SIZE = 1
+    OUTPUT_SEQUENCE_LENGTH = 5
+    BATCH_SIZE = 4
+    SHUFFLE_TRAINING_BATCHES = False
+    EPOCHS_NUM = 4
+    LEARNING_RATE = 1.0
+    LOG_RUN_METADATA = False
+    AUTOENCODER_MODE = False
+
+    # predictions params
+    MAX_PREDICTIONS_LENGTH = 4
+
+    # options for beamsearch and sampling-reranking:
+    SAMPLES_NUM_FOR_RERANKING = 5
+    BEAM_SIZE = 5
+
+    # logging params
     LOG_CANDIDATES_NUM = 3
-    USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = False
-    VAL_SUBSET_SIZE = 100
-    MAX_VAL_LINES_NUM = 100
-    TRAIN_SUBSET_SIZE = 10000
-    SCREEN_LOG_NUM_TEST_LINES = 4
+    EVAL_STATE_PER_BATCHES = 5
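On MMI_REVERSE_MODEL_SCORE_WEIGHT above: reranking presumably blends the direct and reverse model log-likelihoods in the spirit of MMI-bidi reranking (cf. Li et al., 2016, the paper the comment refers to). A sketch with illustrative names, not cakechat's actual API:

    def mmi_reranking_score(log_p_response_given_context, log_p_context_given_response,
                            reverse_weight=1.0):
        # Illustrative only: 0.0 ranks candidates by the direct model alone,
        # 1.0 (the default above) by the reverse model alone
        return ((1.0 - reverse_weight) * log_p_response_given_context +
                reverse_weight * log_p_context_given_response)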
Diffs for the remaining changed files are not rendered on this page.