This repository has been archived by the owner on May 25, 2020. It is now read-only.

Commit: Big update: Keras, Tensorflow, Horovod, CuDNN (#57)
nicolas-ivanov authored May 23, 2019
1 parent a3ad43d commit 0a33a3a

Showing 83 changed files with 3,090 additions and 2,392 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -8,6 +8,6 @@ dist/*
 build/*
 tags
 venv
-data
+results
 !data/corpora_processed
 !data/quality
533 changes: 343 additions & 190 deletions README.md

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions bin/cakechat_server.py
@@ -3,9 +3,10 @@

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from cakechat.utils.env import init_theano_env
+from cakechat.utils.env import set_keras_tf_session

-init_theano_env()
+gpu_memory_fraction = os.environ.get('GPU_MEMORY_FRACTION', 0.1)
+set_keras_tf_session(gpu_memory_fraction)

 from cakechat.api.v1.server import app
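For reference, a minimal sketch of what `set_keras_tf_session` presumably does (an assumption: TF 1.x with standalone Keras; the real implementation lives in cakechat/utils/env.py, which this page does not render). Note that GPU_MEMORY_FRACTION read from the environment arrives as a string, hence the float() cast:

    # Hedged sketch only: the actual set_keras_tf_session is in cakechat/utils/env.py,
    # not shown in this diff. Assumes TF 1.x and standalone Keras.
    import tensorflow as tf
    from keras import backend as K

    def set_keras_tf_session(gpu_memory_fraction):
        config = tf.ConfigProto()
        # Cap the share of GPU memory TF may allocate; env var values come in as strings
        config.gpu_options.per_process_gpu_memory_fraction = float(gpu_memory_fraction)
        K.set_session(tf.Session(config=config))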
8 changes: 4 additions & 4 deletions cakechat/api/config.py
@@ -1,15 +1,15 @@
 from cakechat.config import PREDICTION_MODES

 # Prediction mode used in API
-PREDICTION_MODE = PREDICTION_MODES.sampling
+PREDICTION_MODE = PREDICTION_MODES.sampling_reranking

 # In case of PREDICTION_MODES.{beamsearch, beamsearch-reranking, sampling-reranking} choose a random non-offensive
 # response out of the K best candidates proposed by the algorithm.
-NUM_BEST_CANDIDATES_TO_PICK_FROM = 5
+NUM_BEST_CANDIDATES_TO_PICK_FROM = 3

 # In case of PREDICTION_MODES.sampling generate samples one-by-one until a non-offensive sample occurs. This parameter
 # defines the max number of samples to generate until we succeed.
-SAMPLING_ATTEMPTS_NUM = 5
+SAMPLING_ATTEMPTS_NUM = 10

 # Default response text in case we weren't able to produce an appropriate response.
-DEFAULT_RESPONSE = ''
+DEFAULT_RESPONSE = '🙊'
37 changes: 16 additions & 21 deletions cakechat/api/response.py
@@ -1,30 +1,31 @@
 import random

-from six.moves import xrange, map
-
 from cakechat.api.config import PREDICTION_MODE, NUM_BEST_CANDIDATES_TO_PICK_FROM, SAMPLING_ATTEMPTS_NUM, \
     DEFAULT_RESPONSE
 from cakechat.config import INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH, PREDICTION_MODES
-from cakechat.dialog_model.factory import get_trained_model
+from cakechat.dialog_model.factory import get_trained_model, get_reverse_model
 from cakechat.dialog_model.inference import get_nn_responses, warmup_predictor
 from cakechat.dialog_model.model_utils import transform_contexts_to_token_ids, transform_conditions_to_ids
-from cakechat.utils.offense_detector.config import OFFENSIVE_PHRASES_PATH
 from cakechat.utils.offense_detector import OffenseDetector
-from cakechat.utils.text_processing import get_tokens_sequence, get_pretty_str_from_tokens_sequence
+from cakechat.utils.offense_detector.config import OFFENSIVE_PHRASES_PATH
+from cakechat.utils.text_processing import get_tokens_sequence, prettify_response

 _offense_detector = OffenseDetector(OFFENSIVE_PHRASES_PATH)
-_cakechat_model = get_trained_model(fetch_from_s3=False)
+_cakechat_model = get_trained_model(reverse_model=get_reverse_model(PREDICTION_MODE))
 warmup_predictor(_cakechat_model, PREDICTION_MODE)


+def _is_appropriate_response(response):
+    return response != '' and not _offense_detector.has_offensive_ngrams(response)
+
+
 def _get_non_offensive_response_using_fast_sampling(context_tokens_ids, condition_id):
-    for _ in xrange(SAMPLING_ATTEMPTS_NUM):
+    for _ in range(SAMPLING_ATTEMPTS_NUM):
         response = get_nn_responses(
             context_tokens_ids, _cakechat_model, PREDICTION_MODES.sampling, condition_ids=condition_id)[0][0]

-        tokenized_response = get_tokens_sequence(response)
-        if not _offense_detector.has_offensive_ngrams(tokenized_response):
-            return get_pretty_str_from_tokens_sequence(tokenized_response)
+        if _is_appropriate_response(response):
+            return prettify_response(response)

     return DEFAULT_RESPONSE

@@ -37,14 +38,10 @@ def _get_non_offensive_response(context_tokens_ids, condition_id):
         output_candidates_num=NUM_BEST_CANDIDATES_TO_PICK_FROM,
         condition_ids=condition_id)[0]

-    tokenized_responses = [get_tokens_sequence(response) for response in responses]
-    non_offensive_tokenized_responses = [
-        r for r in tokenized_responses if not _offense_detector.has_offensive_ngrams(r)
-    ]
-
-    if non_offensive_tokenized_responses:
-        tokenized_response = random.choice(non_offensive_tokenized_responses)
-        return get_pretty_str_from_tokens_sequence(tokenized_response)
+    responses = list(filter(_is_appropriate_response, responses))
+    if responses:
+        selected_response = random.choice(responses)
+        return prettify_response(selected_response)

     return DEFAULT_RESPONSE

@@ -60,9 +57,7 @@ def get_response(dialog_context, emotion):
     context_tokens_ids = transform_contexts_to_token_ids(tokenized_dialog_contexts, _cakechat_model.token_to_index,
                                                          INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE)

-    condition_ids_num = len(context_tokens_ids)
-    condition_ids = transform_conditions_to_ids([emotion] * condition_ids_num, _cakechat_model.condition_to_index,
-                                                condition_ids_num)
+    condition_ids = transform_conditions_to_ids([emotion], _cakechat_model.condition_to_index, n_dialogs=1)

     if PREDICTION_MODE == PREDICTION_MODES.sampling:  # Different strategy here for better performance.
         return _get_non_offensive_response_using_fast_sampling(context_tokens_ids, condition_ids)
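To illustrate the reworked entry point, a hypothetical call (values invented for illustration; assumes a trained model is available locally and that 'joy' is one of EMOTIONS_TYPES):

    from cakechat.api.response import get_response

    # Dialog context is a list of utterances, oldest first
    reply = get_response(dialog_context=['hi!', 'how are you?'], emotion='joy')
    print(reply)  # a non-offensive candidate, or DEFAULT_RESPONSE if none was found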
8 changes: 3 additions & 5 deletions cakechat/api/utils.py
@@ -1,5 +1,4 @@
 from flask import jsonify
-from six import text_type


 def get_api_error_response(message, code, logger):
@@ -8,8 +7,7 @@ def get_api_error_response(message, code, logger):


 def _is_list_of_unicode_strings(data):
-    return bool(data and isinstance(data, (list, tuple)) and
-                all(isinstance(s, text_type) for s in data))
+    return data and isinstance(data, (list, tuple)) and all(isinstance(s, str) for s in data)


 def parse_dataset_param(params, param_name, required=True):
@@ -18,8 +16,8 @@ def parse_dataset_param(params, param_name, required=True):

     dataset = params[param_name]
     if not _is_list_of_unicode_strings(dataset):
-        raise ValueError('`%s` should be non-empty list of unicode strings' % param_name)
+        raise ValueError('`{}` should be non-empty list of unicode strings'.format(param_name))
     if not all(dataset):
-        raise ValueError('`%s` should not contain empty strings' % param_name)
+        raise ValueError('`{}` should not contain empty strings'.format(param_name))

     return dataset
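The validation behavior, as a hypothetical round-trip (invented values; the hidden start of the function handles the required/missing-param case, which judging by the server code below surfaces as a KeyError):

    from cakechat.api.utils import parse_dataset_param

    params = {'context': ['hi!', 'how are you?']}
    parse_dataset_param(params, param_name='context')     # -> ['hi!', 'how are you?']

    parse_dataset_param({'context': 'hi'}, param_name='context')        # ValueError: not a list of strings
    parse_dataset_param({'context': ['hi', '']}, param_name='context')  # ValueError: empty string
    parse_dataset_param({}, param_name='context')  # presumably raises KeyError, caught by the server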
23 changes: 11 additions & 12 deletions cakechat/api/v1/server.py
@@ -4,37 +4,36 @@
 from cakechat.api.utils import get_api_error_response, parse_dataset_param
 from cakechat.config import EMOTIONS_TYPES, DEFAULT_CONDITION
 from cakechat.utils.logger import get_logger
-from cakechat.utils.profile import timer

 _logger = get_logger(__name__)

 app = Flask(__name__)


 @app.route('/cakechat_api/v1/actions/get_response', methods=['POST'])
-@timer
 def get_model_response():
     params = request.get_json()
-    _logger.info('request params: %s' % params)
+    _logger.info('request params: {}'.format(params))

     try:
         dialog_context = parse_dataset_param(params, param_name='context')
     except KeyError as e:
-        return get_api_error_response('Malformed request, no "%s" param was found' % str(e), 400, _logger)
+        return get_api_error_response('Malformed request, no "{}" param was found'.format(e), 400, _logger)
     except ValueError as e:
-        return get_api_error_response('Malformed request: %s' % str(e), 400, _logger)
+        return get_api_error_response('Malformed request: {}'.format(e), 400, _logger)

     emotion = params.get('emotion', DEFAULT_CONDITION)
     if emotion not in EMOTIONS_TYPES:
-        return get_api_error_response('Malformed request, emotion param "%s" is not in emotion list %s' %
-                                      (emotion, list(EMOTIONS_TYPES)), 400, _logger)
+        return get_api_error_response(
+            'Malformed request, emotion param "{}" is not in emotion list {}'.format(emotion, list(EMOTIONS_TYPES)),
+            400, _logger)

     response = get_response(dialog_context, emotion)
+    _logger.info('Given response: "{}" for context: {}; emotion "{}"'.format(response, dialog_context, emotion))

     if not response:
-        _logger.error('No response for context: %s; emotion "%s"' % (dialog_context, emotion))
         return jsonify({}), 200
-
-    _logger.info('Given response: "%s" for context: %s; emotion "%s"' % (response, dialog_context, emotion))
-
-    return jsonify({'response': response}), 200
+    return jsonify({'response': response}), 200
+
+
+@app.errorhandler(Exception)
+def on_exception(exception):
+    return get_api_error_response('Can\'t process request: {}'.format(exception), 500, _logger)
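A client call against the updated endpoint might look like the following (host and port are assumptions; the README documents the exact server address):

    import requests

    r = requests.post(
        'http://localhost:8080/cakechat_api/v1/actions/get_response',  # port is an assumption
        json={'context': ['hi!', 'how are you?'], 'emotion': 'joy'})
    print(r.status_code, r.json())  # 200 {'response': '...'} on success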
129 changes: 78 additions & 51 deletions cakechat/config.py
@@ -3,47 +3,52 @@
 from cakechat.utils.data_structures import create_namedtuple_instance
 from cakechat.utils.env import is_dev_env

+MODEL_NAME = 'cakechat_v2.0_keras_tf'
+
 INTX = 'uint16'  # use unsigned 16-bit int representation for memory efficiency
 RANDOM_SEED = 42  # Fix the random seed to a certain value to make everything reproducible

 # AWS S3 params
-S3_MODELS_BUCKET_NAME = 'cake-chat-data'  # S3 bucket with all the data
+S3_MODELS_BUCKET_NAME = 'cake-chat-data-v2'  # S3 bucket with all the data
 S3_NN_MODEL_REMOTE_DIR = 'nn_models'  # S3 remote directory with the models themselves
 S3_TOKENS_IDX_REMOTE_DIR = 'tokens_index'  # S3 remote directory with tokens index
 S3_CONDITIONS_IDX_REMOTE_DIR = 'conditions_index'  # S3 remote directory with conditions index
 S3_W2V_REMOTE_DIR = 'w2v_models'  # S3 remote directory with pre-trained w2v models

-# data params
-DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data')  # Directory to store all the data,
-# e.g. datasets, models, indices
-NN_MODELS_DIR = os.path.join(DATA_DIR, 'nn_models')  # Path to a directory for saving and restoring dialog models
-PROCESSED_CORPUS_DIR = os.path.join(DATA_DIR, 'corpora_processed')  # Path to the processed corpora datasets
-TOKEN_INDEX_DIR = os.path.join(DATA_DIR, 'tokens_index')  # Path to a prepared tokens index file
-CONDITION_IDS_INDEX_DIR = os.path.join(DATA_DIR, 'conditions_index')  # Path to a prepared conditions index file
+# train datasets
+DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data')
+PROCESSED_CORPUS_DIR = os.path.join(DATA_PATH, 'corpora_processed')
+TOKEN_INDEX_DIR = os.path.join(DATA_PATH, 'tokens_index')  # Path to prepared tokens index directory
+CONDITION_IDS_INDEX_DIR = os.path.join(DATA_PATH, 'conditions_index')  # Path to prepared conditions index directory

+# train & val data params
 BASE_CORPUS_NAME = 'processed_dialogs'  # Basic corpus name prefix
-TRAIN_CORPUS_NAME = 'train_' + BASE_CORPUS_NAME  # Corpus name prefix for the training dataset
-CONTEXT_SENSITIVE_VAL_CORPUS_NAME = 'val_' + BASE_CORPUS_NAME  # Corpus name prefix for the validation dataset
-
+TRAIN_CORPUS_NAME = 'train_' + BASE_CORPUS_NAME  # Training dataset filename prefix
+CONTEXT_SENSITIVE_VAL_CORPUS_NAME = 'val_' + BASE_CORPUS_NAME  # Validation dataset filename prefix for intermediate metrics
+CONTEXT_SENSITIVE_TEST_CORPUS_NAME = 'test_' + BASE_CORPUS_NAME  # Testing dataset for final metrics calculation
 MAX_VAL_LINES_NUM = 10000  # Max number of lines from the validation set to be used for metrics calculation
 VAL_SUBSET_SIZE = 250  # Subset of the validation dataset used to calculate some validation metrics
 TRAIN_SUBSET_SIZE = int(os.environ['SLICE_TRAINSET']) if 'SLICE_TRAINSET' in os.environ else None  # Subset of the
 # training dataset to be used during training. If None, all lines of the train dataset are used (default behavior)

-# test data paths
-TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'quality')
+# test datasets
+TEST_DATA_DIR = os.path.join(DATA_PATH, 'quality')
 CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set'  # Context-free validation set path
 TEST_CORPUS_NAME = 'context_free_test_set'  # Context-free test set path
 QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions only path

+# directory to store model weights and calculated metrics
+RESULTS_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'results')  # Directory to store training results
+TENSORBOARD_LOG_DIR = os.path.join(RESULTS_PATH, 'tensorboard')  # Path to tensorboard logs directory
+
 # word embeddings params
 USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
 TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
-W2V_MODEL_DIR = os.path.join(DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
+W2V_MODEL_DIR = os.path.join(DATA_PATH, 'w2v_models')  # Path to store & load trained word2vec models
 WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
 W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
 USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
 TOKEN_REPRESENTATION_SIZE = 256
 MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation
 VOCABULARY_MAX_SIZE = 50000  # Maximum vocabulary size in tokens
 MAX_CONDITIONS_NUM = 5  # Maximum conditions num

 # condition inputs. We use five major emotions to condition our model's predictions
 EMOTIONS_TYPES = create_namedtuple_instance(
@@ -52,23 +57,25 @@
 CONDITION_EMBEDDING_DIMENSION = 128  # Conditions embedding layer dimension to be trained.

 # NN architecture params
 ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
 DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
-HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layer
+HIDDEN_LAYER_DIMENSION = 768  # Dimension for the recurrent layer
 DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before decoder's output
+USE_CUDNN = bool(os.environ.get('CUDA_VISIBLE_DEVICES'))  # True by default for GPU-enabled machines (provides ~25%
+# inference speed-up) and False on CPU-only machines, since they do not support CuDNN

 # training params
+EPOCHS_NUM = 2  # Total epochs num
+BATCH_SIZE = 196  # Number of samples to be used for gradient estimation on each train step. In case of using multiple
+# GPUs for training, each worker will have this number of samples on each step.
+SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch

 INPUT_SEQUENCE_LENGTH = 30  # Input sequence length for the model during the training
 INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in encoder (at least 1)
 OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH+2 for start/end tokens
-BATCH_SIZE = 192  # Default batch size which fits into 8GB of GPU memory
-SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch
-EPOCHS_NUM = 100  # Total epochs num
-GRAD_CLIP = 5.0  # Gradient clipping passed into theano.gradient.grad_clip()
-LEARNING_RATE = 1.0  # Learning rate for the chosen optimizer (currently using Adadelta, see model.py)
-
-# model params
-NN_MODEL_PREFIX = 'cakechat_v1.3'  # Prefix to be prepended to the model's name
+GRAD_CLIP = 5.0  # Gradient clipping param passed to the optimizer
+LEARNING_RATE = 6.0  # Learning rate for the Adadelta optimizer
+LOG_RUN_METADATA = False  # Set 'True' to profile memory consumption and computation time on tensorboard
+AUTOENCODER_MODE = False  # Set 'True' to switch seq2seq (x -> y) into autoencoder (x -> x). Used for debugging

 # predictions params
 MAX_PREDICTIONS_LENGTH = 40  # Max. number of tokens which can be generated on the prediction step
@@ -90,35 +97,55 @@
 DEFAULT_TEMPERATURE = 0.5  # Default softmax temperature used for sampling

 # Options for beamsearch and sampling-reranking:
-BEAM_SIZE = 20  # Size of the beam (beamsearch only)
-SAMPLES_NUM_FOR_RERANKING = 20  # Number of samples used in reranking (sampling-reranking only)
+BEAM_SIZE = 10  # Size of the beam (beamsearch only)
+SAMPLES_NUM_FOR_RERANKING = 10  # Number of samples used in reranking (sampling-reranking only)
 MMI_REVERSE_MODEL_SCORE_WEIGHT = 1.0  # Weight of the reverse-model score in MMI reranking, see the paper:
 # 0.0 - scoring is performed using only the default model, 1.0 - using only the reverse model

 # Logging params
-LOG_CANDIDATES_NUM = 10  # Number of candidates to be printed to output during the logging
+LOG_CANDIDATES_NUM = 3  # Number of candidates to be printed to output during the logging
 SCREEN_LOG_NUM_TEST_LINES = 10  # Number of first test lines to use when logging outputs on screen
-SCREEN_LOG_FREQUENCY_PER_BATCHES = 500  # How many batches to train until the next logging of output on screen
-LOG_TO_TB_FREQUENCY_PER_BATCHES = 500  # How many batches to train until the next metrics computation for TensorBoard
-LOG_TO_FILE_FREQUENCY_PER_BATCHES = 2500  # How many batches to train until the next logging of all the output to file
-SAVE_MODEL_FREQUENCY_PER_BATCHES = 2500  # How many batches to train until the model is saved next
+AVG_LOSS_DECAY = 0.99  # Decay used for averaging the loss
+EVAL_STATE_PER_BATCHES = 500  # How many batches to train until the next metrics computation for TensorBoard

-# Use reduced sizes for input/output sequences, hidden layers and dataset sizes for the 'Developer Mode'
+# Use reduced params values for development
 if is_dev_env():
-    INPUT_SEQUENCE_LENGTH = 7
-    OUTPUT_SEQUENCE_LENGTH = 9
-    MAX_PREDICTIONS_LENGTH = 5
-    BATCH_SIZE = 128
-    HIDDEN_LAYER_DIMENSION = 7
-    SCREEN_LOG_FREQUENCY_PER_BATCHES = 2
-    LOG_TO_TB_FREQUENCY_PER_BATCHES = 3
-    LOG_TO_FILE_FREQUENCY_PER_BATCHES = 4
-    SAVE_MODEL_FREQUENCY_PER_BATCHES = 4
-    WORD_EMBEDDING_DIMENSION = 15
-    SAMPLES_NUM_FOR_RERANKING = BEAM_SIZE = 5
+    # train & val data params
+    MAX_VAL_LINES_NUM = 10
+
+    # word embeddings params
+    USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True
+    TRAIN_WORD_EMBEDDINGS_LAYER = True
+    WORD_EMBEDDING_DIMENSION = 64
+    VOCABULARY_MAX_SIZE = 1000
+    MAX_CONDITIONS_NUM = 5
+
+    # condition inputs
+    CONDITION_EMBEDDING_DIMENSION = 1
+
+    # NN architecture params
+    HIDDEN_LAYER_DIMENSION = 128
+    DENSE_DROPOUT_RATIO = 0.2
+    USE_CUDNN = False
+
+    # training params
+    INPUT_SEQUENCE_LENGTH = 3
+    INPUT_CONTEXT_SIZE = 1
+    OUTPUT_SEQUENCE_LENGTH = 5
+    BATCH_SIZE = 4
+    SHUFFLE_TRAINING_BATCHES = False
+    EPOCHS_NUM = 4
+    LEARNING_RATE = 1.0
+    LOG_RUN_METADATA = False
+    AUTOENCODER_MODE = False
+
+    # predictions params
+    MAX_PREDICTIONS_LENGTH = 4
+
+    # options for beamsearch and sampling-reranking:
+    SAMPLES_NUM_FOR_RERANKING = 5
+    BEAM_SIZE = 5
+
+    # logging params
     LOG_CANDIDATES_NUM = 3
-    USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = False
-    VAL_SUBSET_SIZE = 100
-    MAX_VAL_LINES_NUM = 100
-    TRAIN_SUBSET_SIZE = 10000
-    SCREEN_LOG_NUM_TEST_LINES = 4
+    EVAL_STATE_PER_BATCHES = 5
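On MMI_REVERSE_MODEL_SCORE_WEIGHT above: reranking presumably blends the direct and reverse model log-likelihoods in the spirit of MMI-bidi reranking (cf. Li et al., 2016, the paper the comment refers to). A sketch with illustrative names, not cakechat's actual API:

    def mmi_reranking_score(log_p_response_given_context, log_p_context_given_response,
                            reverse_weight=1.0):
        # Illustrative only: 0.0 ranks candidates by the direct model alone,
        # 1.0 (the default above) by the reverse model alone
        return ((1.0 - reverse_weight) * log_p_response_given_context +
                reverse_weight * log_p_context_given_response)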
Diffs for the remaining changed files are not rendered on this page.