diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 7c13c8eea0..9c8c047224 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -169,8 +169,96 @@ jobs:
           -scoring_debug "true" \
           -tensorboard_log_dir /tmp/logs_dynamic-scoring_and_copy \
           -dump_preds /tmp/dump_preds \
+          -position_encoding \
           -copy_attn
         python onmt/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_copy -tensorboard_checks valid_metrics
+    - name : Test Transformer training and validation with dynamic scoring and maxrelative
+      run: |
+        python3 train.py \
+          -config data/data.yaml \
+          -src_vocab /tmp/onmt.vocab.src \
+          -tgt_vocab /tmp/onmt.vocab.tgt \
+          -src_vocab_size 1000 \
+          -tgt_vocab_size 1000 \
+          -encoder_type transformer \
+          -decoder_type transformer \
+          -layers 4 \
+          -word_vec_size 16 \
+          -hidden_size 16 \
+          -num_workers 0 -bucket_size 1024 \
+          -heads 2 \
+          -transformer_ff 64 \
+          -num_workers 0 -bucket_size 1024 \
+          -accum_count 2 4 8 \
+          -accum_steps 0 15000 30000 \
+          -save_model /tmp/onmt.model \
+          -train_steps 10 -valid_steps 5 \
+          -report_every 2 \
+          -valid_metrics "BLEU" "TER" \
+          -tensorboard "true" \
+          -scoring_debug "true" \
+          -tensorboard_log_dir /tmp/logs_dynamic-scoring_and_relative \
+          -dump_preds /tmp/dump_preds \
+          -max_relative_positions 8
+        python onmt/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_relative -tensorboard_checks valid_metrics
+    - name : Test Transformer training and validation with dynamic scoring and rotary
+      run: |
+        python3 train.py \
+          -config data/data.yaml \
+          -src_vocab /tmp/onmt.vocab.src \
+          -tgt_vocab /tmp/onmt.vocab.tgt \
+          -src_vocab_size 1000 \
+          -tgt_vocab_size 1000 \
+          -encoder_type transformer \
+          -decoder_type transformer \
+          -layers 4 \
+          -word_vec_size 16 \
+          -hidden_size 16 \
+          -num_workers 0 -bucket_size 1024 \
+          -heads 2 \
+          -transformer_ff 64 \
+          -num_workers 0 -bucket_size 1024 \
+          -accum_count 2 4 8 \
+          -accum_steps 0 15000 30000 \
+          -save_model /tmp/onmt.model \
+          -train_steps 10 -valid_steps 5 \
+          -report_every 2 \
+          -valid_metrics "BLEU" "TER" \
+          -tensorboard "true" \
+          -scoring_debug "true" \
+          -tensorboard_log_dir /tmp/logs_dynamic-scoring_and_rotary \
+          -dump_preds /tmp/dump_preds \
+          -max_relative_positions -1
+        python onmt/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_rotary -tensorboard_checks valid_metrics
+    - name : Test Transformer training and validation with dynamic scoring and alibi
+      run: |
+        python3 train.py \
+          -config data/data.yaml \
+          -src_vocab /tmp/onmt.vocab.src \
+          -tgt_vocab /tmp/onmt.vocab.tgt \
+          -src_vocab_size 1000 \
+          -tgt_vocab_size 1000 \
+          -encoder_type transformer \
+          -decoder_type transformer \
+          -layers 4 \
+          -word_vec_size 16 \
+          -hidden_size 16 \
+          -num_workers 0 -bucket_size 1024 \
+          -heads 2 \
+          -transformer_ff 64 \
+          -num_workers 0 -bucket_size 1024 \
+          -accum_count 2 4 8 \
+          -accum_steps 0 15000 30000 \
+          -save_model /tmp/onmt.model \
+          -train_steps 10 -valid_steps 5 \
+          -report_every 2 \
+          -valid_metrics "BLEU" "TER" \
+          -tensorboard "true" \
+          -scoring_debug "true" \
+          -tensorboard_log_dir /tmp/logs_dynamic-scoring_and_alibi \
+          -dump_preds /tmp/dump_preds \
+          -max_relative_positions 8
+        python onmt/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_alibi -tensorboard_checks valid_metrics
     - name: Test LM training
       run: |
         python train.py \
diff --git a/onmt/bin/translate.py b/onmt/bin/translate.py
index 5fc64f6409..aae076ec85 100644
--- a/onmt/bin/translate.py
+++ b/onmt/bin/translate.py
@@ -9,6 +9,7 @@
 from onmt.utils.parse import ArgumentParser
 from onmt.utils.misc import use_gpu, set_random_seed
 from torch.profiler import profile, record_function, ProfilerActivity
+import time


 def translate(opt):
@@ -52,13 +53,15 @@ def main():
     parser = _get_parser()

     opt = parser.parse_args()
     if opt.profile:
+
         with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
             with record_function("Translate"):
                 translate(opt)
-        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=30))
-
+        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=40))
     else:
+        init_time = time.time()
         translate(opt)
+        print("Time w/o python interpreter load/terminate: ", time.time() - init_time)

 if __name__ == "__main__":
diff --git a/onmt/decoders/transformer.py b/onmt/decoders/transformer.py
index 5d4ff51afa..84225124ba 100644
--- a/onmt/decoders/transformer.py
+++ b/onmt/decoders/transformer.py
@@ -581,20 +581,18 @@ def forward(self, tgt, enc_out=None, step=None, **kwargs):
                     {"keys": torch.tensor([]), "values": torch.tensor([])},
                 )

-        emb = self.embeddings(tgt, step=step)
-        dec_out = emb
-        assert emb.dim() == 3  # len x batch x embedding_dim
+        dec_out = self.embeddings(tgt, step=step)

         pad_idx = self.embeddings.word_padding_idx
-        src_lens = kwargs["src_len"]
+        src_len = kwargs["src_len"]
         src_max_len = self.state["src"].shape[1]
-        src_pad_mask = ~sequence_mask(src_lens, src_max_len)  # [B x slen]
-        src_pad_mask = src_pad_mask.unsqueeze(1)  # [B x 1 x slen]
+        src_pad_mask = sequence_mask(src_len, src_max_len).unsqueeze(
+            1
+        )  # [B x 1 x slen]
         tgt_pad_mask = tgt[:, :, 0].eq(pad_idx).unsqueeze(1)  # [B, 1, T_tgt]

         with_align = kwargs.pop("with_align", False)
-        return_attn = kwargs.pop("return_attn", False)
-        return_attn = with_align or self._copy or return_attn
+        return_attn = with_align or self._copy or kwargs.pop("return_attn", False)
         attn_aligns = []
diff --git a/onmt/encoders/mean_encoder.py b/onmt/encoders/mean_encoder.py
index 115d42b9fa..602524559b 100644
--- a/onmt/encoders/mean_encoder.py
+++ b/onmt/encoders/mean_encoder.py
@@ -30,7 +30,7 @@ def forward(self, src, src_len=None):

         if src_len is not None:
             # we avoid padding while mean pooling
-            mask = sequence_mask(src_len).float()
+            mask = (~sequence_mask(src_len)).float()
             mask = mask / src_len.unsqueeze(1).float()
             mean = torch.bmm(mask.unsqueeze(1), emb).squeeze(1)
         else:
diff --git a/onmt/encoders/transformer.py b/onmt/encoders/transformer.py
index 727bafa654..184d44881f 100644
--- a/onmt/encoders/transformer.py
+++ b/onmt/encoders/transformer.py
@@ -228,10 +228,9 @@ def from_opt(cls, opt, embeddings):
     def forward(self, src, src_len=None):
         """See :func:`EncoderBase.forward()`"""
         enc_out = self.embeddings(src)
-        mask = ~sequence_mask(src_len).unsqueeze(1)
-        mask = mask.unsqueeze(1)
+        mask = sequence_mask(src_len).unsqueeze(1).unsqueeze(1)
         mask = mask.expand(-1, -1, mask.size(3), -1)
-        # mask is now (batch x 1 x slen x slen)
+        # Padding mask is now (batch x 1 x slen x slen)
         # 1 to be expanded to number of heads in MHA
         # Run the forward pass of every layer of the tranformer.
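
Note on the mask changes above: sequence_mask (see the onmt/utils/misc.py hunk further down) now returns True at padded positions rather than at valid ones, so encoder/decoder call sites that want a padding mask use its result directly, while the mean encoder inverts it back to a validity mask. A minimal sketch of the new convention, with made-up lengths purely for illustration:

import torch

def sequence_mask(lengths, max_len=None):
    """Boolean mask that is True at PAD positions (new convention)."""
    max_len = max_len or lengths.max()
    return torch.arange(0, max_len, device=lengths.device) >= lengths.unsqueeze(1)

lengths = torch.tensor([3, 1])        # two sequences padded to length 4
pad_mask = sequence_mask(lengths, 4)  # [[False, False, False,  True],
                                      #  [False,  True,  True,  True]]
valid = (~pad_mask).float()           # what MeanEncoder now uses for mean pooling
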
diff --git a/onmt/inputters/text_corpus.py b/onmt/inputters/text_corpus.py
index ab81718537..5641daae06 100644
--- a/onmt/inputters/text_corpus.py
+++ b/onmt/inputters/text_corpus.py
@@ -172,44 +172,22 @@ def __init__(
         self.stride = stride
         self.offset = offset

-    def _tokenize(self, stream):
-        for example in stream:
+    def _process(self, stream):
+        for i, example in enumerate(stream):
             example["src"] = example["src"].strip("\n").split()
             example["src_original"] = example["src_original"].strip("\n").split()
             if "src_feats" in example:
                 example["src_feats"] = [
                     feat.strip("\n").split() for feat in example["src_feats"]
                 ]
-            if example["tgt"] is not None:
-                example["tgt"] = example["tgt"].strip("\n").split()
-                example["tgt_original"] = example["tgt_original"].strip("\n").split()
-            if "align" in example:
-                example["align"] = example["align"].strip("\n").split()
-            yield example
-
-    def _transform(self, stream):
-        for example in stream:
-            # NOTE: moved to dynamic_iterator.py cf process()
-            # item = self.transform.apply(
-            #     example, is_train=self.infinitely, corpus_name=self.cid)
-            item = (example, self.transform, self.cid)
-            if item is not None:
-                yield item
-        report_msg = self.transform.stats()
-        if report_msg != "":
-            logger.info(
-                "* Transform statistics for {}({:.2f}%):\n{}\n".format(
-                    self.cid, 100 / self.stride, report_msg
-                )
-            )
-
-    def _add_index(self, stream):
-        for i, item in enumerate(stream):
-            example = item[0]
             line_number = i * self.stride + self.offset
             example["cid_line_number"] = line_number
             example["cid"] = self.cid
+            if "align" in example:
+                example["align"] = example["align"].strip("\n").split()
             if example["tgt"] is not None:
+                example["tgt"] = example["tgt"].strip("\n").split()
+                example["tgt_original"] = example["tgt_original"].strip("\n").split()
                 if (
                     len(example["src"]) == 0
                     or len(example["tgt"]) == 0
@@ -222,16 +200,21 @@ def _add_index(self, stream):
                 elif self.skip_empty_level == "warning":
                     logger.warning(empty_msg)
                 if len(example["src"]) == 0 and len(example["tgt"]) == 0:
-                    yield item
+                    yield (example, self.transform, self.cid)
                    continue
-            yield item
+            yield (example, self.transform, self.cid)
+        report_msg = self.transform.stats()
+        if report_msg != "":
+            logger.info(
+                "* Transform statistics for {}({:.2f}%):\n{}\n".format(
+                    self.cid, 100 / self.stride, report_msg
+                )
+            )

     def __iter__(self):
         corpus_stream = self.corpus.load(stride=self.stride, offset=self.offset)
-        tokenized_corpus = self._tokenize(corpus_stream)
-        transformed_corpus = self._transform(tokenized_corpus)
-        indexed_corpus = self._add_index(transformed_corpus)
-        yield from indexed_corpus
+        corpus = self._process(corpus_stream)
+        yield from corpus


 def build_corpora_iters(
diff --git a/onmt/model_builder.py b/onmt/model_builder.py
index e37e4dcce5..c78831a8c4 100644
--- a/onmt/model_builder.py
+++ b/onmt/model_builder.py
@@ -123,6 +123,7 @@ def load_test_model(opt, device_id=0, model_path=None):
         model_opt.attention_dropout = (
             0.0  # required to force no dropout at inference with flash
         )
+
     model = build_base_model(model_opt, vocabs)

     precision = torch.float32
@@ -162,6 +163,7 @@ def load_test_model(opt, device_id=0, model_path=None):
     for name, module in model.named_modules():
         if hasattr(module, "dropout_p"):
             module.dropout_p = 0.0
+
     return vocabs, model, model_opt
diff --git a/onmt/modules/global_attention.py b/onmt/modules/global_attention.py
index 493f49005d..9fb2b27c33 100644
--- a/onmt/modules/global_attention.py
+++ b/onmt/modules/global_attention.py
@@ -169,7 +169,7 @@ def forward(self, src, enc_out, src_len=None, coverage=None):
         align = self.score(src, enc_out)

         if src_len is not None:
-            mask = sequence_mask(src_len, max_len=align.size(-1))
+            mask = ~sequence_mask(src_len, max_len=align.size(-1))
             mask = mask.unsqueeze(1)  # Make it broadcastable.
             align.masked_fill_(~mask, -float("inf"))
diff --git a/onmt/modules/multi_headed_attn.py b/onmt/modules/multi_headed_attn.py
index 31813cb7e3..de5a6d085c 100644
--- a/onmt/modules/multi_headed_attn.py
+++ b/onmt/modules/multi_headed_attn.py
@@ -53,7 +53,7 @@ def relative_matmul(x: Tensor, z: Tensor, transpose: bool) -> Tensor:
     https://arxiv.org/pdf/1803.02155.pdf
     x shape [batch_size x heads x q_len x k_len]
     """
-    batch_size, heads, length = x.size()
+    batch_size, heads, length, _ = x.size()
     x_t = x.permute(2, 0, 1, 3)
     x_t_r = x_t.contiguous().view(length, heads * batch_size, -1)
     if transpose:
diff --git a/onmt/tests/pull_request_chk.sh b/onmt/tests/pull_request_chk.sh
index 0bb7b97131..99bfc81680 100755
--- a/onmt/tests/pull_request_chk.sh
+++ b/onmt/tests/pull_request_chk.sh
@@ -176,8 +176,7 @@ ${PYTHON} onmt/bin/train.py \
 [ "$?" -eq 0 ] || error_exit
 echo "Succeeded" | tee -a ${LOG_FILE}
-
-echo -n " [+] Testing NMT training w/ validation with dynamic scoring and copy ..."
+echo -n " [+] Testing NMT transformer training w/ validation with dynamic scoring and copy ..."
 ${PYTHON} onmt/bin/train.py \
         -config ${DATA_DIR}/data.yaml \
         -src_vocab $TMP_OUT_DIR/onmt.vocab.src \
@@ -200,6 +199,7 @@ ${PYTHON} onmt/bin/train.py \
         -tensorboard "true" \
         -scoring_debug "true" \
         -copy_attn \
+        -position_encoding \
         -dump_preds $TMP_OUT_DIR/dump_pred \
         -tensorboard_log_dir $TMP_OUT_DIR/logs_dynamic-scoring_and_copy >> ${LOG_FILE} 2>&1
@@ -208,6 +208,99 @@ ${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_dynamic-scoring_a
 echo "Succeeded" | tee -a ${LOG_FILE}
 rm -r $TMP_OUT_DIR/logs_dynamic-scoring_and_copy

+echo -n " [+] Testing NMT transformer training w/ validation with dynamic scoring and maxrelative ..."
+${PYTHON} onmt/bin/train.py \
+        -config ${DATA_DIR}/data.yaml \
+        -src_vocab $TMP_OUT_DIR/onmt.vocab.src \
+        -tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
+        -src_vocab_size 1000 \
+        -tgt_vocab_size 1000 \
+        -encoder_type transformer \
+        -decoder_type transformer \
+        -layers 4 \
+        -word_vec_size 16 \
+        -hidden_size 16 \
+        -num_workers 0 -bucket_size 1024 \
+        -heads 2 \
+        -transformer_ff 64 \
+        -bucket_size 1024 \
+        -train_steps 10 \
+        -report_every 2 \
+        -valid_steps 5 \
+        -valid_metrics "BLEU" "TER" \
+        -tensorboard "true" \
+        -scoring_debug "true" \
+        -max_relative_positions 8 \
+        -dump_preds $TMP_OUT_DIR/dump_pred \
+        -tensorboard_log_dir $TMP_OUT_DIR/logs_dynamic-scoring_and_relative >> ${LOG_FILE} 2>&1
+
+${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_dynamic-scoring_and_relative -tensorboard_checks valid_metrics
+[ "$?" -eq 0 ] || error_exit
+echo "Succeeded" | tee -a ${LOG_FILE}
+rm -r $TMP_OUT_DIR/logs_dynamic-scoring_and_relative
+
+echo -n " [+] Testing NMT transformer training w/ validation with dynamic scoring and rotary ..."
+${PYTHON} onmt/bin/train.py \
+        -config ${DATA_DIR}/data.yaml \
+        -src_vocab $TMP_OUT_DIR/onmt.vocab.src \
+        -tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
+        -src_vocab_size 1000 \
+        -tgt_vocab_size 1000 \
+        -encoder_type transformer \
+        -decoder_type transformer \
+        -layers 4 \
+        -word_vec_size 16 \
+        -hidden_size 16 \
+        -num_workers 0 -bucket_size 1024 \
+        -heads 2 \
+        -transformer_ff 64 \
+        -bucket_size 1024 \
+        -train_steps 10 \
+        -report_every 2 \
+        -valid_steps 5 \
+        -valid_metrics "BLEU" "TER" \
+        -tensorboard "true" \
+        -scoring_debug "true" \
+        -max_relative_positions -1 \
+        -dump_preds $TMP_OUT_DIR/dump_pred \
+        -tensorboard_log_dir $TMP_OUT_DIR/logs_dynamic-scoring_and_rotary >> ${LOG_FILE} 2>&1
+
+${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_dynamic-scoring_and_rotary -tensorboard_checks valid_metrics
+[ "$?" -eq 0 ] || error_exit
+echo "Succeeded" | tee -a ${LOG_FILE}
+rm -r $TMP_OUT_DIR/logs_dynamic-scoring_and_rotary
+
+echo -n " [+] Testing NMT transformer training w/ validation with dynamic scoring and alibi ..."
+${PYTHON} onmt/bin/train.py \
+        -config ${DATA_DIR}/data.yaml \
+        -src_vocab $TMP_OUT_DIR/onmt.vocab.src \
+        -tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
+        -src_vocab_size 1000 \
+        -tgt_vocab_size 1000 \
+        -encoder_type transformer \
+        -decoder_type transformer \
+        -layers 4 \
+        -word_vec_size 16 \
+        -hidden_size 16 \
+        -num_workers 0 -bucket_size 1024 \
+        -heads 2 \
+        -transformer_ff 64 \
+        -bucket_size 1024 \
+        -train_steps 10 \
+        -report_every 2 \
+        -valid_steps 5 \
+        -valid_metrics "BLEU" "TER" \
+        -tensorboard "true" \
+        -scoring_debug "true" \
+        -max_relative_positions -2 \
+        -dump_preds $TMP_OUT_DIR/dump_pred \
+        -tensorboard_log_dir $TMP_OUT_DIR/logs_dynamic-scoring_and_alibi >> ${LOG_FILE} 2>&1
+
+${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_dynamic-scoring_and_alibi -tensorboard_checks valid_metrics
+[ "$?" -eq 0 ] || error_exit
+echo "Succeeded" | tee -a ${LOG_FILE}
+rm -r $TMP_OUT_DIR/logs_dynamic-scoring_and_alibi
+
 echo -n " [+] Testing LM training..."
 ${PYTHON} onmt/bin/train.py \
         -config ${DATA_DIR}/lm_data.yaml \
diff --git a/onmt/translate/translation.py b/onmt/translate/translation.py
index a37ad283d9..ab462041f4 100644
--- a/onmt/translate/translation.py
+++ b/onmt/translate/translation.py
@@ -40,7 +40,7 @@ def _build_target_tokens(self, src, srclen, pred, attn, voc, dyn_voc):
             voc[tok]
             if tok < len(voc)
             else dyn_voc.ids_to_tokens[tok - len(self.vocabs["src"].ids_to_tokens)]
-            for tok in pred
+            for tok in pred.tolist()
         ]
         if tokens[-1] == DefaultTokens.EOS:
             tokens = tokens[:-1]
diff --git a/onmt/translate/translator.py b/onmt/translate/translator.py
index 75d730fc5b..41c607bf34 100644
--- a/onmt/translate/translator.py
+++ b/onmt/translate/translator.py
@@ -406,7 +406,7 @@ def _process_bucket(bucket_translations):
            bucket_translations = sorted(
                bucket_translations, key=lambda x: x.ind_in_bucket
            )
-            for j, trans in enumerate(bucket_translations):
+            for trans in bucket_translations:
                bucket_scores += [trans.pred_scores[: self.n_best]]
                bucket_score += trans.pred_scores[0]
                bucket_words += len(trans.pred_sents[0])
@@ -418,10 +418,6 @@ def _process_bucket(bucket_translations):
                    " ".join(pred) for pred in trans.pred_sents[: self.n_best]
                ]

-                n_best_scores = [
-                    score.item() for score in trans.pred_scores[: self.n_best]
-                ]
-
                if self.report_align:
                    align_pharaohs = [
                        build_align_pharaoh(align)
@@ -440,12 +436,14 @@ def _process_bucket(bucket_translations):

                bucket_predictions += [n_best_preds]

-                out_all = [
-                    pred + "\t" + str(score)
-                    for (pred, score) in zip(n_best_preds, n_best_scores)
-                ]
-
                if self.with_score:
+                    n_best_scores = [
+                        score.item() for score in trans.pred_scores[: self.n_best]
+                    ]
+                    out_all = [
+                        pred + "\t" + str(score)
+                        for (pred, score) in zip(n_best_preds, n_best_scores)
+                    ]
                    self.out_file.write("\n".join(out_all) + "\n")
                else:
                    self.out_file.write("\n".join(n_best_preds) + "\n")
@@ -506,6 +504,7 @@ def _process_bucket(bucket_translations):
        prev_idx = 0

        for batch, bucket_idx in infer_iter:
+
            batch_data = self.translate_batch(batch, attn_debug)

            translations = xlation_builder.from_batch(batch_data)
diff --git a/onmt/utils/misc.py b/onmt/utils/misc.py
index 5e3fa390fb..2b404c4998 100644
--- a/onmt/utils/misc.py
+++ b/onmt/utils/misc.py
@@ -54,14 +54,8 @@ def sequence_mask(lengths, max_len=None):
     """
     Creates a boolean mask from sequence lengths.
     """
-    batch_size = lengths.numel()
     max_len = max_len or lengths.max()
-    return (
-        torch.arange(0, max_len, device=lengths.device)
-        .type_as(lengths)
-        .repeat(batch_size, 1)
-        .lt(lengths.unsqueeze(1))
-    )
+    return torch.arange(0, max_len, device=lengths.device) >= lengths.unsqueeze(1)


 def tile(x, count, dim=0):
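
The multi_headed_attn.py change is a shape-unpacking fix: x passed to relative_matmul is 4-D ([batch, heads, q_len, k_len]), so the old three-value unpack of x.size() raised a ValueError. A rough sketch of the fixed path, with hypothetical sizes (2 sentences, 4 heads, length 5, 16-dim relative embeddings) and only the transpose=False branch shown:

import torch

x = torch.randn(2, 4, 5, 5)               # [batch_size, heads, q_len, k_len]
z = torch.randn(5, 5, 16)                 # relative position embeddings
batch_size, heads, length, _ = x.size()   # fixed: consume all four dims
x_t = x.permute(2, 0, 1, 3)               # [q_len, batch, heads, k_len]
x_t_r = x_t.contiguous().view(length, heads * batch_size, -1)
out = torch.matmul(x_t_r, z)              # [q_len, heads*batch, 16]
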