Add options to assign vocab and merges file for MockDataModule (#11678)

* Add options to assign vocab and merges file

Signed-off-by: Boxiang Wang <[email protected]>

* Apply isort and black reformatting

Signed-off-by: BoxiangW <[email protected]>

* Remove my settings

Signed-off-by: Boxiang Wang <[email protected]>

---------

Signed-off-by: Boxiang Wang <[email protected]>
Signed-off-by: BoxiangW <[email protected]>
Co-authored-by: BoxiangW <[email protected]>
BoxiangW and BoxiangW authored Dec 23, 2024
1 parent 9a2f0bd commit 8aa4b60
Showing 2 changed files with 17 additions and 1 deletion.
6 changes: 5 additions & 1 deletion nemo/collections/llm/gpt/data/mock.py
@@ -45,6 +45,8 @@ def __init__(
         pin_memory: bool = True,
         persistent_workers: bool = False,
         create_attention_mask: bool = False,
+        vocab_file: Optional[str] = None,
+        merges_file: Optional[str] = None,
     ):
         super().__init__()
         self.seq_length = seq_length
@@ -61,7 +63,9 @@ def __init__(
         if tokenizer is None:
             from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
 
-            self.tokenizer = get_nmt_tokenizer("megatron", "GPT2BPETokenizer")
+            self.tokenizer = get_nmt_tokenizer(
+                "megatron", "GPT2BPETokenizer", vocab_file=vocab_file, merges_file=merges_file
+            )
         else:
             self.tokenizer = tokenizer
 
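With this change, MockDataModule accepts an explicit GPT-2 vocab/merges pair and forwards it to get_nmt_tokenizer. A minimal usage sketch (the file paths are placeholders, not part of this commit, and the import path assumes the module location shown above):

```python
from nemo.collections.llm.gpt.data.mock import MockDataModule

# Placeholder paths to a GPT-2 style tokenizer; swap in real files.
data = MockDataModule(
    seq_length=2048,
    global_batch_size=8,
    micro_batch_size=1,
    vocab_file="/data/gpt2/vocab.json",
    merges_file="/data/gpt2/merges.txt",
)
```

When both options are left as None, the call should reduce to the previous behavior of building the default GPT2BPETokenizer.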
12 changes: 12 additions & 0 deletions scripts/llm/pretraining.py
@@ -154,6 +154,17 @@ def main():
 
     pretrain.trainer.max_steps = 1000
 
+    # Change here and add your files to custom_mounts
+    vocab_file = None
+    merges_file = None
+    pretrain.data = MockDataModule(
+        seq_length=pretrain.data.seq_length,
+        global_batch_size=pretrain.data.global_batch_size,
+        micro_batch_size=pretrain.data.micro_batch_size,
+        vocab_file=vocab_file,
+        merges_file=merges_file,
+    )
+
     executor: run.Executor
 
     if args.slurm:
@@ -166,6 +177,7 @@ def main():
             partition="",
             nodes=pretrain.trainer.num_nodes,
             devices=pretrain.trainer.devices,
+            custom_mounts=[],
         )
     else:
         executor = local_executor_torchrun(nodes=pretrain.trainer.num_nodes, devices=pretrain.trainer.devices)
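The added comment points at custom_mounts: when the script is launched through the Slurm executor, the vocab and merges files live on the host and must be mounted into the job's container so the paths handed to MockDataModule resolve. A hedged sketch of that pairing, with placeholder paths and an assumed "host_path:container_path" mount string format:

```python
# Placeholder host directory holding vocab.json and merges.txt.
host_tokenizer_dir = "/lustre/share/gpt2_tokenizer"

# Paths as the training job will see them inside the container.
vocab_file = "/workspace/tokenizer/vocab.json"
merges_file = "/workspace/tokenizer/merges.txt"

# Entry for the executor's custom_mounts list (assumed "src:dst" format),
# making the tokenizer files visible at the container paths above.
custom_mounts = [f"{host_tokenizer_dir}:/workspace/tokenizer"]
```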
