NeMo2.0 llama3 perf scripts #11702

Open · wants to merge 12 commits into base: main
129 changes: 129 additions & 0 deletions scripts/llm/performance/llama3_8b.py
@@ -0,0 +1,129 @@
from typing import Optional

import nemo_run as run
from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor

from nemo.collections.llm.recipes.llama3_8b import pretrain_recipe
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback
from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin


def llama3_8b_performance(
    compute_dtype: str,
    num_nodes: int,
    num_gpus_per_node: int,
    mbs: int,
    gbs: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: Optional[int],
    max_steps: int,
):
    recipe = pretrain_recipe(performance_mode=True)

    # data module configs
    recipe.data.micro_batch_size = mbs
    recipe.data.global_batch_size = gbs
    recipe.data.num_train_samples = max_steps * (num_nodes * num_gpus_per_node)  # ensure only 1 epoch for whole run
    recipe.data.tokenizer = hf_tokenizer("meta-llama/Meta-Llama-3-8B")

    recipe.trainer.max_steps = max_steps
    recipe.trainer.num_nodes = num_nodes
    recipe.trainer.devices = num_gpus_per_node

    # parallelism configs
    recipe.trainer.strategy.tensor_model_parallel_size = tp_size
    recipe.trainer.strategy.pipeline_model_parallel_size = pp_size
    recipe.trainer.strategy.context_parallel_size = cp_size
    recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size
    if tp_size > 1:
        recipe.trainer.strategy.sequence_parallel = True
    else:
        recipe.trainer.strategy.sequence_parallel = False

    comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)

    # compute dtype configs
    if compute_dtype.lower() == "fp8":
        recipe.trainer.plugins = bf16_with_fp8_mixed()
        recipe.trainer.plugins.grad_reduce_in_fp32 = False  # bf16 grad dtype

    # callback configs
    garbage_collection_callback = run.Config(
        GarbageCollectionCallback,
        gc_interval_train=100,
        gc_interval_val=500,
    )
    recipe.trainer.callbacks.extend(
        [
            garbage_collection_callback,
        ]
    )

    dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size)
    if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1:
        if comm_overlap_callback_idx >= 0:
            recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True

    return recipe


if __name__ == "__main__":
    args = parse_cli_args().parse_args()

    num_nodes = 1
    num_gpus_per_node = 8
    mbs = 1
    gbs = 128
    tp_size = 1
    pp_size = 1
    cp_size = 2
    vp_size = None
    max_steps = 100

    exp_name = f"llama3_8b_{args.compute_dtype}_{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}_{mbs}mbs_{gbs}gbs"

    executor = slurm_executor(
        args.account,
        args.partition,
        args.log_dir,
        num_nodes,
        num_gpus_per_node,
        args.time_limit,
        args.container_image,
        custom_mounts=[],
        custom_env_vars={},
        retries=0,
    )

    recipe = llama3_8b_performance(
        args.compute_dtype,
        num_nodes,
        num_gpus_per_node,
        mbs,
        gbs,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        max_steps,
    )

    with run.Experiment(exp_name) as exp:
        exp.add(
            recipe,
            executor=executor,
            name=exp_name,
            plugins=[
                PerfEnvPlugin(enable_vboost=True),
                NsysPlugin(start_step=5, end_step=6),
            ],
        )

        if not args.dryrun:
            exp.run(sequential=True, detach=True)
        else:
            exp.dryrun()
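
For reference, the defaults hard-coded above (1 node of 8 GPUs, tp=1, pp=1, cp=2, vp=None) give a data-parallel size of 4, so the overlap_param_gather_with_optimizer_step toggle never fires for this config; it only activates when the data-, pipeline- and virtual-pipeline-parallel sizes are all greater than 1. A minimal standalone sketch of that gating rule, using a hypothetical check_overlap_gate helper that is not part of this PR:

def check_overlap_gate(num_nodes, num_gpus_per_node, tp_size, pp_size, cp_size, vp_size):
    # Data-parallel size is whatever remains of the world size after TP/PP/CP.
    dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size)
    # Overlapping param gather with the optimizer step only applies when DP, PP and VP are all > 1.
    return dp_size > 1 and pp_size > 1 and vp_size is not None and vp_size > 1

print(check_overlap_gate(1, 8, 1, 1, 2, None))  # False: the defaults above give dp_size=4 but pp_size=1
print(check_overlap_gate(8, 8, 2, 4, 1, 5))     # True: dp_size=8, pp_size=4, vp_size=5
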
150 changes: 150 additions & 0 deletions scripts/llm/performance/utils.py
@@ -0,0 +1,150 @@
import argparse
import os
from typing import List, Optional

import nemo_run as run
from lightning.pytorch.callbacks.callback import Callback

from nemo.collections.common.tokenizers.huggingface import AutoTokenizer
from nemo.collections.llm.recipes.llama3_8b import MegatronCommOverlapCallback


def slurm_executor(
    account: str,
    partition: str,
    log_dir: str,
    nodes: int,
    num_gpus_per_node: int,
    time_limit: str = "01:00:00",
    container_image: str = "nvcr.io/nvidia/nemo:dev",
    custom_mounts: Optional[list[str]] = None,
    custom_env_vars: Optional[dict[str, str]] = None,
    retries: int = 0,
) -> run.SlurmExecutor:
    if not (log_dir and account and partition and nodes and num_gpus_per_node):
        raise RuntimeError(
            "Please set the account, partition, log_dir, nodes and num_gpus_per_node args to use this function."
        )

    mounts = []
    if custom_mounts:
        mounts.extend(custom_mounts)

    env_vars = {
        "TRANSFORMERS_OFFLINE": "1",
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
        "NCCL_NVLS_ENABLE": "0",
        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
        "NVTE_ASYNC_AMAX_REDUCTION": "1",
        "NVTE_FUSED_ATTN": "1",
        "NVTE_FLASH_ATTN": "0",
    }
    if custom_env_vars:
        env_vars |= custom_env_vars

    executor = run.SlurmExecutor(
        account=account,
        partition=partition,
        tunnel=run.LocalTunnel(
            job_dir=log_dir,
        ),
        nodes=nodes,
        ntasks_per_node=num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        mem="0",
        exclusive=True,
        gres="gpu:8",
        packager=run.GitArchivePackager(),
    )

    executor.container_image = container_image
    executor.container_mounts = mounts
    executor.env_vars = env_vars
    executor.retries = retries
    executor.time = time_limit

    return executor


def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]:
    """
    AutoTokenizer first searches for tokenizer files locally under the 'NEMO_HOME' env var.
    If the tokenizer files are not present locally, AutoTokenizer will try downloading them from HuggingFace.
    If a download is needed, make sure the env vars 'TRANSFORMERS_OFFLINE=0' and
    'HF_TOKEN=<token_value>' are set inside the NeMo container.
    """
    return run.Config(
        AutoTokenizer,
        pretrained_model_name=model_name,
        use_fast=True,
    )


def get_comm_overlap_callback_idx(callbacks: List[Callback]):
    if callbacks:  # default is None in lightning
        for idx, callback in enumerate(callbacks):
            if isinstance(callback, MegatronCommOverlapCallback):
                return idx
    return -1


def parse_cli_args():
    parser = argparse.ArgumentParser(description="NeMo2.0 Performance Pretraining and Fine-Tuning")

    parser.add_argument(
        "-a",
        "--account",
        type=str,
        help="Slurm account to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-p",
        "--partition",
        type=str,
        help="Slurm partition to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-l",
        "--log_dir",
        type=str,
        help="Directory for logging experiment results. Defaults to '~/nemo_log_dir'",
        required=False,
        default=os.path.expanduser("~/nemo_log_dir"),
    )
    parser.add_argument(
        "-t",
        "--time_limit",
        type=str,
        help="Maximum time limit to run experiment for. Defaults to 30 minutes (format: 'HH:MM:SS')",
        required=False,
        default="00:30:00",
    )
    parser.add_argument(
        "-i",
        "--container_image",
        type=str,
        help="NeMo container to use for experiment. Defaults to latest dev container 'nvcr.io/nvidia/nemo:dev'. "
        "Make sure your NGC credentials are accessible in your environment.",
        required=False,
        default="nvcr.io/nvidia/nemo:dev",
    )
    parser.add_argument(
        "-c",
        "--compute_dtype",
        type=str,
        help="Compute precision. Options: bf16 or fp8. Defaults to bf16",
        required=False,
        default="bf16",
    )
    parser.add_argument(
        "-d",
        "--dryrun",
        help="If set, prints the sbatch script to terminal without launching the experiment.",
        required=False,
        action="store_true",
    )

    return parser
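
The TRANSFORMERS_OFFLINE=0 / HF_TOKEN setup mentioned in the hf_tokenizer docstring can be passed through slurm_executor's custom_env_vars, which are merged over the default env_vars (overriding the TRANSFORMERS_OFFLINE=1 default). A minimal usage sketch, assuming HF_TOKEN is exported in the launching environment; the account and partition values are placeholders, not part of this PR:

import os

from utils import hf_tokenizer, slurm_executor

# Hypothetical values; replace with a real Slurm account/partition.
executor = slurm_executor(
    account="my_account",
    partition="my_partition",
    log_dir=os.path.expanduser("~/nemo_log_dir"),
    nodes=1,
    num_gpus_per_node=8,
    custom_env_vars={
        "TRANSFORMERS_OFFLINE": "0",         # overrides the default "1" set inside slurm_executor
        "HF_TOKEN": os.environ["HF_TOKEN"],  # forwarded so AutoTokenizer can download the tokenizer
    },
)
tokenizer_config = hf_tokenizer("meta-llama/Meta-Llama-3-8B")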