forked from huggingface/nanotron
Commit 72dc3a0, 1 parent eaaaca4. Showing 2 changed files with 140 additions and 0 deletions.
@@ -0,0 +1,95 @@
checkpoints:
  checkpoint_interval: 1000
  checkpoints_path: /home/manuel/projects/petagraph/nanotron-petagraph/logs/mini_logan/checkpoints
  checkpoints_path_is_shared_file_system: true
  resume_checkpoint_path: null # /users/burgerm/petagraph/logs/transcriptomics/base_ntp/checkpoints
  save_initial_state: false
data_stages:
- data:
    dataset: null # Custom dataloader will be used
    num_loading_workers: 0
    seed: 42
    sequence_files_path: "/home/manuel/projects/petagraph/projects2024-petagraph-ml/data/eukaryota_transcriptomics_500_400_2024-10-03_09-33-14.csv"
    all_sequences_resources_path: "/home/manuel/projects/petagraph/projects2024-petagraph-ml/data"
    prefetch_buffer_seq_size: 20000
  name: Stable Training Stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: mini-logan-ntp-transcriptomics
  run: mini-logan-v1
  seed: 42
  step: null
lighteval: null
logging:
  iteration_step_info_interval: 50
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
    truncated_normal_bound: 2.0
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 1
    hidden_act: silu
    hidden_size: 128
    initializer_range: 0.025
    intermediate_size: 2048
    is_llama_config: true
    max_position_embeddings: 4096
    num_attention_heads: 2
    num_hidden_layers: 16
    num_key_value_heads: 2
    pad_token_id: 2
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 8
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 1.0e-04
    lr_decay_starting_step: 50_000
    lr_decay_steps: null
    lr_decay_style: cosine
    lr_warmup_steps: 5_000
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-07
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 1
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8
  sequence_length: 4096
  train_steps: 1_000_000
  val_check_interval: -1
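
For a quick read on what the tokens and parallelism sections above imply, here is a small shell sketch (not part of the commit; the numbers are copied by hand from the config) that derives the effective batch size and token budget. Note that with dp, tp, and pp all set to 1 this particular config describes a single-process run; the multi-node launch script below points at its own config path.

#!/bin/bash
# Sketch only: values copied by hand from the config above
# (parallelism.dp, tokens.batch_accumulation_per_replica,
#  tokens.micro_batch_size, tokens.sequence_length, tokens.train_steps).
dp=1
batch_accumulation_per_replica=1
micro_batch_size=8
sequence_length=4096
train_steps=1000000

global_batch_size=$(( dp * batch_accumulation_per_replica * micro_batch_size ))
tokens_per_step=$(( global_batch_size * sequence_length ))
total_tokens=$(( tokens_per_step * train_steps ))

echo "global batch size (sequences/step): ${global_batch_size}"   # 8
echo "tokens per optimizer step:          ${tokens_per_step}"     # 32768
echo "planned training tokens:            ${total_tokens}"        # 32768000000 (~32.8B)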
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=petagraph-1b      # create a short name for your job
#SBATCH --nodes=24                   # total number of nodes
#SBATCH --ntasks-per-node=1          # total number of tasks per node
#SBATCH --gpus-per-task=4
#SBATCH --time=6:00:00
#SBATCH --output=/users/burgerm/petagraph/logs/transcriptomics/slurm/peta-1b_slurm_%x_%j.log
#SBATCH --partition=normal
#SBATCH --account=a02
#SBATCH --cpus-per-task=288
#SBATCH --reservation=sai-a02
#SBATCH --mem=460000

# Initialization.
set -x
cat $0

# random master port in the range 20000 - 30000
export MASTER_PORT=$((20000 + RANDOM % 10000))
export MASTER_ADDR=$(hostname)
export CUDA_DEVICE_MAX_CONNECTIONS=1  # required by nanotron

# Run main script.
srun -ul --environment=petagraph_python_env bash -c "
# Change cwd and run the main training script.
cd /users/burgerm/petagraph/nanotron-petagraph
pip install -e ./  # Only required the first time.
export OMP_NUM_THREADS=32
TORCHRUN_ARGS=\"
--node-rank=\${SLURM_PROCID} \
--master-addr=\${MASTER_ADDR} \
--master-port=\${MASTER_PORT} \
--nnodes=\${SLURM_NNODES} \
--nproc-per-node=\${SLURM_GPUS_PER_TASK}
\"
echo \"Running with node rank \${SLURM_PROCID}\"
echo \"Running with master addr \${MASTER_ADDR}\"
echo \"Running with master port \${MASTER_PORT}\"
echo \"Running with nnodes \${SLURM_NNODES}\"
numactl --membind=0-3 torchrun \${TORCHRUN_ARGS} petagraph/run_train.py --config-file /users/burgerm/petagraph/logs/transcriptomics/base_ntp/config_petagraph_multi_node.yaml
"