diff --git a/petagraph/mini_logan/config_petagraph_single_gpu.yaml b/petagraph/mini_logan/config_petagraph_single_gpu.yaml
new file mode 100644
index 00000000..69799c15
--- /dev/null
+++ b/petagraph/mini_logan/config_petagraph_single_gpu.yaml
@@ -0,0 +1,95 @@
+checkpoints:
+  checkpoint_interval: 1000
+  checkpoints_path: /home/manuel/projects/petagraph/nanotron-petagraph/logs/mini_logan/checkpoints
+  checkpoints_path_is_shared_file_system: true
+  resume_checkpoint_path: null # /users/burgerm/petagraph/logs/transcriptomics/base_ntp/checkpoints
+  save_initial_state: false
+data_stages:
+- data:
+    dataset: null # Custom dataloader will be used
+    num_loading_workers: 0
+    seed: 42
+    sequence_files_path: "/home/manuel/projects/petagraph/projects2024-petagraph-ml/data/eukaryota_transcriptomics_500_400_2024-10-03_09-33-14.csv"
+    all_sequences_resources_path: "/home/manuel/projects/petagraph/projects2024-petagraph-ml/data"
+    prefetch_buffer_seq_size: 20000
+  name: Stable Training Stage
+  start_training_step: 1
+general:
+  benchmark_csv_path: null
+  consumed_train_samples: null
+  ignore_sanity_checks: true
+  project: mini-logan-ntp-transcriptomics
+  run: mini-logan-v1
+  seed: 42
+  step: null
+lighteval: null
+logging:
+  iteration_step_info_interval: 50
+  log_level: info
+  log_level_replica: info
+model:
+  ddp_bucket_cap_mb: 25
+  dtype: bfloat16
+  init_method:
+    std: 0.025
+    truncated_normal_bound: 2.0
+  make_vocab_size_divisible_by: 1
+  model_config:
+    bos_token_id: 0
+    eos_token_id: 1
+    hidden_act: silu
+    hidden_size: 128
+    initializer_range: 0.025
+    intermediate_size: 2048
+    is_llama_config: true
+    max_position_embeddings: 4096
+    num_attention_heads: 2
+    num_hidden_layers: 16
+    num_key_value_heads: 2
+    pad_token_id: 2
+    pretraining_tp: 1
+    rms_norm_eps: 1.0e-05
+    rope_scaling: null
+    tie_word_embeddings: true
+    use_cache: true
+    vocab_size: 8
+optimizer:
+  accumulate_grad_in_fp32: true
+  clip_grad: 1.0
+  learning_rate_scheduler:
+    learning_rate: 1.0e-04
+    lr_decay_starting_step: 50_000
+    lr_decay_steps: null
+    lr_decay_style: cosine
+    lr_warmup_steps: 5_000
+    lr_warmup_style: linear
+    min_decay_lr: 1.0e-05
+  optimizer_factory:
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    adam_eps: 1.0e-07
+    name: adamW
+    torch_adam_is_fused: true
+  weight_decay: 0.01
+  zero_stage: 0
+parallelism:
+  dp: 1
+  expert_parallel_size: 1
+  pp: 1
+  pp_engine: 1f1b
+  tp: 1
+  tp_linear_async_communication: true
+  tp_mode: REDUCE_SCATTER
+profiler: null
+tokenizer:
+  tokenizer_max_length: null
+  tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel
+  tokenizer_revision: null
+tokens:
+  batch_accumulation_per_replica: 1
+  limit_test_batches: 0
+  limit_val_batches: 0
+  micro_batch_size: 8
+  sequence_length: 4096
+  train_steps: 1_000_000
+  val_check_interval: -1
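The diff adds this single-GPU config but only a multi-node launcher; a minimal single-GPU invocation might look like the sketch below. This is a hypothetical example, not part of the diff: it assumes the nanotron-petagraph checkout as the working directory and that `petagraph/run_train.py` accepts `--config-file`, as in `run_multi_node.sh` further down.

```bash
# Hypothetical single-GPU launch of the config above (sketch, not part of
# the diff). Assumes the nanotron-petagraph repo root as working directory.
export CUDA_DEVICE_MAX_CONNECTIONS=1  # required by nanotron (see run_multi_node.sh)
torchrun --nproc-per-node=1 petagraph/run_train.py \
    --config-file petagraph/mini_logan/config_petagraph_single_gpu.yaml
```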
diff --git a/petagraph/mini_logan/run_multi_node.sh b/petagraph/mini_logan/run_multi_node.sh
new file mode 100644
index 00000000..22d079a9
--- /dev/null
+++ b/petagraph/mini_logan/run_multi_node.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=petagraph-1b   # create a short name for your job
+#SBATCH --nodes=24                # total number of nodes
+#SBATCH --ntasks-per-node=1       # total number of tasks per node
+#SBATCH --gpus-per-task=4
+#SBATCH --time=6:00:00
+#SBATCH --output=/users/burgerm/petagraph/logs/transcriptomics/slurm/peta-1b_slurm_%x_%j.log
+#SBATCH --partition=normal
+#SBATCH --account=a02
+#SBATCH --cpus-per-task=288
+#SBATCH --reservation=sai-a02
+#SBATCH --mem=460000
+
+# Initialization.
+set -x
+cat $0
+
+# random master port in the range 20000 - 30000
+export MASTER_PORT=$((20000 + RANDOM % 10000))
+export MASTER_ADDR=$(hostname)
+export CUDA_DEVICE_MAX_CONNECTIONS=1 # required by nanotron
+
+# Run main script.
+srun -ul --environment=petagraph_python_env bash -c "
+    # Change cwd and run the main training script.
+    cd /users/burgerm/petagraph/nanotron-petagraph
+    pip install -e ./  # Only required the first time.
+
+    export OMP_NUM_THREADS=32
+
+    TORCHRUN_ARGS=\"
+        --node-rank=\${SLURM_PROCID} \
+        --master-addr=\${MASTER_ADDR} \
+        --master-port=\${MASTER_PORT} \
+        --nnodes=\${SLURM_NNODES} \
+        --nproc-per-node=\${SLURM_GPUS_PER_TASK}
+    \"
+
+    echo \"Running with node rank \${SLURM_PROCID}\"
+    echo \"Running with master addr \${MASTER_ADDR}\"
+    echo \"Running with master port \${MASTER_PORT}\"
+    echo \"Running with nnodes \${SLURM_NNODES}\"
+
+    numactl --membind=0-3 torchrun \${TORCHRUN_ARGS} petagraph/run_train.py --config-file /users/burgerm/petagraph/logs/transcriptomics/base_ntp/config_petagraph_multi_node.yaml
+"
\ No newline at end of file
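For completeness, a usage sketch (not part of the diff): the script is submitted with `sbatch`, and because `--nnodes` is taken from `SLURM_NNODES` at run time, the node count in the `#SBATCH` header can be overridden at submission without editing the file.

```bash
# Submit as-is (24 nodes, per the #SBATCH header above).
sbatch petagraph/mini_logan/run_multi_node.sh

# Or override the node count at submission time; torchrun picks up the
# new value through SLURM_NNODES, so the script needs no edits.
sbatch --nodes=4 petagraph/mini_logan/run_multi_node.sh
```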