forked from huggingface/nanotron
Commit 72dc3a0, 1 parent eaaaca4. Showing 2 changed files with 140 additions and 0 deletions.
@@ -0,0 +1,95 @@
checkpoints:
  checkpoint_interval: 1000
  checkpoints_path: /home/manuel/projects/petagraph/nanotron-petagraph/logs/mini_logan/checkpoints
  checkpoints_path_is_shared_file_system: true
  resume_checkpoint_path: null # /users/burgerm/petagraph/logs/transcriptomics/base_ntp/checkpoints
  save_initial_state: false
data_stages:
- data:
    dataset: null # Custom dataloader will be used
    num_loading_workers: 0
    seed: 42
    sequence_files_path: "/home/manuel/projects/petagraph/projects2024-petagraph-ml/data/eukaryota_transcriptomics_500_400_2024-10-03_09-33-14.csv"
    all_sequences_resources_path: "/home/manuel/projects/petagraph/projects2024-petagraph-ml/data"
    prefetch_buffer_seq_size: 20000
  name: Stable Training Stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: mini-logan-ntp-transcriptomics
  run: mini-logan-v1
  seed: 42
  step: null
lighteval: null
logging:
  iteration_step_info_interval: 50
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
    truncated_normal_bound: 2.0
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 1
    hidden_act: silu
    hidden_size: 128
    initializer_range: 0.025
    intermediate_size: 2048
    is_llama_config: true
    max_position_embeddings: 4096
    num_attention_heads: 2
    num_hidden_layers: 16
    num_key_value_heads: 2
    pad_token_id: 2
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 8
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 1.0e-04
    lr_decay_starting_step: 50_000
    lr_decay_steps: null
    lr_decay_style: cosine
    lr_warmup_steps: 5_000
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-07
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 1
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8
  sequence_length: 4096
  train_steps: 1_000_000
  val_check_interval: -1
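
For a quick read on what the tokens and parallelism sections above imply, here is a small shell sketch (not part of the commit; the numbers are copied by hand from the config) that derives the effective batch size and token budget. Note that with dp, tp, and pp all set to 1 this particular config describes a single-process run; the multi-node launch script below points at its own config path.

#!/bin/bash
# Sketch only: values copied by hand from the config above
# (parallelism.dp, tokens.batch_accumulation_per_replica,
#  tokens.micro_batch_size, tokens.sequence_length, tokens.train_steps).
dp=1
batch_accumulation_per_replica=1
micro_batch_size=8
sequence_length=4096
train_steps=1000000

global_batch_size=$(( dp * batch_accumulation_per_replica * micro_batch_size ))
tokens_per_step=$(( global_batch_size * sequence_length ))
total_tokens=$(( tokens_per_step * train_steps ))

echo "global batch size (sequences/step): ${global_batch_size}"   # 8
echo "tokens per optimizer step:          ${tokens_per_step}"     # 32768
echo "planned training tokens:            ${total_tokens}"        # 32768000000 (~32.8B)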
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=petagraph-1b      # create a short name for your job
#SBATCH --nodes=24                   # total number of nodes
#SBATCH --ntasks-per-node=1          # total number of tasks per node
#SBATCH --gpus-per-task=4
#SBATCH --time=6:00:00
#SBATCH --output=/users/burgerm/petagraph/logs/transcriptomics/slurm/peta-1b_slurm_%x_%j.log
#SBATCH --partition=normal
#SBATCH --account=a02
#SBATCH --cpus-per-task=288
#SBATCH --reservation=sai-a02
#SBATCH --mem=460000

# Initialization.
set -x
cat $0

# random master port in the range 20000 - 30000
export MASTER_PORT=$((20000 + RANDOM % 10000))
export MASTER_ADDR=$(hostname)
export CUDA_DEVICE_MAX_CONNECTIONS=1  # required by nanotron

# Run main script.
srun -ul --environment=petagraph_python_env bash -c "
# Change cwd and run the main training script.
cd /users/burgerm/petagraph/nanotron-petagraph
pip install -e ./  # Only required the first time.
export OMP_NUM_THREADS=32
TORCHRUN_ARGS=\"
--node-rank=\${SLURM_PROCID} \
--master-addr=\${MASTER_ADDR} \
--master-port=\${MASTER_PORT} \
--nnodes=\${SLURM_NNODES} \
--nproc-per-node=\${SLURM_GPUS_PER_TASK}
\"
echo \"Running with node rank \${SLURM_PROCID}\"
echo \"Running with master addr \${MASTER_ADDR}\"
echo \"Running with master port \${MASTER_PORT}\"
echo \"Running with nnodes \${SLURM_NNODES}\"
numactl --membind=0-3 torchrun \${TORCHRUN_ARGS} petagraph/run_train.py --config-file /users/burgerm/petagraph/logs/transcriptomics/base_ntp/config_petagraph_multi_node.yaml
"