Minor changes for run #28

Merged (4 commits) on May 18, 2024
@@ -14,7 +14,7 @@ train_config:
  fp16: false
  tf32: true
  evaluation_strategy: "steps"
-  save_total_limit: 4
+  save_total_limit: 8
  grad_accumulation_scheduler: false
  dynamic_grad_accumulation: false
  grad_accumulation_patience: 4000
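Note: save_total_limit is the Hugging Face TrainingArguments setting that caps how many checkpoints are kept on disk (older ones are deleted as newer ones are written), so going from 4 to 8 simply retains twice as many recent checkpoints at the cost of storage. The grad_accumulation_* keys around it look like this repo's own dynamic gradient-accumulation settings and are left unchanged here.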
@@ -27,6 +27,7 @@ model_config:
  block_size: 2048
  vocab_size: 256000
  separator_token: <bos>
+  separator_token_id: 2
  # tokenizer_path: "./chemlactica/tokenizer/GemmaTokenizer"
  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
  # tokenizer_path: "google/gemma-2b"
31 changes: 31 additions & 0 deletions chemlactica/config/config_yamls/gemma_2b_sft_config.yaml
@@ -0,0 +1,31 @@
+train_config:
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  batch_size: 500000
+  dropout_prob: 0.1
+  eval_step: 256
+  global_gradient_norm: 1.0
+  learning_rate_decay: 0.1
+  max_learning_rate: 2.0e-5
+  warmup_steps: 0
+  weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "steps"
+  save_total_limit: 4
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
+model_config:
+  n_heads: 12
+  n_layers: 18
+  block_size: 2048
+  vocab_size: 256000
+  separator_token: <bos>
+  separator_token_id: 2
+  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
1 change: 1 addition & 0 deletions chemlactica/config/default_train_config.py
@@ -6,6 +6,7 @@ class ModelConfig:
    block_size: int = 2048
    vocab_size: int = 50000
    separator_token: str = "</s>"
+    separator_token_id: int = 2
    tokenizer_path: str = "chemlactica/tokenizer/ChemLacticaTokenizer66"


2 changes: 1 addition & 1 deletion chemlactica/config/galactica_accelerate_config.yaml
@@ -5,7 +5,7 @@ fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_offload_params: false
-  fsdp_forward_prefetch: false
+  fsdp_forward_prefetch: true
  fsdp_sharding_strategy: 1
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: OPTForCausalLM
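Setting fsdp_forward_prefetch to true maps to PyTorch FSDP's forward_prefetch flag: each rank begins all-gathering the next layer's parameters while the current layer's forward pass is still running, trading a bit of extra peak memory for better overlap of communication and compute. The rest of the FSDP setup (transformer-based auto-wrapping, BACKWARD_PRE prefetch, sharding strategy 1, i.e. FULL_SHARD, and a full state dict) is unchanged.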
2 changes: 1 addition & 1 deletion chemlactica/custom_trainer.py
@@ -38,7 +38,7 @@ class CustomArguments(TrainingArguments):
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        # the number of samples to print when the training begins, for debugging purposes
-        self.num_samples_to_print = 5
+        self.num_samples_to_print = 10
        self.tokenizer_path = kwargs["args"].tokenizer_path
        super().__init__(*args, **kwargs)

4 changes: 2 additions & 2 deletions chemlactica/jsonl_dataset.py
@@ -56,8 +56,8 @@ def samples_generator(
                distributed_state.process_index,
            ):
                returned = True
-            ret = format_sample(line)
-            yield ret
+                ret = format_sample(line)
+                yield ret
            counter = counter + 1
            shared_jsonl_files[file] = state
            line = f.readline()
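Leading whitespace is lost in this rendering, so the two deleted and the two re-added lines read identically; the most plausible reading, assumed in the reconstruction above, is an indentation fix that moves format_sample/yield inside the process-ownership check so that only the rank that owns a line yields it. A minimal sketch of that pattern, with the ownership test reduced to a counter modulo the number of processes (the real generator takes more arguments and tracks file state differently):

```python
def samples_generator(files, shared_jsonl_files, distributed_state, format_sample):
    """Yield formatted samples from JSONL files, sharded across processes."""
    counter = 0
    for file in files:
        with open(file) as f:
            line = f.readline()
            while line:
                # only the process that "owns" this line formats and yields it
                if counter % distributed_state.num_processes == distributed_state.process_index:
                    ret = format_sample(line)
                    yield ret
                counter = counter + 1
                shared_jsonl_files[file] = f.tell()  # remember how far this file has been read
                line = f.readline()
```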
2 changes: 1 addition & 1 deletion chemlactica/utils/dataset_utils.py
@@ -122,7 +122,7 @@ def process_dataset(
    assay=True,
):
    tokenizer = get_tokenizer(model_config.tokenizer_path)
-    eos_token_id = tokenizer.eos_token_id
+    eos_token_id = model_config.separator_token_id
    rng = np.random.default_rng()

    if assay:
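This pairs with the new separator_token_id field added above: the separator id used when building training sequences is now pinned in the model config instead of being read from tokenizer.eos_token_id, which matters for the Gemma configs where the separator is <bos> (id 2) rather than the eos token. A hypothetical illustration of the role this id plays (pack_documents is not the repo's actual function, just a sketch):

```python
# Hypothetical sketch: concatenate tokenized documents, inserting the configured
# separator id between them instead of whatever the tokenizer reports as eos.
def pack_documents(token_lists, separator_token_id):
    packed = []
    for tokens in token_lists:
        packed.extend(tokens)
        packed.append(separator_token_id)  # e.g. 2, Gemma's <bos>, per the configs above
    return packed

print(pack_documents([[5, 6], [7]], separator_token_id=2))  # [5, 6, 2, 7, 2]
```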
2 changes: 1 addition & 1 deletion chemlactica/utils/text_format_utils.py
@@ -89,7 +89,7 @@ def generate_formatted_string(compound_json, rng, model_config):
    key_value_pairs = []
    key = "SMILES"
    value = compound_json.get(key, "")
-    if rng.integers(0, 1) == 0:
+    if rng.integers(2) == 0:
        if value:
            key_value_pairs.append(format_key_value(key, value, rng))
        del compound_json[key]
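This one is a real behavioral fix: NumPy's Generator.integers treats the upper bound as exclusive by default, so rng.integers(0, 1) can only ever return 0 and the branch was taken every time, whereas rng.integers(2) draws 0 or 1 with equal probability, presumably making the SMILES-first choice an actual coin flip. A quick check:

```python
import numpy as np

rng = np.random.default_rng(0)

# old condition: high is exclusive, so this can only ever produce 0 -> always True
print(set(int(rng.integers(0, 1)) for _ in range(1000)))   # {0}

# new condition: draws uniformly from {0, 1} -> True about half the time
draws = [int(rng.integers(2)) for _ in range(1000)]
print(sorted(set(draws)), sum(d == 0 for d in draws) / 1000)  # [0, 1] ~0.5
```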
321 changes: 103 additions & 218 deletions notebooks/playground.ipynb

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions submit_run.py
@@ -5,24 +5,24 @@
import submitit

use_accelerate = True
-rsync_enabled = False
-executor_name = "local" # options are ["slurm", "local"]
+rsync_enabled = True
+executor_name = "slurm" # options are ["slurm", "local"]
root_path = ""
-num_gpus = 2
+num_gpus = 6
model_name = "galactica"
model_size = "125m"
train_type = "pretrain"
train_name = "_".join([model_name, model_size, train_type])
-job_name = "gal_relform"
+job_name = "gal_relform2"

slurm_params = {
    "slurm_job_name": job_name,
-    "timeout_min": 30,
+    "timeout_min": 60 * 24 * 2,
    "nodes": 1,
    "tasks_per_node": 1,
    "gpus_per_node": num_gpus,
    "cpus_per_task": num_gpus * 20,
-    "mem_gb": num_gpus * 20.0 + 20.0,
+    "mem_gb": num_gpus * 40.0 + 20.0,
    "stderr_to_stdout": True,
}
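In concrete terms the resource request grows substantially: timeout_min goes from 30 minutes to 60 * 24 * 2 = 2880 minutes (2 days), and with num_gpus raised from 2 to 6 the job now asks for 6 * 20 = 120 CPUs and 6 * 40.0 + 20.0 = 260 GB of memory, versus 40 CPUs and 60 GB before.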

@@ -50,7 +50,7 @@
    "dataloader_num_workers": 1,
    "experiment_name": job_name,
    "checkpoints_root_dir": "/nfs/dgx/raid/chem/checkpoints/",
-    "flash_attn": False,
+    "flash_attn": True,
    "track": True,
    "track_dir": "/nfs/dgx/raid/chem/aim/",
    # "profile":,
12 changes: 6 additions & 6 deletions submit_run_gemma.py
@@ -8,20 +8,20 @@
rsync_enabled = True
executor_name = "slurm" # options are ["slurm", "local"]
root_path = ""
-num_gpus = 4
+num_gpus = 3
model_name = "gemma"
model_size = "2b"
train_type = "pretrain"
train_name = "_".join([model_name, model_size, train_type])
-job_name = "gemma_4Btokens"
+job_name = "gemma_400Mtokens_qedfirst"

slurm_params = {
    "slurm_job_name": job_name,
    "timeout_min": 60 * 24 * 2,
    "nodes": 1,
    "tasks_per_node": 1,
    "gpus_per_node": num_gpus,
-    "cpus_per_task": num_gpus * 11,
+    "cpus_per_task": num_gpus * 17,
    "mem_gb": num_gpus * 30.0 + 20.0,
    "stderr_to_stdout": True,
}
@@ -43,10 +43,10 @@
    "training_data_dirs": "/nfs/ap/mnt/sxtn/rdkit_computed_rel+form/train_rdkit_computed_rel+form",
    # "training_data_dirs": "/auto/home/menuab/code/data",
    "valid_data_dir": "/nfs/ap/mnt/sxtn/rdkit_computed_rel+form/valid_rdkit_computed_rel+form",
-    "max_steps": 30000,
+    "max_steps": 2100,
    # "num_train_epochs": 2,
    "eval_steps": 0,
-    "save_steps": 5000,
+    "save_steps": 1000,
    "train_batch_size": 1,
    # "valid_batch_size":,s
    "dataloader_num_workers": 1,
@@ -57,7 +57,7 @@
    "track_dir": "/nfs/dgx/raid/chem/aim/",
    # "profile":,
    # "profile_dir":,
-    "gradient_accumulation_steps": 16,
+    "gradient_accumulation_steps": 32,
    # "gradient_checkpointing": False,
    # "evaluate_only":,
    # "check_reproducability":,
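A rough token budget implied by the new numbers, assuming the usual effective-batch arithmetic (per-device batch of 1 sequence, times gradient accumulation, times GPU count): 1 * 32 * 3 = 96 sequences of block_size 2048 per optimizer step is about 196K tokens, so max_steps = 2100 comes to roughly 413M tokens, which lines up with the job being renamed from gemma_4Btokens to gemma_400Mtokens_qedfirst.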