Minor changes for run #31

Merged · 9 commits · May 23, 2024
@@ -6,7 +6,7 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
+  max_learning_rate: 1.0e-4
   warmup_steps: 0
   weight_decay: 0.1
   bf16: true
@@ -27,4 +27,5 @@ model_config:
   block_size: 2048
   vocab_size: 50000
   separator_token: </s>
+  separator_token_id: 2
   tokenizer_path: "./chemlactica/tokenizer/ChemLacticaTokenizer66"
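
The new separator_token_id should match the id the tokenizer assigns to </s>. A minimal sanity-check sketch, assuming the tokenizer directory from this config is available locally:

# Sketch: verify separator_token_id against the tokenizer (assumes the local tokenizer path).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./chemlactica/tokenizer/ChemLacticaTokenizer66")
sep_id = tok.convert_tokens_to_ids("</s>")
assert sep_id == 2, f"config says separator_token_id: 2, tokenizer reports {sep_id}"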
6 changes: 3 additions & 3 deletions chemlactica/config/config_yamls/gemma_2b_sft_config.yaml
@@ -6,15 +6,15 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
-  warmup_steps: 0
+  max_learning_rate: 1.0e-4
+  warmup_steps: 305
   weight_decay: 0.1
   bf16: true
   bf16_full_eval: true
   fp16: false
   tf32: true
   evaluation_strategy: "steps"
-  save_total_limit: 4
+  save_total_limit: 8
   grad_accumulation_scheduler: false
   dynamic_grad_accumulation: false
   grad_accumulation_patience: 4000
4 changes: 2 additions & 2 deletions chemlactica/config/default_train_config.py
@@ -42,5 +42,5 @@ class TrainConfig:
 @dataclass
 class SFTTrainConfig:
     packing: bool = False
-    max_seq_length: int = 512
-    neftune_noise_alpha: int = 0
+    max_seq_length: int = 64
+    neftune_noise_alpha: int = 10
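
These defaults correspond to arguments of the same names on trl's SFTTrainer. The sketch below shows one way they might be forwarded; the trainer wiring and the model, training_args, dataset, formatting function, and collator variables are assumptions for illustration, not code from this repository.

# Sketch: forwarding the SFT config to trl's SFTTrainer (wiring and variable names are assumed).
from trl import SFTTrainer

sft_config = SFTTrainConfig()  # packing=False, max_seq_length=64, neftune_noise_alpha=10
trainer = SFTTrainer(
    model=model,                        # placeholder: the loaded causal LM
    args=training_args,                 # placeholder: transformers.TrainingArguments
    train_dataset=dataset["train"],     # placeholder: the SFT dataset
    formatting_func=sft_formatting_prompts_func,
    data_collator=collator,
    packing=sft_config.packing,
    max_seq_length=sft_config.max_seq_length,             # short sequences: SMILES plus one property
    neftune_noise_alpha=sft_config.neftune_noise_alpha,   # NEFTune embedding noise; 0 disables it
)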
2 changes: 1 addition & 1 deletion chemlactica/get_trainer.py
@@ -25,7 +25,7 @@ def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_
     elif train_type == "sft":
         sft_config = SFTTrainConfig()
         tokenizer = get_tokenizer(training_args.tokenizer_path)
-        response_template = "[PROPERTY]activity "
+        response_template = tokenizer.encode("[PROPERTY]activity")
         collator = DataCollatorForCompletionOnlyLM(
             response_template, tokenizer=tokenizer
         )
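
Passing token ids instead of the raw string avoids mismatches that can occur when the template tokenizes differently inside a full training sequence; trl's DataCollatorForCompletionOnlyLM accepts either form. A minimal sketch, reusing the tokenizer path from the config above; the SMILES example is illustrative:

# Sketch: completion-only label masking with a token-id response template.
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("./chemlactica/tokenizer/ChemLacticaTokenizer66")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # padding is needed once batches hold mixed lengths

response_template = tokenizer.encode("[PROPERTY]activity")  # list of token ids
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

example = tokenizer("[START_SMILES]CCO[END_SMILES][PROPERTY]activity 0.5[/PROPERTY]")
batch = collator([example])
# Labels up to and including the response template are set to -100, so the loss
# is computed only on the completion (the activity value and the closing tag).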
6 changes: 3 additions & 3 deletions chemlactica/train.py
@@ -66,6 +66,7 @@ def train(
     dir_data_types,
     valid_data_dir,
     learning_rate,
+    warmup_steps,
     scheduler_max_steps,
     eval_steps,
     save_steps,
@@ -115,7 +116,6 @@ def train(
         gradient_checkpointing=gradient_checkpointing,
         auth_token=auth_token,
     )
-
     # special_tokens = get_tokenizer_special_tokens(model_config.tokenizer_path)
     # print(f"{len(special_tokens)} {special_tokens} additional special tokens.")
     tokenizer_length = get_tokenizer_length(model_config)
@@ -237,7 +237,7 @@ def train(
         weight_decay=train_config.weight_decay,
         adam_beta1=train_config.adam_beta1,
         adam_beta2=train_config.adam_beta2,
-        warmup_steps=train_config.warmup_steps,
+        warmup_steps=warmup_steps if warmup_steps else train_config.warmup_steps,
         max_grad_norm=train_config.global_gradient_norm,
         evaluation_strategy=train_config.evaluation_strategy,
         max_steps=scheduler_max_steps,
@@ -285,7 +285,7 @@ def train(
     )
     if train_type == "sft":
         trainer_callback_dict["SFT numerical evaluation"] = SFTNumericalEval(
-            dataset, aim_callback
+            dataset, aim_callback, model_config.separator_token
         )
     elif train_type == "pretrain":
         if train_config.grad_accumulation_scheduler:
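
The new warmup_steps argument overrides the config value only when it is provided. A small self-contained sketch of the fallback expression used above; the helper name is illustrative:

# Sketch of the override pattern: the CLI value wins when given, otherwise fall back to the config.
def resolve_warmup(cli_warmup_steps, config_warmup_steps):
    # Note: `x if x else y` also falls back when the CLI passes 0 explicitly;
    # `x if x is not None else y` would preserve an explicit 0.
    return cli_warmup_steps if cli_warmup_steps else config_warmup_steps

assert resolve_warmup(305, 0) == 305
assert resolve_warmup(None, 300) == 300
assert resolve_warmup(0, 300) == 300  # explicit 0 is treated as "not set"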
16 changes: 13 additions & 3 deletions chemlactica/utils/callbacks.py
@@ -341,10 +341,11 @@ def on_step_end(self, args, state, control, **kwargs):


 class SFTNumericalEval(TrainerCallback):
-    def __init__(self, dataset, aim_callback) -> None:
+    def __init__(self, dataset, aim_callback, separator_token) -> None:
         super().__init__()
         self.dataset = dataset
         self.aim = aim_callback
+        self.separator_token = separator_token

     def on_evaluate(
         self,
@@ -358,11 +359,20 @@ def on_evaluate(
         super().on_evaluate(args, state, control, **kwargs)
         model.eval()
         ground_truths, gens, diffs = [], [], []
+        eos_token_id = tokenizer.encode("[/PROPERTY]")[0]
         for sample in self.dataset["validation"]:
             ground_truth = round(sample["activity"], 2)
-            prompt = f"[START_SMILES]{sample['smiles']}[END_SMILES][PROPERTY]activity "
+            prompt = (
+                f"{self.separator_token}[START_SMILES]{sample['smiles']}"
+                "[END_SMILES][PROPERTY]activity"
+            )
             prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
-            out = model.generate(prompt.input_ids, do_sample=False, max_length=100)
+            out = model.generate(
+                prompt.input_ids,
+                do_sample=False,
+                eos_token_id=eos_token_id,
+                max_new_tokens=100,
+            )
             out = tokenizer.batch_decode(out)[0]
             try:
                 gen = out[
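
Switching from max_length to max_new_tokens budgets only the generated tokens, and the custom eos_token_id stops decoding right after the closing [/PROPERTY] tag. A hedged, self-contained sketch of the same call pattern; generate_activity is an illustrative helper, and it assumes a fine-tuned model plus the ChemLactica tokenizer, where [/PROPERTY] encodes to a single token:

# Sketch: greedy generation bounded by a new-token budget and a custom end tag.
import torch

def generate_activity(model, tokenizer, smiles: str, separator_token: str = "</s>") -> str:
    eos_token_id = tokenizer.encode("[/PROPERTY]")[0]
    prompt = f"{separator_token}[START_SMILES]{smiles}[END_SMILES][PROPERTY]activity"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            inputs.input_ids,
            do_sample=False,            # greedy decoding for a deterministic eval
            eos_token_id=eos_token_id,  # stop right after the closing tag
            max_new_tokens=100,         # budget counts generated tokens only
        )
    return tokenizer.batch_decode(out)[0]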
4 changes: 2 additions & 2 deletions chemlactica/utils/dataset_utils.py
@@ -203,8 +203,8 @@ def sft_formatting_prompts_func(example):
     output_texts = []
     for i in range(len(example["smiles"])):
         text = (
-            f"[START_SMILES]{example['smiles'][i]}[END_SMILES]"
-            f"[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"
+            f"<bos>[START_SMILES]{example['smiles'][i]}[END_SMILES]"
+            "[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"
         )
         output_texts.append(text)
     return output_texts
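
For reference, a runnable sketch of this formatting function in isolation; note that the second fragment needs the f prefix for the rounded activity to be interpolated (the example data is illustrative):

# Sketch: a batched formatting function of this shape, with illustrative input data.
def format_prompts(example):
    output_texts = []
    for i in range(len(example["smiles"])):
        text = (
            f"<bos>[START_SMILES]{example['smiles'][i]}[END_SMILES]"
            f"[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"  # f prefix needed to interpolate
        )
        output_texts.append(text)
    return output_texts

print(format_prompts({"smiles": ["CCO"], "activity": [0.4567]}))
# ['<bos>[START_SMILES]CCO[END_SMILES][PROPERTY]activity 0.46[/PROPERTY]']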
11 changes: 10 additions & 1 deletion chemlactica/utils/parseargs.py
@@ -55,13 +55,22 @@ def init_parser():
     )
     parser.add_argument(
         "--learning_rate",
-        type=int,
+        type=float,
         metavar="LR",
         dest="learning_rate",
         required=False,
         default=None,
         help="learning rate",
     )
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        metavar="WA",
+        dest="warmup_steps",
+        required=False,
+        default=None,
+        help="warmup steps",
+    )
     parser.add_argument(
         "--max_steps",
         type=int,
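
With type=float the learning rate can be given in scientific notation (type=int would reject 1e-4), and the new --warmup flag feeds the warmup_steps override in train.py. A standalone sketch of just these two options:

# Sketch: the two CLI options in isolation (standalone argparse example).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--learning_rate", type=float, metavar="LR", dest="learning_rate",
                    required=False, default=None, help="learning rate")
parser.add_argument("--warmup", type=int, metavar="WA", dest="warmup_steps",
                    required=False, default=None, help="warmup steps")

args = parser.parse_args(["--learning_rate", "1e-4", "--warmup", "305"])
print(args.learning_rate, args.warmup_steps)  # 0.0001 305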
1 change: 1 addition & 0 deletions chemlactica/utils/utils.py
@@ -43,6 +43,7 @@ def get_tokenizer(tokenizer_path):
 def create_tokenizer(tokenizer_path):
     tok = AutoTokenizer.from_pretrained(tokenizer_path)
     tok.add_bos_token = False
+    tok.padding_side = "right"
     print(f"Process {os.getpid()} created a tokenizer")
     return tok

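
Right padding is the usual choice for causal-LM fine-tuning, since pad tokens land after the sequence and labels stay aligned with the prompt; left padding is mainly needed for batched generation. A small sketch of the effect, using the gpt2 tokenizer purely for illustration:

# Sketch: padding side determines where pad tokens land in a batch (gpt2 used for illustration).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer with no pad token by default
tok.pad_token = tok.eos_token
tok.padding_side = "right"

batch = tok(["short", "a somewhat longer prompt"], padding=True)
print(batch["input_ids"])  # the shorter sequence is padded on the right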
10 changes: 7 additions & 3 deletions environment.yml
@@ -1,4 +1,4 @@
-name: cl11.9_t_4.39
+name: gemma_env_new
 channels:
   - pytorch
   - nvidia
@@ -180,6 +180,8 @@ dependencies:
   - backoff==2.2.1
   - base58==2.0.1
   - bitsandbytes==0.43.0
+  - boto3==1.34.84
+  - botocore==1.34.84
   - cachetools==5.3.3
   - certifi==2024.2.2
   - cffi==1.16.0
@@ -208,6 +210,7 @@ dependencies:
   - huggingface-hub==0.22.2
   - identify==2.5.35
   - idna==3.6
+  - jmespath==1.0.1
   - joblib==1.3.2
   - kiwisolver==1.4.5
   - mako==1.3.2
@@ -239,6 +242,7 @@ dependencies:
   - requests==2.31.0
   - restrictedpython==7.1
   - rich==13.7.1
+  - s3transfer==0.10.1
   - safetensors==0.4.2
   - scikit-learn==1.4.1.post1
   - scipy==1.12.0
@@ -253,7 +257,7 @@ dependencies:
   - tokenizers==0.15.2
   - tqdm==4.66.2
   - transformers==4.39.0
-  - trl==0.8.1
+  - trl==0.8.6
   - tyro==0.7.3
   - tzdata==2024.1
   - urllib3==2.2.1
@@ -262,4 +266,4 @@ dependencies:
   - xmltodict==0.13.0
   - xxhash==3.4.1
   - yarl==1.9.4
-prefix: /home/philipp/miniforge3/envs/cl11.9_t_4.39
+prefix: /auto/home/menuab/miniforge3/envs/gemma_env_new
133 changes: 133 additions & 0 deletions local_submit_files/submit_run_HLM.py
@@ -0,0 +1,133 @@
import sys
from contextlib import contextmanager
from datetime import datetime
import submitit

use_accelerate = False
rsync_enabled = False
executor_name = "slurm" # options are ["slurm", "local"]
root_path = ""
num_gpus = 1
# model_name = "gemma"
# model_size = "2b"
model_name = "galactica"
model_size = "125m"
train_type = "sft"
train_name = "_".join([model_name, model_size, train_type])
job_name = "HLM_chem_3wu_32bs_15ep_1e4_nef20"

slurm_params = {
    "slurm_job_name": job_name,
    "timeout_min": 60 * 3,
    "nodes": 1,
    "tasks_per_node": 1,
    "gpus_per_node": num_gpus,
    "cpus_per_task": num_gpus * 8,
    "mem_gb": num_gpus * 40.0 + 20.0,
    "stderr_to_stdout": True,
}

accelerate_config = {"num_processes": num_gpus}

env_variables = {
    "TOKENIZERS_PARALLELISM": "true",
    "CUDA_VISIBLE_DEVICES": "0, 1, 2, 3, 4, 5, 6, 7",
    # "CUDA_VISIBLE_DEVICES": "3",
}

cli_arguments = {
    "train_type": train_type,
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/facebook/"\
    # "galactica-125m/9954e52e400b43d18d3a40f6/checkpoint-20480",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/facebook/"\
    # "galactica-125m/1f289ff103034364bd27e1c3/checkpoint-18000/",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/google/"\
    # "gemma-2b/d6e6a76e91814ad68d5fa264/checkpoint-11000",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/h100/"\
    # "google/gemma-2b/0717d445bcf44e31b2887892/checkpoint-12000",
    "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/h100/"
    "google/gemma-2b/0717d445bcf44e31b2887892/checkpoint-18000",
    "model_config": train_name,
    "dir_data_types": "computed",
    "training_data_dirs": "/auto/home/menuab/code/sft_data/ADME_HLM",
    "valid_data_dir": "",
    # "max_steps":120000,
    "num_train_epochs": 15,
    "learning_rate": 0.0001,
    "warmup": 180,
    "eval_steps": 60,
    "save_steps": 2440,
    "train_batch_size": 32,
    "valid_batch_size": 32,
    "dataloader_num_workers": 1,
    "experiment_name": job_name,
    "checkpoints_root_dir": "/nfs/dgx/raid/chem/checkpoints/",
    "flash_attn": False,
    "track": True,
    "track_dir": "/nfs/dgx/raid/chem/aim/",
    # "profile":,
    # "profile_dir":,
    # "gradient_accumulation_steps":,
    # "gradient_checkpointing":,
    # "evaluate_only":,
    # "check_reproducability":,
}


def get_command(use_accelerate):
    python_executable = sys.executable
    command = [python_executable]
    if use_accelerate:
        accelerate_path = "chemlactica/config/accelerate_config.yaml"
        command.extend(
            f"-m accelerate.commands.launch --config_file {accelerate_path}".split(" ")
        )
        for k, v in accelerate_config.items():
            command.append(f"--{k}={v}")
    command.append("chemlactica/train.py")
    for x, y in cli_arguments.items():
        if isinstance(y, bool):
            if y:
                command.append(f"--{x}")
        else:
            command.append(f"--{x}={y}")

    print(f'command being executed: {" ".join(command)}')
    return command


@contextmanager
def conditional_context_manager(rsync_enabled, repo_path):
    if rsync_enabled:
        with submitit.helpers.RsyncSnapshot(repo_path) as cm:
            yield cm
    else:
        yield None


def get_executor(executor_name, logs_path):
    if executor_name == "slurm":
        executor = submitit.AutoExecutor(folder=logs_path)
    elif executor_name == "local":
        executor = submitit.local.local.LocalExecutor(folder=logs_path)
    return executor


if __name__ == "__main__":
    logs_path = "submitit_logs/%j"
    logs_path = "/nfs/dgx/raid/chem/" + logs_path if rsync_enabled else logs_path
    repo_path = (
        "/nfs/dgx/raid/chem/rsyncsnapshots/"
        f"{train_name}-{datetime.now().strftime('%Y-%m-%d-%H:%M')}"
    )

    with conditional_context_manager(rsync_enabled, repo_path):
        command = get_command(use_accelerate)
        executor = get_executor(executor_name, logs_path)
        executor.update_parameters(**slurm_params)
        print("train_name: ", train_name)
        print("logs_path: ", logs_path)
        print("repo path: ", repo_path)
        function = submitit.helpers.CommandFunction(command, env=env_variables)
        job = executor.submit(function)
        # print(job.result())