YerevaNN · MenuaB · May 2, 2024
diff --git a/chemlactica/config/config_yamls/gemma_2b_pretrain_config.yaml b/chemlactica/config/config_yamls/gemma_2b_pretrain_config.yaml
@@ -2,7 +2,7 @@ train_config:
   adam_beta1: 0.9
   adam_beta2: 0.95
   batch_size: 500000
-  dropout_prob: 0.1
+  dropout_prob: 0
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
@@ -14,8 +14,8 @@ train_config:
   fp16: false
   tf32: true
   evaluation_strategy: "no"
-  save_total_limit: 10
-  grad_accumulation_scheduler: true
+  save_total_limit: 4
+  grad_accumulation_scheduler: false
   dynamic_grad_accumulation: false
   grad_accumulation_patience: 4000
   grad_accumulation_max: 256
@@ -27,4 +27,6 @@ model_config:
   block_size: 2048
   vocab_size: 256000
   separator_token: <bos>
-  tokenizer_path: "./chemlactica/tokenizer/GemmaTokenizer"
+  # tokenizer_path: "./chemlactica/tokenizer/GemmaTokenizer"
+  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
+  # tokenizer_path: "google/gemma-2b"
diff --git a/chemlactica/jsonl_dataset.py b/chemlactica/jsonl_dataset.py
@@ -39,6 +39,7 @@ def samples_generator(
 ):
     file_states = setup_generator(shared_jsonl_files, files)
 
+    # TODO: there should be a more elegant way to do this without per-line conditions
     returned = True
     while returned:
         returned = False
@@ -49,14 +50,14 @@ def samples_generator(
                 counter = 0
                 while line:
                     state["position"] = f.tell()
-                    if should_yield_on_current_rank(
-                        counter,
-                        distributed_state.num_processes,
-                        distributed_state.process_index,
-                    ):
-                        returned = True
-                        ret = format_sample(line)
-                        yield ret
+                    # if should_yield_on_current_rank(
+                    #     counter,
+                    #     distributed_state.num_processes,
+                    #     distributed_state.process_index,
+                    # ):
+                    #     returned = True
+                    ret = format_sample(line)
+                    yield ret
                     counter = counter + 1
                     shared_jsonl_files[file] = state
                     line = f.readline()
diff --git a/chemlactica/train.py b/chemlactica/train.py
@@ -243,6 +243,7 @@ def train(
             num_train_epochs=num_train_epochs,
             eval_steps=eval_steps,
             save_steps=save_steps,
+            dispatch_batches=False,
             dataloader_drop_last=True,
             dataloader_pin_memory=True,
             # torch_compile=True,

diff --git a/chemlactica/utils/dataset_utils.py b/chemlactica/utils/dataset_utils.py
@@ -168,32 +168,32 @@ def process_dataset(
                 num_proc=4,
             )
         else:
-            with state.main_process_first():
-                dataset = dataset.map(
-                    process_str,
-                    fn_kwargs={
-                        "random_number_generator": rng,
-                        "model_config": model_config,
-                    },
-                )
-            with state.main_process_first():
-                tokenized_datasets = dataset.map(
-                    tokenize_function,
-                    batched=True,
-                    fn_kwargs={"model_config": model_config, "tokenizer": tokenizer},
-                    batch_size=process_batch_sizes[0],
-                    remove_columns=["text"],
-                )
-            with state.main_process_first():
-                lm_datasets = tokenized_datasets.map(
-                    group_texts,
-                    batched=True,
-                    batch_size=process_batch_sizes[1],
-                    fn_kwargs={
-                        "model_config": model_config,
-                        "eos_token_id": eos_token_id,
-                    },
-                )
+            # with state.main_process_first():
+            dataset = dataset.map(
+                process_str,
+                fn_kwargs={
+                    "random_number_generator": rng,
+                    "model_config": model_config,
+                },
+            )
+            # with state.main_process_first():
+            tokenized_datasets = dataset.map(
+                tokenize_function,
+                batched=True,
+                fn_kwargs={"model_config": model_config, "tokenizer": tokenizer},
+                batch_size=process_batch_sizes[0],
+                remove_columns=["text"],
+            )
+            # with state.main_process_first():
+            lm_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+                batch_size=process_batch_sizes[1],
+                fn_kwargs={
+                    "model_config": model_config,
+                    "eos_token_id": eos_token_id,
+                },
+            )
 
     return lm_datasets
 

diff --git a/chemlactica/utils/text_format_utils.py b/chemlactica/utils/text_format_utils.py
@@ -87,10 +87,12 @@ def delete_empty_tags(compound_json):
 
 def generate_formatted_string(compound_json, rng, model_config):
     key_value_pairs = []
-    if compound_json.get("SMILES") and rng.random() < 0.5:
-        key = "SMILES"
-        key_value_pairs.append(format_key_value(key, compound_json[key], rng))
-        del compound_json["SMILES"]
+    key = "SMILES"
+    value = compound_json.get(key, "")
+    if rng.integers(0, 2) == 0:  # high is exclusive: yields 0 or 1, a 50% coin flip
+        if value:
+            key_value_pairs.append(format_key_value(key, value, rng))
+            del compound_json[key]
     keys = list(compound_json.keys())
     rng.shuffle(keys)
 
@@ -126,10 +128,12 @@ def format_key_value(key, value, rng):
             if SPECIAL_TAGS[key].get("type") is float:
                 value = "{:.2f}".format(float(value))
                 assert len(value.split(".")[-1]) == 2
+            start = SPECIAL_TAGS[key]["start"]
+            end = SPECIAL_TAGS[key]["end"]
         except Exception as e:
             print(e)
-        start = SPECIAL_TAGS[key]["start"]
-        end = SPECIAL_TAGS[key]["end"]
+            print("Failed to parse: ", key, value)
+            start = value = end = ""
         return f"{start}{value}{end}"
 
     return formatted_string

diff --git a/chemlactica/utils/utils.py b/chemlactica/utils/utils.py
@@ -42,12 +42,7 @@ def get_tokenizer(tokenizer_path):
 
 def create_tokenizer(tokenizer_path):
     tok = AutoTokenizer.from_pretrained(tokenizer_path)
-    tok.bos_token = "<s>"
-    tok.bos_token_id = 0
-    tok.pad_token = "<pad>"
-    tok.pad_token_id = 1
-    tok.eos_token = "</s>"
-    tok.eos_token_id = 2
+    tok.add_bos_token = False
     print(f"Process {os.getpid()} created a tokenizer")
     return tok