Skip to content

Commit

Permalink
Merge branch 'main' into train_errors
Browse files Browse the repository at this point in the history
  • Loading branch information
philippguevorguian committed Nov 15, 2023
2 parents 56e29a6 + aeadf1d commit 9578b98
Show file tree
Hide file tree
Showing 23 changed files with 428 additions and 188 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.11'
python-version: '3.10'

- name: Set up Conda
uses: conda-incubator/setup-miniconda@v2
Expand All @@ -28,4 +28,4 @@ jobs:

- name: Run unittests
shell: bash -l {0}
run: python3 -m unittest src/tests/precommit_test.py
run: python3 confirm_test.py --confirm
86 changes: 86 additions & 0 deletions confirm_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import subprocess
import argparse
import unittest
import yaml
import os


def write_test_status(git_commit_hash: str, status: str="FAIL", file_name: str="test_status"):
    """Record the test outcome for a commit in ``<file_name>.yaml``.

    Overwrites the file with a single-entry mapping of commit hash -> status.
    """
    record = {git_commit_hash: status}
    with open(f"{file_name}.yaml", "w") as out_file:
        yaml.dump(record, out_file)


def read_test_status(git_commit_hash: str, file_name: str="test_status"):
    """Return the status recorded for ``git_commit_hash`` in ``<file_name>.yaml``.

    Returns None when the commit has no recorded entry.
    """
    with open(f"{file_name}.yaml", "r") as in_file:
        records = yaml.full_load(in_file)
    return records.get(git_commit_hash)


if __name__ == "__main__":
    # CLI with two modes:
    #   --run:     run the unittest suite on the selected GPUs and record
    #              PASS/FAIL for the current HEAD commit.
    #   --confirm: read the recorded status of the previous commit (HEAD~1)
    #              and fail loudly unless it passed.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run",
        action="store_true",
        dest="run",
        help="whether or not run tests",
    )
    parser.add_argument(
        "--no_run",
        action="store_false",
        dest="run",
        help="whether or not run tests",
    )
    parser.add_argument(
        "--gpus",
        type=str,
        dest="gpus",
        help="comma separated string of gpu indices to use for testing "
             "(please choose at least 2 for proper testing, default is '0, 1').",
        required=False,
        default="0, 1",
    )
    parser.set_defaults(run=False)
    parser.add_argument(
        "--confirm",
        action="store_true",
        dest="confirm",
        help="whether or not confirm already run tests",
    )
    parser.add_argument(
        "--no_confirm",
        action="store_false",
        dest="confirm",
        help="whether or not confirm already run tests",
    )
    parser.set_defaults(confirm=False)
    args = parser.parse_args()
    run = args.run
    confirm = args.confirm
    gpus = args.gpus
    if run:
        git_commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
        # Explicit check instead of `assert`, which is stripped under -O.
        if not git_commit_hash:
            raise RuntimeError("Could not resolve git commit hash for HEAD.")
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus
        print(f"NOTE: Using GPU(s) '{gpus}' for testing.")
        loader = unittest.TestLoader()
        tests = loader.discover("tests", pattern="test_*.py")
        testRunner = unittest.runner.TextTestRunner(verbosity=2)
        test_results = testRunner.run(tests)
        # BUG FIX: `wasSuccessful` is a method; the original referenced it
        # without calling it, so the bound-method object was always truthy and
        # that term of the condition was dead. wasSuccessful() already covers
        # errors/failures (and unexpected successes), so it replaces the
        # manual len() checks entirely.
        if test_results.wasSuccessful():
            status = "PASS"
        else:
            status = "FAIL"
        write_test_status(git_commit_hash, status=status)
    elif confirm:
        # CI confirms the commit *before* HEAD: the status file is committed
        # on top of the commit that was tested.
        git_commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD~1']).decode().strip()
        if not git_commit_hash:
            raise RuntimeError("Could not resolve git commit hash for HEAD~1.")
        status = read_test_status(git_commit_hash)
        if status == "FAIL":
            raise Exception(f"Commit '{git_commit_hash}' failed.")
        elif status == "PASS":
            print(f"Commit '{git_commit_hash}' passed.")
        else:
            raise Exception(f"Commit '{git_commit_hash}' has an unexpected status '{status}'.")
    else:
        raise Exception("Please pass the proper option in command line.")
11 changes: 9 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ dependencies:
- cuda-nvrtc=11.7.99=0
- cuda-nvtx=11.7.91=0
- cuda-runtime=11.7.1=0
- filelock=3.9.0=py310h06a4308_0
- gmp=6.2.1=h295c915_3
- gmpy2=2.1.2=py310heeb90bb_0
- intel-openmp=2023.1.0=hdb19cb5_46305
Expand Down Expand Up @@ -77,31 +76,38 @@ dependencies:
- cachetools==5.3.1
- certifi==2023.7.22
- cffi==1.16.0
- cfgv==3.4.0
- charset-normalizer==3.3.0
- click==8.1.7
- cryptography==41.0.4
- datasets==2.14.4
- dill==0.3.7
- distlib==0.3.7
- einops==0.7.0
- exceptiongroup==1.1.3
- fastapi==0.103.2
- filelock==3.13.1
- flash-attn==2.3.2
- frozenlist==1.4.0
- fsspec==2023.9.2
- greenlet==3.0.0
- grpcio==1.59.0
- h11==0.14.0
- huggingface-hub==0.17.3
- identify==2.5.31
- idna==3.4
- mako==1.2.4
- monotonic==1.6
- multidict==6.0.4
- multiprocess==0.70.15
- ninja==1.11.1.1
- nodeenv==1.8.0
- numpy==1.26.0
- packaging==23.2
- pandas==2.1.1
- pillow==10.0.1
- platformdirs==3.11.0
- pre-commit==3.5.0
- protobuf==4.24.4
- psutil==5.9.5
- py3nvml==0.2.7
Expand All @@ -127,7 +133,8 @@ dependencies:
- tzdata==2023.3
- urllib3==2.0.6
- uvicorn==0.23.2
- virtualenv==20.24.6
- xmltodict==0.13.0
- xxhash==3.4.1
- yarl==1.9.2
prefix: /home/tigranfahradyan/miniconda3/envs/cl11.7
prefix: /home/philipp/miniconda3/envs/cl11.7
Empty file added src/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions src/config/create_train_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
for key in model_train_configs.keys():
model_train_configs[key]["max_learning_rate"] *= 0.08

model_train_configs["125m"]["max_learning_rate"] = 5e-4

model_train_configs["1.3b"]["warmup_steps"] = 2000
model_train_configs["1.3b"]["max_learning_rate"] = 1.0e-5
model_train_configs["1.3b"]["global_gradient_norm"] = 0.1
2 changes: 1 addition & 1 deletion src/config/test_configs/fsdp_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
# main_process_port: 30000
main_process_port: 30001
6 changes: 6 additions & 0 deletions src/custom_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ def _save_checkpoint(self, model, trial, metrics=None):
print("**disk is full didn't save**")

def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
"""
    This code is added because we had a failure when resuming training:
    the model was loaded with FSDP even when it was not FSDP-wrapped.
    Future versions of transformers handle this by adding an extra check,
    but version 4.31.0 does not, so this manual check works around the problem.
"""
if type(self.model) != FSDP: return
return super()._load_from_checkpoint(resume_from_checkpoint, model)

Expand Down
6 changes: 3 additions & 3 deletions src/jsonl_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from multiprocessing import Manager, Pool
from typing import List
import torch
from io import StringIO
Expand All @@ -10,13 +9,15 @@ def samples_generator(
chunk_size=25000, return_line_info=False
):
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
print("sharded_jsonl_files", shared_jsonl_files)
print(f"TOK_PAR: {os.environ['TOKENIZERS_PARALLELISM']}")
print("process id", os.getpid(), files)

file_states = {file: {"position": 0, "line_number": 0} for file in files}
file_states = {f: {"position": 0, "line_number": 0} for f in files}
for file in file_states.keys():
if shared_jsonl_files.get(file):
jsonl_state = shared_jsonl_files[file]
file_states[file] = jsonl_state
print(f"loaded {file}: {jsonl_state['position']}")

returned = True
Expand All @@ -33,7 +34,6 @@ def samples_generator(
batch = [line.rstrip("\n") for line in batch]
state["position"] = f.tell()
state["line_number"] += len(batch)

for i, sample in enumerate(batch, start=1):
returned = True
ret = {"text": sample}
Expand Down
143 changes: 0 additions & 143 deletions src/tests/fsdp_precommit_test.py

This file was deleted.

25 changes: 0 additions & 25 deletions src/tests/test_utils.py

This file was deleted.

Loading

0 comments on commit 9578b98

Please sign in to comment.