Skip to content

Commit

Permalink
Merge branch 'main' into train_errors
Browse files Browse the repository at this point in the history
  • Loading branch information
philippguevorguian committed Nov 15, 2023
2 parents 56e29a6 + aeadf1d commit 9578b98
Show file tree
Hide file tree
Showing 23 changed files with 428 additions and 188 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.11'
python-version: '3.10'

- name: Set up Conda
uses: conda-incubator/setup-miniconda@v2
Expand All @@ -28,4 +28,4 @@ jobs:

- name: Run unittests
shell: bash -l {0}
run: python3 -m unittest src/tests/precommit_test.py
run: python3 confirm_test.py --confirm
86 changes: 86 additions & 0 deletions confirm_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import subprocess
import argparse
import unittest
import yaml
import os


def write_test_status(git_commit_hash: str, status: str="FAIL", file_name: str="test_status"):
    """Record the test outcome for a commit in ``<file_name>.yaml``.

    Overwrites the file with a single-entry mapping of commit hash -> status.
    """
    record = {git_commit_hash: status}
    with open(f"{file_name}.yaml", "w") as out_file:
        yaml.dump(record, out_file)


def read_test_status(git_commit_hash: str, file_name: str="test_status"):
    """Return the status recorded for ``git_commit_hash`` in ``<file_name>.yaml``.

    Returns None when the commit has no recorded entry.
    """
    with open(f"{file_name}.yaml", "r") as in_file:
        records = yaml.full_load(in_file)
    return records.get(git_commit_hash)


if __name__ == "__main__":
    # CLI with two modes:
    #   --run:     run the unittest suite on the selected GPUs and record
    #              PASS/FAIL for the current HEAD commit.
    #   --confirm: read the recorded status of the previous commit (HEAD~1)
    #              and fail loudly unless it passed.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run",
        action="store_true",
        dest="run",
        help="whether or not run tests",
    )
    parser.add_argument(
        "--no_run",
        action="store_false",
        dest="run",
        help="whether or not run tests",
    )
    parser.add_argument(
        "--gpus",
        type=str,
        dest="gpus",
        help="comma separated string of gpu indices to use for testing "
             "(please choose at least 2 for proper testing, default is '0, 1').",
        required=False,
        default="0, 1",
    )
    parser.set_defaults(run=False)
    parser.add_argument(
        "--confirm",
        action="store_true",
        dest="confirm",
        help="whether or not confirm already run tests",
    )
    parser.add_argument(
        "--no_confirm",
        action="store_false",
        dest="confirm",
        help="whether or not confirm already run tests",
    )
    parser.set_defaults(confirm=False)
    args = parser.parse_args()
    run = args.run
    confirm = args.confirm
    gpus = args.gpus
    if run:
        git_commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
        # Explicit check instead of `assert`, which is stripped under -O.
        if not git_commit_hash:
            raise RuntimeError("Could not resolve git commit hash for HEAD.")
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus
        print(f"NOTE: Using GPU(s) '{gpus}' for testing.")
        loader = unittest.TestLoader()
        tests = loader.discover("tests", pattern="test_*.py")
        testRunner = unittest.runner.TextTestRunner(verbosity=2)
        test_results = testRunner.run(tests)
        # BUG FIX: `wasSuccessful` is a method; the original referenced it
        # without calling it, so the bound-method object was always truthy and
        # that term of the condition was dead. wasSuccessful() already covers
        # errors/failures (and unexpected successes), so it replaces the
        # manual len() checks entirely.
        if test_results.wasSuccessful():
            status = "PASS"
        else:
            status = "FAIL"
        write_test_status(git_commit_hash, status=status)
    elif confirm:
        # CI confirms the commit *before* HEAD: the status file is committed
        # on top of the commit that was tested.
        git_commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD~1']).decode().strip()
        if not git_commit_hash:
            raise RuntimeError("Could not resolve git commit hash for HEAD~1.")
        status = read_test_status(git_commit_hash)
        if status == "FAIL":
            raise Exception(f"Commit '{git_commit_hash}' failed.")
        elif status == "PASS":
            print(f"Commit '{git_commit_hash}' passed.")
        else:
            raise Exception(f"Commit '{git_commit_hash}' has an unexpected status '{status}'.")
    else:
        raise Exception("Please pass the proper option in command line.")
11 changes: 9 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ dependencies:
- cuda-nvrtc=11.7.99=0
- cuda-nvtx=11.7.91=0
- cuda-runtime=11.7.1=0
- filelock=3.9.0=py310h06a4308_0
- gmp=6.2.1=h295c915_3
- gmpy2=2.1.2=py310heeb90bb_0
- intel-openmp=2023.1.0=hdb19cb5_46305
Expand Down Expand Up @@ -77,31 +76,38 @@ dependencies:
- cachetools==5.3.1
- certifi==2023.7.22
- cffi==1.16.0
- cfgv==3.4.0
- charset-normalizer==3.3.0
- click==8.1.7
- cryptography==41.0.4
- datasets==2.14.4
- dill==0.3.7
- distlib==0.3.7
- einops==0.7.0
- exceptiongroup==1.1.3
- fastapi==0.103.2
- filelock==3.13.1
- flash-attn==2.3.2
- frozenlist==1.4.0
- fsspec==2023.9.2
- greenlet==3.0.0
- grpcio==1.59.0
- h11==0.14.0
- huggingface-hub==0.17.3
- identify==2.5.31
- idna==3.4
- mako==1.2.4
- monotonic==1.6
- multidict==6.0.4
- multiprocess==0.70.15
- ninja==1.11.1.1
- nodeenv==1.8.0
- numpy==1.26.0
- packaging==23.2
- pandas==2.1.1
- pillow==10.0.1
- platformdirs==3.11.0
- pre-commit==3.5.0
- protobuf==4.24.4
- psutil==5.9.5
- py3nvml==0.2.7
Expand All @@ -127,7 +133,8 @@ dependencies:
- tzdata==2023.3
- urllib3==2.0.6
- uvicorn==0.23.2
- virtualenv==20.24.6
- xmltodict==0.13.0
- xxhash==3.4.1
- yarl==1.9.2
prefix: /home/tigranfahradyan/miniconda3/envs/cl11.7
prefix: /home/philipp/miniconda3/envs/cl11.7
Empty file added src/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions src/config/create_train_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
for key in model_train_configs.keys():
model_train_configs[key]["max_learning_rate"] *= 0.08

model_train_configs["125m"]["max_learning_rate"] = 5e-4

model_train_configs["1.3b"]["warmup_steps"] = 2000
model_train_configs["1.3b"]["max_learning_rate"] = 1.0e-5
model_train_configs["1.3b"]["global_gradient_norm"] = 0.1
2 changes: 1 addition & 1 deletion src/config/test_configs/fsdp_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
# main_process_port: 30000
main_process_port: 30001
6 changes: 6 additions & 0 deletions src/custom_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ def _save_checkpoint(self, model, trial, metrics=None):
print("**disk is full didn't save**")

def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
"""
    This code is added because we had a failure when resuming training:
    the model was loaded with FSDP even when it was not FSDP-wrapped.
    Future versions of transformers handle this by adding an extra check,
    but version 4.31.0 does not, so this manual check works around the problem.
"""
if type(self.model) != FSDP: return
return super()._load_from_checkpoint(resume_from_checkpoint, model)

Expand Down
6 changes: 3 additions & 3 deletions src/jsonl_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from multiprocessing import Manager, Pool
from typing import List
import torch
from io import StringIO
Expand All @@ -10,13 +9,15 @@ def samples_generator(
chunk_size=25000, return_line_info=False
):
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
print("sharded_jsonl_files", shared_jsonl_files)
print(f"TOK_PAR: {os.environ['TOKENIZERS_PARALLELISM']}")
print("process id", os.getpid(), files)

file_states = {file: {"position": 0, "line_number": 0} for file in files}
file_states = {f: {"position": 0, "line_number": 0} for f in files}
for file in file_states.keys():
if shared_jsonl_files.get(file):
jsonl_state = shared_jsonl_files[file]
file_states[file] = jsonl_state
print(f"loaded {file}: {jsonl_state['position']}")

returned = True
Expand All @@ -33,7 +34,6 @@ def samples_generator(
batch = [line.rstrip("\n") for line in batch]
state["position"] = f.tell()
state["line_number"] += len(batch)

for i, sample in enumerate(batch, start=1):
returned = True
ret = {"text": sample}
Expand Down
143 changes: 0 additions & 143 deletions src/tests/fsdp_precommit_test.py

This file was deleted.

25 changes: 0 additions & 25 deletions src/tests/test_utils.py

This file was deleted.

Loading

0 comments on commit 9578b98

Please sign in to comment.