Merge pull request #31 from YerevaNN/minor_changes_for_run
Minor changes for run
MenuaB authored May 23, 2024
2 parents 6e6e81a + 20e4a80 commit a388d5f
Showing 23 changed files with 2,430 additions and 734 deletions.
@@ -6,7 +6,7 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
+  max_learning_rate: 1.0e-4
   warmup_steps: 0
   weight_decay: 0.1
   bf16: true
@@ -27,4 +27,5 @@ model_config:
   block_size: 2048
   vocab_size: 50000
   separator_token: </s>
+  separator_token_id: 2
   tokenizer_path: "./chemlactica/tokenizer/ChemLacticaTokenizer66"
6 changes: 3 additions & 3 deletions chemlactica/config/config_yamls/gemma_2b_sft_config.yaml
@@ -6,15 +6,15 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
-  warmup_steps: 0
+  max_learning_rate: 1.0e-4
+  warmup_steps: 305
   weight_decay: 0.1
   bf16: true
   bf16_full_eval: true
   fp16: false
   tf32: true
   evaluation_strategy: "steps"
-  save_total_limit: 4
+  save_total_limit: 8
   grad_accumulation_scheduler: false
   dynamic_grad_accumulation: false
   grad_accumulation_patience: 4000
4 changes: 2 additions & 2 deletions chemlactica/config/default_train_config.py
@@ -42,5 +42,5 @@ class TrainConfig:
 @dataclass
 class SFTTrainConfig:
     packing: bool = False
-    max_seq_length: int = 512
-    neftune_noise_alpha: int = 0
+    max_seq_length: int = 64
+    neftune_noise_alpha: int = 10
2 changes: 1 addition & 1 deletion chemlactica/get_trainer.py
@@ -25,7 +25,7 @@ def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_
     elif train_type == "sft":
         sft_config = SFTTrainConfig()
         tokenizer = get_tokenizer(training_args.tokenizer_path)
-        response_template = "[PROPERTY]activity "
+        response_template = tokenizer.encode("[PROPERTY]activity")
         collator = DataCollatorForCompletionOnlyLM(
             response_template, tokenizer=tokenizer
         )
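Passing the response template to TRL's DataCollatorForCompletionOnlyLM as token ids rather than as a raw string sidesteps cases where the template string tokenizes differently inside a full training example. A minimal sketch of the idea, assuming a TRL 0.8.x API and loading the tokenizer directly from the path in the config (the project normally goes through its own get_tokenizer helper):

from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

# Illustrative direct load; the training code uses get_tokenizer() instead.
tokenizer = AutoTokenizer.from_pretrained(
    "./chemlactica/tokenizer/ChemLacticaTokenizer66"
)

# Encode without special tokens so the id sequence matches what appears
# inside a tokenized training example.
response_template_ids = tokenizer.encode(
    "[PROPERTY]activity", add_special_tokens=False
)

# The collator finds these ids in each example and masks everything up to
# and including them, so the loss covers only the generated activity value.
collator = DataCollatorForCompletionOnlyLM(
    response_template_ids, tokenizer=tokenizer
)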
6 changes: 3 additions & 3 deletions chemlactica/train.py
@@ -66,6 +66,7 @@ def train(
     dir_data_types,
     valid_data_dir,
     learning_rate,
+    warmup_steps,
     scheduler_max_steps,
     eval_steps,
     save_steps,
@@ -115,7 +116,6 @@ def train(
         gradient_checkpointing=gradient_checkpointing,
         auth_token=auth_token,
     )
-
     # special_tokens = get_tokenizer_special_tokens(model_config.tokenizer_path)
     # print(f"{len(special_tokens)} {special_tokens} additional special tokens.")
     tokenizer_length = get_tokenizer_length(model_config)
@@ -237,7 +237,7 @@ def train(
         weight_decay=train_config.weight_decay,
         adam_beta1=train_config.adam_beta1,
         adam_beta2=train_config.adam_beta2,
-        warmup_steps=train_config.warmup_steps,
+        warmup_steps=warmup_steps if warmup_steps else train_config.warmup_steps,
         max_grad_norm=train_config.global_gradient_norm,
         evaluation_strategy=train_config.evaluation_strategy,
         max_steps=scheduler_max_steps,
@@ -285,7 +285,7 @@ def train(
     )
     if train_type == "sft":
         trainer_callback_dict["SFT numerical evaluation"] = SFTNumericalEval(
-            dataset, aim_callback
+            dataset, aim_callback, model_config.separator_token
         )
     elif train_type == "pretrain":
         if train_config.grad_accumulation_scheduler:
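One subtlety in the new warmup_steps plumbing: the fallback `warmup_steps if warmup_steps else train_config.warmup_steps` is truthiness-based, so an explicit CLI value of 0 falls back to the config default. A tiny illustration of the difference, with hypothetical values:

config_warmup = 305  # e.g. the value from gemma_2b_sft_config.yaml
cli_warmup = 0       # user explicitly requests no warmup

# Truthiness check: 0 is falsy, so the config default wins.
assert (cli_warmup if cli_warmup else config_warmup) == 305

# An `is not None` check would treat 0 as a deliberate override.
assert (cli_warmup if cli_warmup is not None else config_warmup) == 0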
16 changes: 13 additions & 3 deletions chemlactica/utils/callbacks.py
@@ -341,10 +341,11 @@ def on_step_end(self, args, state, control, **kwargs):
 
 
 class SFTNumericalEval(TrainerCallback):
-    def __init__(self, dataset, aim_callback) -> None:
+    def __init__(self, dataset, aim_callback, separator_token) -> None:
         super().__init__()
         self.dataset = dataset
         self.aim = aim_callback
+        self.separator_token = separator_token
 
     def on_evaluate(
         self,
@@ -358,11 +359,20 @@ def on_evaluate(
         super().on_evaluate(args, state, control, **kwargs)
         model.eval()
         ground_truths, gens, diffs = [], [], []
+        eos_token_id = tokenizer.encode("[/PROPERTY]")[0]
         for sample in self.dataset["validation"]:
             ground_truth = round(sample["activity"], 2)
-            prompt = f"[START_SMILES]{sample['smiles']}[END_SMILES][PROPERTY]activity "
+            prompt = (
+                f"{self.separator_token}[START_SMILES]{sample['smiles']}"
+                "[END_SMILES][PROPERTY]activity"
+            )
             prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
-            out = model.generate(prompt.input_ids, do_sample=False, max_length=100)
+            out = model.generate(
+                prompt.input_ids,
+                do_sample=False,
+                eos_token_id=eos_token_id,
+                max_new_tokens=100,
+            )
             out = tokenizer.batch_decode(out)[0]
             try:
                 gen = out[
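The new eos_token_id = tokenizer.encode("[/PROPERTY]")[0] relies on [/PROPERTY] being a single token in the vocabulary, and on no BOS being prepended (which matches add_bos_token = False in create_tokenizer further down). A small sanity-check sketch under those assumptions:

from transformers import AutoTokenizer

# Illustrative direct load; the training code uses its own get_tokenizer helper.
tok = AutoTokenizer.from_pretrained("./chemlactica/tokenizer/ChemLacticaTokenizer66")
tok.add_bos_token = False

ids = tok.encode("[/PROPERTY]")
# If the tag is a dedicated special token this is a single id; otherwise [0]
# silently picks the first sub-token and generation may stop too early.
assert len(ids) == 1, f"[/PROPERTY] splits into {len(ids)} tokens: {ids}"
eos_token_id = ids[0]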
4 changes: 2 additions & 2 deletions chemlactica/utils/dataset_utils.py
@@ -203,8 +203,8 @@ def sft_formatting_prompts_func(example):
     output_texts = []
     for i in range(len(example["smiles"])):
         text = (
-            f"[START_SMILES]{example['smiles'][i]}[END_SMILES]"
-            f"[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"
+            f"<bos>[START_SMILES]{example['smiles'][i]}[END_SMILES]"
+            "[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"
         )
         output_texts.append(text)
     return output_texts
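Note that in the new version only the first string segment keeps its f prefix; without an f on the second segment, {round(example['activity'][i], 2)} is emitted literally rather than interpolated, so no numeric label follows "activity". A sketch of the same function with both segments as f-strings (illustrative, not the committed code):

def sft_formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example["smiles"])):
        # Both segments need the f prefix for the braces to interpolate.
        text = (
            f"<bos>[START_SMILES]{example['smiles'][i]}[END_SMILES]"
            f"[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"
        )
        output_texts.append(text)
    return output_texts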
11 changes: 10 additions & 1 deletion chemlactica/utils/parseargs.py
@@ -55,13 +55,22 @@ def init_parser():
     )
     parser.add_argument(
         "--learning_rate",
-        type=int,
+        type=float,
         metavar="LR",
         dest="learning_rate",
         required=False,
         default=None,
         help="learning rate",
     )
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        metavar="WA",
+        dest="warmup_steps",
+        required=False,
+        default=None,
+        help="warmup steps",
+    )
     parser.add_argument(
         "--max_steps",
         type=int,
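Changing --learning_rate from type=int to type=float lets values such as 1.0e-4 (used in the updated configs) parse correctly, and the new --warmup flag is stored as args.warmup_steps. A minimal standalone illustration of the parsing behaviour, using a throwaway parser rather than the project's init_parser:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--learning_rate", type=float, dest="learning_rate", default=None)
parser.add_argument("--warmup", type=int, dest="warmup_steps", default=None)

args = parser.parse_args(["--learning_rate", "1.0e-4", "--warmup", "305"])
# type=int would reject "1.0e-4"; type=float accepts scientific notation.
assert args.learning_rate == 1.0e-4
assert args.warmup_steps == 305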
1 change: 1 addition & 0 deletions chemlactica/utils/utils.py
@@ -43,6 +43,7 @@ def get_tokenizer(tokenizer_path):
 def create_tokenizer(tokenizer_path):
     tok = AutoTokenizer.from_pretrained(tokenizer_path)
     tok.add_bos_token = False
+    tok.padding_side = "right"
     print(f"Process {os.getpid()} created a tokenizer")
     return tok
 
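Setting padding_side = "right" keeps padding at the end of each sequence, the usual choice for causal-LM fine-tuning with loss masking (left padding is normally reserved for batched generation). A short illustrative sketch, with the assumption that the tokenizer might ship without an explicit pad token:

from transformers import AutoTokenizer

# Illustrative only; the repo sets these attributes inside create_tokenizer().
tok = AutoTokenizer.from_pretrained("./chemlactica/tokenizer/ChemLacticaTokenizer66")
tok.add_bos_token = False
tok.padding_side = "right"

# Guard for tokenizers without a pad token (an assumption of this sketch,
# not something the commit does): reuse EOS so batched padding works.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

batch = tok(
    ["[START_SMILES]C[END_SMILES]", "[START_SMILES]CCO[END_SMILES]"],
    padding=True,
    return_tensors="pt",
)
# With padding_side="right", the shorter sequence is padded at the end.
print(batch["input_ids"])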
10 changes: 7 additions & 3 deletions environment.yml
@@ -1,4 +1,4 @@
-name: cl11.9_t_4.39
+name: gemma_env_new
 channels:
   - pytorch
   - nvidia
@@ -180,6 +180,8 @@ dependencies:
   - backoff==2.2.1
   - base58==2.0.1
   - bitsandbytes==0.43.0
+  - boto3==1.34.84
+  - botocore==1.34.84
   - cachetools==5.3.3
   - certifi==2024.2.2
   - cffi==1.16.0
@@ -208,6 +210,7 @@ dependencies:
   - huggingface-hub==0.22.2
   - identify==2.5.35
   - idna==3.6
+  - jmespath==1.0.1
   - joblib==1.3.2
   - kiwisolver==1.4.5
   - mako==1.3.2
@@ -239,6 +242,7 @@ dependencies:
   - requests==2.31.0
   - restrictedpython==7.1
   - rich==13.7.1
+  - s3transfer==0.10.1
   - safetensors==0.4.2
   - scikit-learn==1.4.1.post1
   - scipy==1.12.0
@@ -253,7 +257,7 @@ dependencies:
   - tokenizers==0.15.2
   - tqdm==4.66.2
   - transformers==4.39.0
-  - trl==0.8.1
+  - trl==0.8.6
   - tyro==0.7.3
   - tzdata==2024.1
   - urllib3==2.2.1
@@ -262,4 +266,4 @@ dependencies:
   - xmltodict==0.13.0
   - xxhash==3.4.1
   - yarl==1.9.4
-prefix: /home/philipp/miniforge3/envs/cl11.9_t_4.39
+prefix: /auto/home/menuab/miniforge3/envs/gemma_env_new
133 changes: 133 additions & 0 deletions local_submit_files/submit_run_HLM.py
@@ -0,0 +1,133 @@
import sys
from contextlib import contextmanager
from datetime import datetime
import submitit

use_accelerate = False
rsync_enabled = False
executor_name = "slurm" # options are ["slurm", "local"]
root_path = ""
num_gpus = 1
# model_name = "gemma"
# model_size = "2b"
model_name = "galactica"
model_size = "125m"
train_type = "sft"
train_name = "_".join([model_name, model_size, train_type])
job_name = "HLM_chem_3wu_32bs_15ep_1e4_nef20"

slurm_params = {
    "slurm_job_name": job_name,
    "timeout_min": 60 * 3,
    "nodes": 1,
    "tasks_per_node": 1,
    "gpus_per_node": num_gpus,
    "cpus_per_task": num_gpus * 8,
    "mem_gb": num_gpus * 40.0 + 20.0,
    "stderr_to_stdout": True,
}

accelerate_config = {"num_processes": num_gpus}

env_variables = {
    "TOKENIZERS_PARALLELISM": "true",
    "CUDA_VISIBLE_DEVICES": "0, 1, 2, 3, 4, 5, 6, 7",
    # "CUDA_VISIBLE_DEVICES": "3",
}

cli_arguments = {
    "train_type": train_type,
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/facebook/"\
    # "galactica-125m/9954e52e400b43d18d3a40f6/checkpoint-20480",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/facebook/"\
    # "galactica-125m/1f289ff103034364bd27e1c3/checkpoint-18000/",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/google/"\
    # "gemma-2b/d6e6a76e91814ad68d5fa264/checkpoint-11000",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/h100/"\
    # "google/gemma-2b/0717d445bcf44e31b2887892/checkpoint-12000",
    "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/h100/"
    "google/gemma-2b/0717d445bcf44e31b2887892/checkpoint-18000",
    "model_config": train_name,
    "dir_data_types": "computed",
    "training_data_dirs": "/auto/home/menuab/code/sft_data/ADME_HLM",
    "valid_data_dir": "",
    # "max_steps":120000,
    "num_train_epochs": 15,
    "learning_rate": 0.0001,
    "warmup": 180,
    "eval_steps": 60,
    "save_steps": 2440,
    "train_batch_size": 32,
    "valid_batch_size": 32,
    "dataloader_num_workers": 1,
    "experiment_name": job_name,
    "checkpoints_root_dir": "/nfs/dgx/raid/chem/checkpoints/",
    "flash_attn": False,
    "track": True,
    "track_dir": "/nfs/dgx/raid/chem/aim/",
    # "profile":,
    # "profile_dir":,
    # "gradient_accumulation_steps":,
    # "gradient_checkpointing":,
    # "evaluate_only":,
    # "check_reproducability":,
}


def get_command(use_accelerate):
    python_executable = sys.executable
    command = [python_executable]
    if use_accelerate:
        accelerate_path = "chemlactica/config/accelerate_config.yaml"
        command.extend(
            f"-m accelerate.commands.launch --config_file {accelerate_path}".split(" ")
        )
        for k, v in accelerate_config.items():
            command.append(f"--{k}={v}")
    command.append("chemlactica/train.py")
    for x, y in cli_arguments.items():
        if isinstance(y, bool):
            if y:
                command.append(f"--{x}")
        else:
            command.append(f"--{x}={y}")

    print(f'command being executed: {" ".join(command)}')
    return command


@contextmanager
def conditional_context_manager(rsync_enabled, repo_path):
    if rsync_enabled:
        with submitit.helpers.RsyncSnapshot(repo_path) as cm:
            yield cm
    else:
        yield None


def get_executor(executor_name, logs_path):
    if executor_name == "slurm":
        executor = submitit.AutoExecutor(folder=logs_path)
    elif executor_name == "local":
        executor = submitit.local.local.LocalExecutor(folder=logs_path)
    return executor


if __name__ == "__main__":
logs_path = "submitit_logs/%j"
logs_path = "/nfs/dgx/raid/chem/" + logs_path if rsync_enabled else logs_path
repo_path = (
"/nfs/dgx/raid/chem/rsyncsnapshots/"
f"{train_name}-{datetime.now().strftime('%Y-%m-%d-%H:%M')}"
)

with conditional_context_manager(rsync_enabled, repo_path):
command = get_command(use_accelerate)
executor = get_executor(executor_name, logs_path)
executor.update_parameters(**slurm_params)
print("train_name: ", train_name)
print("logs_path: ", logs_path)
print("repo path: ", repo_path)
function = submitit.helpers.CommandFunction(command, env=env_variables)
job = executor.submit(function)
# print(job.result())
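For context, the script hands the assembled command to submitit: CommandFunction runs it in a subprocess and captures stdout, and the commented-out job.result() would block until the job finishes and return that output. A minimal generic usage sketch of the same pattern, with a toy command and parameters unrelated to this run:

import submitit

executor = submitit.AutoExecutor(folder="submitit_logs/%j")
executor.update_parameters(slurm_job_name="demo", timeout_min=10)

# CommandFunction executes the command in a subprocess and returns its stdout.
fn = submitit.helpers.CommandFunction(["python", "-c", "print('hello from the job')"])

job = executor.submit(fn)
print("submitted:", job.job_id)
print(job.result())  # blocks until completion, then returns the captured stdout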