Minor changes for run #31

Merged · 9 commits · May 23, 2024
@@ -6,7 +6,7 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
+  max_learning_rate: 1.0e-4
   warmup_steps: 0
   weight_decay: 0.1
   bf16: true
@@ -27,4 +27,5 @@ model_config:
   block_size: 2048
   vocab_size: 50000
   separator_token: </s>
+  separator_token_id: 2
   tokenizer_path: "./chemlactica/tokenizer/ChemLacticaTokenizer66"
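
The new separator_token_id should match the id the tokenizer assigns to </s>. A minimal sanity-check sketch, assuming the tokenizer directory from this config is available locally:

# Sketch: verify separator_token_id against the tokenizer (assumes the local tokenizer path).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./chemlactica/tokenizer/ChemLacticaTokenizer66")
sep_id = tok.convert_tokens_to_ids("</s>")
assert sep_id == 2, f"config says separator_token_id: 2, tokenizer reports {sep_id}"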
6 changes: 3 additions & 3 deletions chemlactica/config/config_yamls/gemma_2b_sft_config.yaml
@@ -6,15 +6,15 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
-  warmup_steps: 0
+  max_learning_rate: 1.0e-4
+  warmup_steps: 305
   weight_decay: 0.1
   bf16: true
   bf16_full_eval: true
   fp16: false
   tf32: true
   evaluation_strategy: "steps"
-  save_total_limit: 4
+  save_total_limit: 8
   grad_accumulation_scheduler: false
   dynamic_grad_accumulation: false
   grad_accumulation_patience: 4000
4 changes: 2 additions & 2 deletions chemlactica/config/default_train_config.py
@@ -42,5 +42,5 @@ class TrainConfig:
 @dataclass
 class SFTTrainConfig:
     packing: bool = False
-    max_seq_length: int = 512
-    neftune_noise_alpha: int = 0
+    max_seq_length: int = 64
+    neftune_noise_alpha: int = 10
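
These defaults correspond to arguments of the same names on trl's SFTTrainer. The sketch below shows one way they might be forwarded; the trainer wiring and the model, training_args, dataset, formatting function, and collator variables are assumptions for illustration, not code from this repository.

# Sketch: forwarding the SFT config to trl's SFTTrainer (wiring and variable names are assumed).
from trl import SFTTrainer

sft_config = SFTTrainConfig()  # packing=False, max_seq_length=64, neftune_noise_alpha=10
trainer = SFTTrainer(
    model=model,                        # placeholder: the loaded causal LM
    args=training_args,                 # placeholder: transformers.TrainingArguments
    train_dataset=dataset["train"],     # placeholder: the SFT dataset
    formatting_func=sft_formatting_prompts_func,
    data_collator=collator,
    packing=sft_config.packing,
    max_seq_length=sft_config.max_seq_length,             # short sequences: SMILES plus one property
    neftune_noise_alpha=sft_config.neftune_noise_alpha,   # NEFTune embedding noise; 0 disables it
)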
2 changes: 1 addition & 1 deletion chemlactica/get_trainer.py
@@ -25,7 +25,7 @@ def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_
     elif train_type == "sft":
         sft_config = SFTTrainConfig()
         tokenizer = get_tokenizer(training_args.tokenizer_path)
-        response_template = "[PROPERTY]activity "
+        response_template = tokenizer.encode("[PROPERTY]activity")
         collator = DataCollatorForCompletionOnlyLM(
             response_template, tokenizer=tokenizer
         )
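
Passing token ids instead of the raw string avoids mismatches that can occur when the template tokenizes differently inside a full training sequence; trl's DataCollatorForCompletionOnlyLM accepts either form. A minimal sketch, reusing the tokenizer path from the config above; the SMILES example is illustrative:

# Sketch: completion-only label masking with a token-id response template.
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("./chemlactica/tokenizer/ChemLacticaTokenizer66")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # padding is needed once batches hold mixed lengths

response_template = tokenizer.encode("[PROPERTY]activity")  # list of token ids
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

example = tokenizer("[START_SMILES]CCO[END_SMILES][PROPERTY]activity 0.5[/PROPERTY]")
batch = collator([example])
# Labels up to and including the response template are set to -100, so the loss
# is computed only on the completion (the activity value and the closing tag).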
6 changes: 3 additions & 3 deletions chemlactica/train.py
@@ -66,6 +66,7 @@ def train(
     dir_data_types,
     valid_data_dir,
     learning_rate,
+    warmup_steps,
     scheduler_max_steps,
     eval_steps,
     save_steps,
@@ -115,7 +116,6 @@ def train(
         gradient_checkpointing=gradient_checkpointing,
         auth_token=auth_token,
     )
-
     # special_tokens = get_tokenizer_special_tokens(model_config.tokenizer_path)
     # print(f"{len(special_tokens)} {special_tokens} additional special tokens.")
     tokenizer_length = get_tokenizer_length(model_config)
@@ -237,7 +237,7 @@ def train(
         weight_decay=train_config.weight_decay,
         adam_beta1=train_config.adam_beta1,
         adam_beta2=train_config.adam_beta2,
-        warmup_steps=train_config.warmup_steps,
+        warmup_steps=warmup_steps if warmup_steps else train_config.warmup_steps,
         max_grad_norm=train_config.global_gradient_norm,
         evaluation_strategy=train_config.evaluation_strategy,
         max_steps=scheduler_max_steps,
@@ -285,7 +285,7 @@ def train(
     )
     if train_type == "sft":
         trainer_callback_dict["SFT numerical evaluation"] = SFTNumericalEval(
-            dataset, aim_callback
+            dataset, aim_callback, model_config.separator_token
         )
     elif train_type == "pretrain":
         if train_config.grad_accumulation_scheduler:
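
The new warmup_steps argument overrides the config value only when it is provided. A small self-contained sketch of the fallback expression used above; the helper name is illustrative:

# Sketch of the override pattern: the CLI value wins when given, otherwise fall back to the config.
def resolve_warmup(cli_warmup_steps, config_warmup_steps):
    # Note: `x if x else y` also falls back when the CLI passes 0 explicitly;
    # `x if x is not None else y` would preserve an explicit 0.
    return cli_warmup_steps if cli_warmup_steps else config_warmup_steps

assert resolve_warmup(305, 0) == 305
assert resolve_warmup(None, 300) == 300
assert resolve_warmup(0, 300) == 300  # explicit 0 is treated as "not set"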
16 changes: 13 additions & 3 deletions chemlactica/utils/callbacks.py
@@ -341,10 +341,11 @@ def on_step_end(self, args, state, control, **kwargs):


 class SFTNumericalEval(TrainerCallback):
-    def __init__(self, dataset, aim_callback) -> None:
+    def __init__(self, dataset, aim_callback, separator_token) -> None:
         super().__init__()
         self.dataset = dataset
         self.aim = aim_callback
+        self.separator_token = separator_token

     def on_evaluate(
         self,
@@ -358,11 +359,20 @@ def on_evaluate(
         super().on_evaluate(args, state, control, **kwargs)
         model.eval()
         ground_truths, gens, diffs = [], [], []
+        eos_token_id = tokenizer.encode("[/PROPERTY]")[0]
         for sample in self.dataset["validation"]:
             ground_truth = round(sample["activity"], 2)
-            prompt = f"[START_SMILES]{sample['smiles']}[END_SMILES][PROPERTY]activity "
+            prompt = (
+                f"{self.separator_token}[START_SMILES]{sample['smiles']}"
+                "[END_SMILES][PROPERTY]activity"
+            )
             prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
-            out = model.generate(prompt.input_ids, do_sample=False, max_length=100)
+            out = model.generate(
+                prompt.input_ids,
+                do_sample=False,
+                eos_token_id=eos_token_id,
+                max_new_tokens=100,
+            )
             out = tokenizer.batch_decode(out)[0]
             try:
                 gen = out[
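
Switching from max_length to max_new_tokens budgets only the generated tokens, and the custom eos_token_id stops decoding right after the closing [/PROPERTY] tag. A hedged, self-contained sketch of the same call pattern; generate_activity is an illustrative helper, and it assumes a fine-tuned model plus the ChemLactica tokenizer, where [/PROPERTY] encodes to a single token:

# Sketch: greedy generation bounded by a new-token budget and a custom end tag.
import torch

def generate_activity(model, tokenizer, smiles: str, separator_token: str = "</s>") -> str:
    eos_token_id = tokenizer.encode("[/PROPERTY]")[0]
    prompt = f"{separator_token}[START_SMILES]{smiles}[END_SMILES][PROPERTY]activity"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            inputs.input_ids,
            do_sample=False,            # greedy decoding for a deterministic eval
            eos_token_id=eos_token_id,  # stop right after the closing tag
            max_new_tokens=100,         # budget counts generated tokens only
        )
    return tokenizer.batch_decode(out)[0]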
4 changes: 2 additions & 2 deletions chemlactica/utils/dataset_utils.py
@@ -203,8 +203,8 @@ def sft_formatting_prompts_func(example):
     output_texts = []
     for i in range(len(example["smiles"])):
         text = (
-            f"[START_SMILES]{example['smiles'][i]}[END_SMILES]"
-            f"[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"
+            f"<bos>[START_SMILES]{example['smiles'][i]}[END_SMILES]"
+            "[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"
         )
         output_texts.append(text)
     return output_texts
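
For reference, a runnable sketch of this formatting function in isolation; note that the second fragment needs the f prefix for the rounded activity to be interpolated (the example data is illustrative):

# Sketch: a batched formatting function of this shape, with illustrative input data.
def format_prompts(example):
    output_texts = []
    for i in range(len(example["smiles"])):
        text = (
            f"<bos>[START_SMILES]{example['smiles'][i]}[END_SMILES]"
            f"[PROPERTY]activity {round(example['activity'][i], 2)}[/PROPERTY]"  # f prefix needed to interpolate
        )
        output_texts.append(text)
    return output_texts

print(format_prompts({"smiles": ["CCO"], "activity": [0.4567]}))
# ['<bos>[START_SMILES]CCO[END_SMILES][PROPERTY]activity 0.46[/PROPERTY]']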
11 changes: 10 additions & 1 deletion chemlactica/utils/parseargs.py
@@ -55,13 +55,22 @@ def init_parser():
     )
     parser.add_argument(
         "--learning_rate",
-        type=int,
+        type=float,
         metavar="LR",
         dest="learning_rate",
         required=False,
         default=None,
         help="learning rate",
     )
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        metavar="WA",
+        dest="warmup_steps",
+        required=False,
+        default=None,
+        help="warmup steps",
+    )
     parser.add_argument(
         "--max_steps",
         type=int,
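
With type=float the learning rate can be given in scientific notation (type=int would reject 1e-4), and the new --warmup flag feeds the warmup_steps override in train.py. A standalone sketch of just these two options:

# Sketch: the two CLI options in isolation (standalone argparse example).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--learning_rate", type=float, metavar="LR", dest="learning_rate",
                    required=False, default=None, help="learning rate")
parser.add_argument("--warmup", type=int, metavar="WA", dest="warmup_steps",
                    required=False, default=None, help="warmup steps")

args = parser.parse_args(["--learning_rate", "1e-4", "--warmup", "305"])
print(args.learning_rate, args.warmup_steps)  # 0.0001 305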
1 change: 1 addition & 0 deletions chemlactica/utils/utils.py
@@ -43,6 +43,7 @@ def get_tokenizer(tokenizer_path):
 def create_tokenizer(tokenizer_path):
     tok = AutoTokenizer.from_pretrained(tokenizer_path)
     tok.add_bos_token = False
+    tok.padding_side = "right"
     print(f"Process {os.getpid()} created a tokenizer")
     return tok

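
Right padding is the usual choice for causal-LM fine-tuning, since pad tokens land after the sequence and labels stay aligned with the prompt; left padding is mainly needed for batched generation. A small sketch of the effect, using the gpt2 tokenizer purely for illustration:

# Sketch: padding side determines where pad tokens land in a batch (gpt2 used for illustration).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer with no pad token by default
tok.pad_token = tok.eos_token
tok.padding_side = "right"

batch = tok(["short", "a somewhat longer prompt"], padding=True)
print(batch["input_ids"])  # the shorter sequence is padded on the right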
10 changes: 7 additions & 3 deletions environment.yml
@@ -1,4 +1,4 @@
-name: cl11.9_t_4.39
+name: gemma_env_new
 channels:
   - pytorch
   - nvidia
@@ -180,6 +180,8 @@ dependencies:
   - backoff==2.2.1
   - base58==2.0.1
   - bitsandbytes==0.43.0
+  - boto3==1.34.84
+  - botocore==1.34.84
   - cachetools==5.3.3
   - certifi==2024.2.2
   - cffi==1.16.0
@@ -208,6 +210,7 @@ dependencies:
   - huggingface-hub==0.22.2
   - identify==2.5.35
   - idna==3.6
+  - jmespath==1.0.1
   - joblib==1.3.2
   - kiwisolver==1.4.5
   - mako==1.3.2
@@ -239,6 +242,7 @@ dependencies:
   - requests==2.31.0
   - restrictedpython==7.1
   - rich==13.7.1
+  - s3transfer==0.10.1
   - safetensors==0.4.2
   - scikit-learn==1.4.1.post1
   - scipy==1.12.0
@@ -253,7 +257,7 @@ dependencies:
   - tokenizers==0.15.2
   - tqdm==4.66.2
   - transformers==4.39.0
-  - trl==0.8.1
+  - trl==0.8.6
   - tyro==0.7.3
   - tzdata==2024.1
   - urllib3==2.2.1
@@ -262,4 +266,4 @@ dependencies:
   - xmltodict==0.13.0
   - xxhash==3.4.1
   - yarl==1.9.4
-prefix: /home/philipp/miniforge3/envs/cl11.9_t_4.39
+prefix: /auto/home/menuab/miniforge3/envs/gemma_env_new
133 changes: 133 additions & 0 deletions local_submit_files/submit_run_HLM.py
@@ -0,0 +1,133 @@
import sys
from contextlib import contextmanager
from datetime import datetime
import submitit

use_accelerate = False
rsync_enabled = False
executor_name = "slurm" # options are ["slurm", "local"]
root_path = ""
num_gpus = 1
# model_name = "gemma"
# model_size = "2b"
model_name = "galactica"
model_size = "125m"
train_type = "sft"
train_name = "_".join([model_name, model_size, train_type])
job_name = "HLM_chem_3wu_32bs_15ep_1e4_nef20"

slurm_params = {
    "slurm_job_name": job_name,
    "timeout_min": 60 * 3,
    "nodes": 1,
    "tasks_per_node": 1,
    "gpus_per_node": num_gpus,
    "cpus_per_task": num_gpus * 8,
    "mem_gb": num_gpus * 40.0 + 20.0,
    "stderr_to_stdout": True,
}

accelerate_config = {"num_processes": num_gpus}

env_variables = {
    "TOKENIZERS_PARALLELISM": "true",
    "CUDA_VISIBLE_DEVICES": "0, 1, 2, 3, 4, 5, 6, 7",
    # "CUDA_VISIBLE_DEVICES": "3",
}

cli_arguments = {
    "train_type": train_type,
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/facebook/"\
    # "galactica-125m/9954e52e400b43d18d3a40f6/checkpoint-20480",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/facebook/"\
    # "galactica-125m/1f289ff103034364bd27e1c3/checkpoint-18000/",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/google/"\
    # "gemma-2b/d6e6a76e91814ad68d5fa264/checkpoint-11000",
    # "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/h100/"\
    # "google/gemma-2b/0717d445bcf44e31b2887892/checkpoint-12000",
    "from_pretrained": "/nfs/dgx/raid/chem/checkpoints/h100/"
    "google/gemma-2b/0717d445bcf44e31b2887892/checkpoint-18000",
    "model_config": train_name,
    "dir_data_types": "computed",
    "training_data_dirs": "/auto/home/menuab/code/sft_data/ADME_HLM",
    "valid_data_dir": "",
    # "max_steps":120000,
    "num_train_epochs": 15,
    "learning_rate": 0.0001,
    "warmup": 180,
    "eval_steps": 60,
    "save_steps": 2440,
    "train_batch_size": 32,
    "valid_batch_size": 32,
    "dataloader_num_workers": 1,
    "experiment_name": job_name,
    "checkpoints_root_dir": "/nfs/dgx/raid/chem/checkpoints/",
    "flash_attn": False,
    "track": True,
    "track_dir": "/nfs/dgx/raid/chem/aim/",
    # "profile":,
    # "profile_dir":,
    # "gradient_accumulation_steps":,
    # "gradient_checkpointing":,
    # "evaluate_only":,
    # "check_reproducability":,
}


def get_command(use_accelerate):
    python_executable = sys.executable
    command = [python_executable]
    if use_accelerate:
        accelerate_path = "chemlactica/config/accelerate_config.yaml"
        command.extend(
            f"-m accelerate.commands.launch --config_file {accelerate_path}".split(" ")
        )
        for k, v in accelerate_config.items():
            command.append(f"--{k}={v}")
    command.append("chemlactica/train.py")
    for x, y in cli_arguments.items():
        if isinstance(y, bool):
            if y:
                command.append(f"--{x}")
        else:
            command.append(f"--{x}={y}")

    print(f'command being executed: {" ".join(command)}')
    return command


@contextmanager
def conditional_context_manager(rsync_enabled, repo_path):
    if rsync_enabled:
        with submitit.helpers.RsyncSnapshot(repo_path) as cm:
            yield cm
    else:
        yield None


def get_executor(executor_name, logs_path):
    if executor_name == "slurm":
        executor = submitit.AutoExecutor(folder=logs_path)
    elif executor_name == "local":
        executor = submitit.local.local.LocalExecutor(folder=logs_path)
    return executor


if __name__ == "__main__":
    logs_path = "submitit_logs/%j"
    logs_path = "/nfs/dgx/raid/chem/" + logs_path if rsync_enabled else logs_path
    repo_path = (
        "/nfs/dgx/raid/chem/rsyncsnapshots/"
        f"{train_name}-{datetime.now().strftime('%Y-%m-%d-%H:%M')}"
    )

    with conditional_context_manager(rsync_enabled, repo_path):
        command = get_command(use_accelerate)
        executor = get_executor(executor_name, logs_path)
        executor.update_parameters(**slurm_params)
        print("train_name: ", train_name)
        print("logs_path: ", logs_path)
        print("repo path: ", repo_path)
        function = submitit.helpers.CommandFunction(command, env=env_variables)
        job = executor.submit(function)
        # print(job.result())