Refactor finetune (#247)
## Summary by CodeRabbit

- **New Features**
  - Introduced a streamlined approach to finetuning: finetune parameters are now integrated directly into the workflow instead of requiring a separate finetune step.

- **Bug Fixes**
  - Revised model-initialization logic to improve clarity and reduce complexity.

- **Refactor**
  - Removed outdated classes and methods related to modifying training scripts, simplifying the `PrepRunDPTrain` class.
  - Simplified decision-making by eliminating redundant conditions across various functions.
  - Updated test cases and configurations to remove references to the outdated "finetune" step and improve clarity.

- **Chores**
  - Cleaned up the handling of "finetune" modes in specific utility functions for better maintainability.

---------

Signed-off-by: zjgemi <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
zjgemi and pre-commit-ci[bot] authored Aug 5, 2024
1 parent f5c5d95 commit 777c7fa
Showing 10 changed files with 40 additions and 450 deletions.
14 changes: 6 additions & 8 deletions dpgen2/entrypoint/args.py
@@ -374,14 +374,12 @@ def input_args():
doc_mass_map = "The mass map. e.g. [27., 24.]. Al and Mg will be set with mass 27. and 24. amu, respectively."
doc_mixed_type = "Use `deepmd/npy/mixed` format for storing training data."
doc_do_finetune = (
"Finetune the pretrained model before the first iteration. If it is set to True, then an additional step, finetune-step, "
'which is based on a branch of "PrepRunDPTrain," will be added before the dpgen_step. In the '
'finetune-step, the internal flag finetune_mode is set to "finetune," which means SuperOP "PrepRunDPTrain" '
'is now used as the "Finetune." In this step, we finetune the pretrained model in the train step and modify '
'the template after training. After that, in the normal dpgen-step, the flag do_finetune is set as "train-init," '
'which means we use `--init-frz-model` to train based on models from the previous iteration. The "do_finetune" flag '
'is set to False by default, while the internal flag finetune_mode is set to "no," which means anything related '
"to finetuning will not be done."
"Finetune the pretrained model during the first iteration. If it is set to True, then in the first iteration, "
'the internal flag finetune_mode is set to "finetune". In this step, we finetune the pretrained model in the '
'train step. After that, in the following iterations, init_model_policy is forced to be "yes" and the flag '
'finetune_mode is set to "no", which means we use `--init-frz-model` or `--init-model` to train based on '
'models from the previous iteration. The "do_finetune" flag is set to False by default, while the internal '
'flag finetune_mode is set to "no", which means anything related to finetuning will not be done.'
)
doc_do_finetune = textwrap.dedent(doc_do_finetune)
doc_init_data_prefix = "The prefix of initial data systems"
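For readers of the updated `doc_do_finetune` text above, here is a minimal, hypothetical sketch of the relevant portion of a dpgen2 input dict. Only the keys `do_finetune`, `mixed_type`, and `init_model_policy` appear in this diff; the surrounding structure and values are illustrative assumptions, not the authoritative schema.

```python
# Hypothetical input fragment illustrating the refactored finetune behaviour.
# Keys other than "do_finetune", "mixed_type", and "init_model_policy" are
# illustrative placeholders, not the authoritative dpgen2 schema.
config_fragment = {
    "inputs": {
        "mixed_type": False,
        "do_finetune": True,  # finetune the pretrained model in the first iteration
    },
    "train": {
        "config": {
            # With do_finetune=True the workflow forces this to "yes", so later
            # iterations start from the previous iteration's models.
            "init_model_policy": "yes",
        },
    },
}
```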
87 changes: 7 additions & 80 deletions dpgen2/entrypoint/submit.py
@@ -414,53 +414,6 @@ def make_optional_parameter(
return {"data_mixed_type": mixed_type, "finetune_mode": finetune_mode}


def make_finetune_step(
config,
prep_train_config,
run_train_config,
upload_python_packages,
numb_models,
template_script,
train_config,
init_models,
init_data,
iter_data,
valid_data=None,
):
finetune_optional_parameter = {
"mixed_type": config["inputs"]["mixed_type"],
"finetune_mode": "finetune",
}

finetune_op = PrepRunDPTrain(
"finetune",
PrepDPTrain,
RunDPTrain,
prep_config=prep_train_config,
run_config=run_train_config,
upload_python_packages=upload_python_packages,
finetune=True,
valid_data=valid_data,
)
finetune_step = Step(
"finetune-step",
template=finetune_op,
parameters={
"block_id": "finetune",
"numb_models": numb_models,
"template_script": template_script,
"train_config": train_config,
"run_optional_parameter": finetune_optional_parameter,
},
artifacts={
"init_models": init_models,
"init_data": init_data,
"iter_data": iter_data,
},
)
return finetune_step


def get_systems_from_data(data, data_prefix=None):
data = [data] if isinstance(data, str) else data
assert isinstance(data, list)
@@ -472,7 +425,7 @@ def get_systems_from_data(data, data_prefix=None):

def workflow_concurrent_learning(
config: Dict,
) -> Tuple[Step, Optional[Step]]:
) -> Step:
default_config = config["default_step_config"]

train_config = config["train"]["config"]
@@ -614,32 +567,17 @@ def workflow_concurrent_learning(
else:
init_models = None

finetune_step = None
optional_parameter = make_optional_parameter(
config["inputs"]["mixed_type"],
)

if config["inputs"].get("do_finetune", False):
finetune_step = make_finetune_step(
config,
prep_train_config,
run_train_config,
upload_python_packages,
numb_models,
template_script,
train_config,
init_models,
init_data,
iter_data,
valid_data=valid_data,
)

init_models = finetune_step.outputs.artifacts["models"]
template_script = finetune_step.outputs.parameters["template_script"]

if train_config["init_model_policy"] != "yes":
logging.warning("In finetune mode, init_model_policy is forced to be 'yes'")
train_config["init_model_policy"] = "yes"
optional_parameter = make_optional_parameter(
config["inputs"]["mixed_type"],
finetune_mode="train-init",
finetune_mode="finetune",
)

# here the scheduler is passed as input parameter to the concurrent_learning_op
@@ -662,7 +600,7 @@ def workflow_concurrent_learning(
"iter_data": iter_data,
},
)
return dpgen_step, finetune_step
return dpgen_step


def get_scheduler_ids(
@@ -747,9 +685,7 @@ def submit_concurrent_learning(

global_config_workflow(wf_config)

dpgen_step, finetune_step = workflow_concurrent_learning(
wf_config,
)
dpgen_step = workflow_concurrent_learning(wf_config)

if reuse_step is not None and replace_scheduler:
scheduler_new = copy.deepcopy(
@@ -785,17 +721,9 @@ def submit_concurrent_learning(
"conf_selector",
selector,
)
# the modify-train-script step will be added as reuse step.
# the following hack is not needed anymore.
# wf_config["inputs"]["do_finetune"] = False
# finetune will not be done again if the old process is reused.

wf = Workflow(name=wf_config["name"], parallelism=wf_config["parallelism"])

if wf_config["inputs"].get("do_finetune", False):
assert finetune_step is not None
wf.add(finetune_step)

wf.add(dpgen_step)

# for debug purpose, we may not really submit the wf
@@ -889,7 +817,6 @@ def get_resubmit_keys(
"prep-run-train",
"prep-train",
"run-train",
"modify-train-script",
"prep-caly-input",
"prep-caly-model-devi",
"run-caly-model-devi",
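As a quick illustration of the data that now drives finetuning in `submit.py`: the helper `make_optional_parameter` (shown at the top of this file's diff) simply packages the flag. The snippet below is a standalone, runnable sketch of its behaviour, not an excerpt from dpgen2; the default value of `finetune_mode` is inferred from how the helper is called in this diff.

```python
def make_optional_parameter(mixed_type, finetune_mode="no"):
    # Same shape as the helper shown in the submit.py diff above
    # (default value of finetune_mode is assumed from the call sites).
    return {"data_mixed_type": mixed_type, "finetune_mode": finetune_mode}

# First iteration with do_finetune=True:
print(make_optional_parameter(False, finetune_mode="finetune"))
# {'data_mixed_type': False, 'finetune_mode': 'finetune'}

# Default behaviour when finetuning is disabled:
print(make_optional_parameter(False))
# {'data_mixed_type': False, 'finetune_mode': 'no'}
```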
11 changes: 10 additions & 1 deletion dpgen2/flow/dpgen_loop.py
@@ -77,6 +77,13 @@ def make_block_optional_parameter(cl_optional_parameter):
}


def make_next_optional_parameter(optional_parameter):
return {
"data_mixed_type": optional_parameter["data_mixed_type"],
"finetune_mode": "no", # not to do finetune for `next` loop
}


class SchedulerWrapper(OP):
@classmethod
def get_input_sign(cls):
@@ -426,7 +433,9 @@ def _loop(
"exploration_scheduler": scheduler_step.outputs.parameters[
"exploration_scheduler"
],
"optional_parameter": steps.inputs.parameters["optional_parameter"],
"optional_parameter": make_next_optional_parameter(
steps.inputs.parameters["optional_parameter"]
),
"expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"],
}
next_step = Step(
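The new `make_next_optional_parameter` helper ensures that finetuning only ever happens in the first iteration: whatever the incoming optional parameter, the copy handed to the next loop has `finetune_mode` reset to `"no"`. In the real workflow it operates on a dflow parameter object rather than a plain dict; the plain-dict sketch below only illustrates the intended value transformation.

```python
def make_next_optional_parameter(optional_parameter):
    # Mirrors the helper added to dpgen_loop.py in this commit.
    return {
        "data_mixed_type": optional_parameter["data_mixed_type"],
        "finetune_mode": "no",  # never finetune again in subsequent loops
    }

first_iter = {"data_mixed_type": False, "finetune_mode": "finetune"}
print(make_next_optional_parameter(first_iter))
# {'data_mixed_type': False, 'finetune_mode': 'no'}
```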
68 changes: 2 additions & 66 deletions dpgen2/op/run_dp_train.py
@@ -71,10 +71,9 @@ def _make_train_command(
return command
# case of init model and finetune
assert checkpoint is None
do_init_model_or_train_init = do_init_model or finetune_mode == "train-init"
case_init_model = do_init_model_or_train_init and (not init_model_with_finetune)
case_init_model = do_init_model and (not init_model_with_finetune)
case_finetune = finetune_mode == "finetune" or (
do_init_model_or_train_init and init_model_with_finetune
do_init_model and init_model_with_finetune
)
if case_init_model:
init_flag = "--init-frz-model" if impl == "tensorflow" else "--init-model"
@@ -101,69 +100,6 @@ def _make_train_command(
return command


def _make_train_command_old(
dp_command,
train_script_name,
impl,
do_init_model,
init_model,
finetune_mode,
finetune_args,
init_model_with_finetune,
):
if impl == "tensorflow" and os.path.isfile("checkpoint"):
command = dp_command + [
"train",
"--restart",
"model.ckpt",
train_script_name,
]
elif impl == "pytorch" and len(glob.glob("model.ckpt-[0-9]*.pt")) > 0:
checkpoint = "model.ckpt-%s.pt" % max(
[int(f[11:-3]) for f in glob.glob("model.ckpt-[0-9]*.pt")]
)
command = dp_command + [
"train",
"--restart",
checkpoint,
train_script_name,
]
elif (
do_init_model or finetune_mode == "train-init"
) and not init_model_with_finetune:
if impl == "pytorch":
command = dp_command + [
"train",
"--init-model",
str(init_model),
train_script_name,
]
else:
command = dp_command + [
"train",
"--init-frz-model",
str(init_model),
train_script_name,
]
elif finetune_mode == "finetune" or (
(do_init_model or finetune_mode == "train-init") and init_model_with_finetune
):
command = (
dp_command
+ [
"train",
train_script_name,
"--finetune",
str(init_model),
]
+ finetune_args.split()
)
else:
command = dp_command + ["train", train_script_name]

return command


class RunDPTrain(OP):
r"""Execute a DP training task. Train and freeze a DP model.
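With the `"train-init"` mode gone, `_make_train_command` distinguishes only an init-model case and a finetune case (plus restart and plain training, which fall outside the hunk above). The following standalone sketch re-implements just that decision logic for illustration; argument order and flag handling are inferred from this diff and from the removed `_make_train_command_old`, so treat it as an approximation rather than the exact dpgen2 source.

```python
def sketch_train_command(
    dp_command, script, impl, do_init_model, init_model,
    finetune_mode, finetune_args, init_model_with_finetune,
):
    # Simplified decision logic after the refactor (restart handling omitted).
    case_init_model = do_init_model and not init_model_with_finetune
    case_finetune = finetune_mode == "finetune" or (
        do_init_model and init_model_with_finetune
    )
    if case_init_model:
        init_flag = "--init-frz-model" if impl == "tensorflow" else "--init-model"
        return dp_command + ["train", init_flag, str(init_model), script]
    if case_finetune:
        return (
            dp_command
            + ["train", script, "--finetune", str(init_model)]
            + finetune_args.split()
        )
    return dp_command + ["train", script]


print(sketch_train_command(
    ["dp"], "input.json", "pytorch",
    do_init_model=False, init_model="pretrained.pt",
    finetune_mode="finetune", finetune_args="",
    init_model_with_finetune=False,
))
# ['dp', 'train', 'input.json', '--finetune', 'pretrained.pt']
```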