Skip to content

Commit

Permalink
add checkpoint_done to last model
Browse files Browse the repository at this point in the history
  • Loading branch information
gongel committed Apr 10, 2024
1 parent 766b993 commit 90b87fc
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions paddlenlp/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2047,6 +2047,11 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op
# recover unified_checkpoint_config when not in the training stage
if not self.is_in_train:
self.args.unified_checkpoint_config = unified_checkpoint_config_backup
if strtobool(os.getenv("FLAG_LLM_PDC", "False")):
# save checkpoint_done file to ensure checkpoint is complete
if self.args.should_save_model_state and self.args.should_save:
# For ckpt integrity
paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done"))

def _save_checkpoint(self, model, metrics=None):
# assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"
Expand Down

0 comments on commit 90b87fc

Please sign in to comment.