From f59f9146dea0b742053ca5cee170583b79b6f056 Mon Sep 17 00:00:00 2001
From: pangda
Date: Wed, 7 Dec 2022 18:33:00 +0800
Subject: [PATCH] fix save_pretrained & load_checkpoint bug in DDP mode

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11012439

* fix save_pretrained & load_checkpoint bug in DDP mode
---
 modelscope/trainers/hooks/checkpoint_hook.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index d5925dbe..5e2fedde 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -215,6 +215,10 @@ class CheckpointHook(Hook):
             # TODO a temp fix to avoid pipeline_name and task mismatch
             config['pipeline'] = {'type': config['task']}
 
+        # remove parallel module that is not JSON serializable
+        if 'parallel' in config and 'module' in config['parallel']:
+            del config['parallel']['module']
+
         class SaveConfig:
 
             def __init__(self, output_dir, config):
@@ -422,4 +426,5 @@ class BestCkptSaverHook(CheckpointHook):
 
     def after_run(self, trainer):
         if self.restore_best:
-            self.load_checkpoint(self._best_ckpt_file, trainer)
+            if is_master():
+                self.load_checkpoint(self._best_ckpt_file, trainer)
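
Context for the first hunk, as a minimal standalone sketch (not part of the patch; the task name and the Linear module are illustrative stand-ins): when the trainer runs in DDP mode the live wrapped module ends up under config['parallel']['module'], and json.dump cannot serialize an nn.Module, so writing the configuration during save_pretrained fails unless that entry is removed first.

    # Illustrative sketch of the JSON-serialization failure the patch guards against.
    # Assumes only that a live nn.Module gets injected into config['parallel']['module'];
    # all concrete values below are made up for the example.
    import json
    import torch.nn as nn

    config = {
        'task': 'text-classification',
        'parallel': {
            'type': 'DistributedDataParallel',
            'module': nn.Linear(4, 2),  # live module object injected at runtime
        },
    }

    # json.dumps(config) here would raise:
    #   TypeError: Object of type Linear is not JSON serializable

    # Same guard the patch adds before the config is written to disk
    if 'parallel' in config and 'module' in config['parallel']:
        del config['parallel']['module']

    print(json.dumps(config, indent=2))  # serializes cleanly now

The second hunk applies the usual DDP rank guard: only the master process restored the best checkpoint file after training, via the existing is_master() helper.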