diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py new file mode 100644 index 00000000..69fdd400 --- /dev/null +++ b/modelscope/trainers/default_config.py @@ -0,0 +1,14 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +DEFAULT_CONFIG = { + 'train': { + 'hooks': [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 10 + }, { + 'type': 'IterTimerHook' + }] + } +} diff --git a/modelscope/trainers/hooks/__init__.py b/modelscope/trainers/hooks/__init__.py index d54c110a..4826c81f 100644 --- a/modelscope/trainers/hooks/__init__.py +++ b/modelscope/trainers/hooks/__init__.py @@ -6,11 +6,12 @@ from .hook import Hook from .iter_timer_hook import IterTimerHook from .logger.text_logger_hook import TextLoggerHook from .lr_scheduler_hook import LrSchedulerHook -from .optimizer_hook import OptimizerHook +from .optimizer_hook import (ApexAMPOptimizerHook, OptimizerHook, + TorchAMPOptimizerHook) from .priority import Priority __all__ = [ 'Hook', 'HOOKS', 'CheckpointHook', 'EvaluationHook', 'LrSchedulerHook', 'OptimizerHook', 'Priority', 'build_hook', 'TextLoggerHook', - 'IterTimerHook' + 'IterTimerHook', 'TorchAMPOptimizerHook', 'ApexAMPOptimizerHook' ] diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index 6892ac9c..6fb53e57 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -3,6 +3,7 @@ import os from modelscope import __version__ from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.constant import LogKeys from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import get_dist_info from .builder import HOOKS @@ -58,11 +59,11 @@ class CheckpointHook(Hook): def _save_checkpoint(self, trainer): if self.by_epoch: - cur_save_name = os.path.join(self.save_dir, - f'epoch_{trainer.epoch + 1}.pth') + cur_save_name = os.path.join( + self.save_dir, f'{LogKeys.EPOCH}_{trainer.epoch + 1}.pth') else: - cur_save_name = os.path.join(self.save_dir, - f'iter_{trainer.epoch + 1}.pth') + cur_save_name = os.path.join( + self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth') rank, _ = get_dist_info() if rank == 0: diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py index 7b06a170..325606f5 100644 --- a/modelscope/trainers/hooks/evaluation_hook.py +++ b/modelscope/trainers/hooks/evaluation_hook.py @@ -1,4 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os + +from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.constant import LogKeys +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import get_dist_info from .builder import HOOKS from .hook import Hook from .priority import Priority @@ -12,17 +18,56 @@ class EvaluationHook(Hook): by_epoch (bool): Evaluate by epoch or by iteration. start_idx (int | None, optional): The epoch/iterations validation begins. Default: None, validate every interval epochs/iterations from scratch. + save_best_ckpt (bool): Whether save the best checkpoint during evaluation. + monitor_key (str): Monitor key to compare rule for best score, only valid when `save_best_ckpt` is true. + rule (str): Comparison rule for best score, only valid when `save_best_ckpt` is true. + Support "max" and "min". 
If rule is "max", the checkpoint at the maximum `monitor_key` + will be saved, If rule is "min", the checkpoint at the minimum `monitor_key` will be saved. + out_dir (str): Output directory to save best checkpoint. """ PRIORITY = Priority.NORMAL + rule_map = {'max': lambda x, y: x > y, 'min': lambda x, y: x < y} - def __init__(self, interval=1, by_epoch=True, start_idx=None): - + def __init__(self, + interval=1, + by_epoch=True, + start_idx=None, + save_best_ckpt=False, + monitor_key=None, + rule='max', + out_dir=None): assert interval > 0, 'interval must be a positive number' + if save_best_ckpt: + assert monitor_key is not None, 'Must provide `monitor_key` when `save_best_ckpt` is True.' + assert rule in ['max', + 'min'], 'Only support "max" or "min" rule now.' self.interval = interval self.start_idx = start_idx self.by_epoch = by_epoch + self.save_best_ckpt = save_best_ckpt + self.monitor_key = monitor_key + self.rule = rule + self.out_dir = out_dir + self._best_metric = None + self._best_ckpt_file = None + + def before_run(self, trainer): + if not self.out_dir: + self.out_dir = trainer.work_dir + if not os.path.exists(self.out_dir): + rank, _ = get_dist_info() + if rank == 0: + os.makedirs(self.out_dir) + + if self.save_best_ckpt: + if not hasattr(trainer, 'logger'): + self.logger = get_logger(__name__) + else: + self.logger = trainer.logger + self.logger.info( + f'Best checkpoint will be saved to {self.out_dir}') def after_train_iter(self, trainer): """Called after every training iter to evaluate the results.""" @@ -42,6 +87,46 @@ class EvaluationHook(Hook): trainer.log_buffer.ready = True + if self.save_best_ckpt and self._is_best_metric(eval_res): + # remove the previous best model and save the latest best model + if self._best_ckpt_file is not None and os.path.exists( + self._best_ckpt_file): + os.remove(self._best_ckpt_file) + self._save_checkpoint(trainer) + + def _is_best_metric(self, eval_res): + if self.monitor_key not in eval_res: + raise ValueError( + f'Not find monitor_key: {self.monitor_key} in {eval_res}') + + if self._best_metric is None: + self._best_metric = eval_res[self.monitor_key] + return True + else: + compare_fn = self.rule_map[self.rule] + if compare_fn(eval_res[self.monitor_key], self._best_metric): + self._best_metric = eval_res[self.monitor_key] + return True + return False + + def _save_checkpoint(self, trainer): + if self.by_epoch: + cur_save_name = os.path.join( + self.out_dir, + f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.monitor_key}{self._best_metric}.pth' + ) + else: + cur_save_name = os.path.join( + self.out_dir, + f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.monitor_key}{self._best_metric}.pth' + ) + + rank, _ = get_dist_info() + if rank == 0: + save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) + + self._best_ckpt_file = cur_save_name + def _should_evaluate(self, trainer): """Judge whether to perform evaluation. diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py index e7ad2c37..3a58557b 100644 --- a/modelscope/trainers/hooks/hook.py +++ b/modelscope/trainers/hooks/hook.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.utils.constant import TrainerStages from modelscope.utils.import_utils import is_method_overridden from .priority import Priority @@ -9,11 +10,12 @@ class Hook: The Hook base class of any modelscope trainer. You can build your own hook inherited from this class. 
""" - # TODO @jiangnana.jnn use constant variable for stages - stages = ('before_run', 'before_train_epoch', 'before_train_iter', - 'after_train_iter', 'after_train_epoch', 'before_val_epoch', - 'before_val_iter', 'after_val_iter', 'after_val_epoch', - 'after_run') + stages = (TrainerStages.before_run, TrainerStages.before_train_epoch, + TrainerStages.before_train_iter, TrainerStages.after_train_iter, + TrainerStages.after_train_epoch, TrainerStages.before_val_epoch, + TrainerStages.before_val_iter, TrainerStages.after_val_iter, + TrainerStages.after_val_epoch, TrainerStages.after_run) + PRIORITY = Priority.NORMAL def before_run(self, trainer): @@ -171,6 +173,13 @@ class Hook: """ return (trainer.epoch + 1) % n == 0 if n > 0 else False + def every_n_inner_iters(self, runner, n): + """ + Whether to reach every ``n`` iterations at every epoch + Returns: bool + """ + return (runner.inner_iter + 1) % n == 0 if n > 0 else False + def every_n_iters(self, trainer, n): """ Whether to reach every ``n`` iterations diff --git a/modelscope/trainers/hooks/iter_timer_hook.py b/modelscope/trainers/hooks/iter_timer_hook.py index b57d8653..70d8508b 100644 --- a/modelscope/trainers/hooks/iter_timer_hook.py +++ b/modelscope/trainers/hooks/iter_timer_hook.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import time +from modelscope.utils.constant import LogKeys from .builder import HOOKS from .hook import Hook from .priority import Priority @@ -15,8 +16,9 @@ class IterTimerHook(Hook): def before_iter(self, trainer): trainer.log_buffer.update( - {'data_load_time': time.time() - self.start_time}) + {LogKeys.DATA_LOAD_TIME: time.time() - self.start_time}) def after_iter(self, trainer): - trainer.log_buffer.update({'time': time.time() - self.start_time}) + trainer.log_buffer.update( + {LogKeys.ITER_TIME: time.time() - self.start_time}) self.start_time = time.time() diff --git a/modelscope/trainers/hooks/logger/__init__.py b/modelscope/trainers/hooks/logger/__init__.py index 16eb8797..f5cd544b 100644 --- a/modelscope/trainers/hooks/logger/__init__.py +++ b/modelscope/trainers/hooks/logger/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from modelscope.trainers.utils.log_buffer import LogBuffer from .base import LoggerHook +from .tensorboard_hook import TensorboardHook from .text_logger_hook import TextLoggerHook -__all__ = ['TextLoggerHook', 'LoggerHook', 'LogBuffer'] +__all__ = ['TextLoggerHook', 'LoggerHook', 'LogBuffer', 'TensorboardHook'] diff --git a/modelscope/trainers/hooks/logger/base.py b/modelscope/trainers/hooks/logger/base.py index 98c5e421..18ef6eaf 100644 --- a/modelscope/trainers/hooks/logger/base.py +++ b/modelscope/trainers/hooks/logger/base.py @@ -7,6 +7,7 @@ import numpy as np import torch from modelscope.trainers.hooks.hook import Hook +from modelscope.utils.constant import ModeKeys from ..priority import Priority @@ -60,15 +61,12 @@ class LoggerHook(Hook): return False def get_epoch(self, trainer): - if trainer.mode == 'train': + if trainer.mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: epoch = trainer.epoch + 1 - elif trainer.mode == 'val': - # normal val mode - # trainer.epoch += 1 has been done before val workflow - epoch = trainer.epoch else: - raise ValueError(f"trainer mode should be 'train' or 'val', " - f'but got {trainer.mode}') + raise ValueError( + f'trainer mode should be {ModeKeys.TRAIN} or {ModeKeys.EVAL}, ' + f'but got {trainer.mode}') return epoch def get_iter(self, trainer, inner_iter=False): @@ -89,7 +87,7 @@ class LoggerHook(Hook): trainer.log_buffer.clear() # clear logs of last epoch def after_train_iter(self, trainer): - if self.by_epoch and self.every_n_epochs(trainer, self.interval): + if self.by_epoch and self.every_n_inner_iters(trainer, self.interval): trainer.log_buffer.average(self.interval) elif not self.by_epoch and self.every_n_iters(trainer, self.interval): trainer.log_buffer.average(self.interval) diff --git a/modelscope/trainers/hooks/logger/tensorboard_hook.py b/modelscope/trainers/hooks/logger/tensorboard_hook.py new file mode 100644 index 00000000..a6a68768 --- /dev/null +++ b/modelscope/trainers/hooks/logger/tensorboard_hook.py @@ -0,0 +1,68 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os + +from modelscope.trainers.hooks.builder import HOOKS +from modelscope.utils.constant import LogKeys +from modelscope.utils.torch_utils import master_only +from .base import LoggerHook + + +@HOOKS.register_module() +class TensorboardHook(LoggerHook): + """TensorBoard hook for visualization. + Args: + out_dir: output directory to save tensorboard files + interval (int): Logging interval (every k iterations). + ignore_last (bool): Ignore the log of last iterations in each epoch + if less than `interval`. + reset_flag (bool): Whether to clear the output buffer after logging. + by_epoch (bool): Whether EpochBasedtrainer is used. + skip_keys (list): list of keys which will not add to tensorboard + """ + + def __init__(self, + out_dir=None, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True, + skip_keys=[LogKeys.ITER_TIME, LogKeys.DATA_LOAD_TIME]): + super(TensorboardHook, self).__init__( + interval=interval, + ignore_last=ignore_last, + reset_flag=reset_flag, + by_epoch=by_epoch) + self.out_dir = out_dir + self.skip_keys = skip_keys + + @master_only + def before_run(self, trainer): + super(TensorboardHook, self).before_run(trainer) + try: + from torch.utils.tensorboard import SummaryWriter + except ImportError as e: + raise ImportError( + e.msg + ' ' + 'Please pip install tensorboard by ``pip install future tensorboard`` ' + 'or upgrade version by ``pip install future tensorboard --upgrade``.' 
+ ) + + if self.out_dir is None: + self.out_dir = os.path.join(trainer.work_dir, 'tensorboard_output') + self.writer = SummaryWriter(self.out_dir) + + @master_only + def log(self, trainer): + for key, val in trainer.log_buffer.output.items(): + if key in self.skip_keys: + continue + if isinstance(val, str): + self.writer.add_text(key, val, self.get_iter(trainer)) + elif self.is_scalar(val): + self.writer.add_scalar(key, val, self.get_iter(trainer)) + else: + pass + + @master_only + def after_run(self, trainer): + self.writer.close() diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py index c6e39400..7fb4e397 100644 --- a/modelscope/trainers/hooks/logger/text_logger_hook.py +++ b/modelscope/trainers/hooks/logger/text_logger_hook.py @@ -8,6 +8,7 @@ import json import torch from torch import distributed as dist +from modelscope.utils.constant import LogKeys, ModeKeys from modelscope.utils.torch_utils import get_dist_info from ..builder import HOOKS from .base import LoggerHook @@ -72,44 +73,53 @@ class TextLoggerHook(LoggerHook): return mem_mb.item() def _log_info(self, log_dict, trainer): - if log_dict['mode'] == 'train': - if isinstance(log_dict['lr'], dict): + lr_key = LogKeys.LR + epoch_key = LogKeys.EPOCH + iter_key = LogKeys.ITER + mode_key = LogKeys.MODE + iter_time_key = LogKeys.ITER_TIME + data_load_time_key = LogKeys.DATA_LOAD_TIME + eta_key = LogKeys.ETA + + if log_dict[mode_key] == ModeKeys.TRAIN: + if isinstance(log_dict[lr_key], dict): lr_str = [] - for k, val in log_dict['lr'].items(): - lr_str.append(f'lr_{k}: {val:.3e}') + for k, val in log_dict[lr_key].items(): + lr_str.append(f'{lr_key}_{k}: {val:.3e}') lr_str = ' '.join(lr_str) else: - lr_str = f'lr: {log_dict["lr"]:.3e}' + lr_str = f'{lr_key}: {log_dict[lr_key]:.3e}' if self.by_epoch: - log_str = f'Epoch [{log_dict["epoch"]}][{log_dict["iter"]}/{len(trainer.data_loader)}]\t' + log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{len(trainer.data_loader)}]\t' else: - log_str = f'Iter [{log_dict["iter"]}/{trainer.max_iters}]\t' + log_str = f'{iter_key} [{log_dict[iter_key]}/{trainer.max_iters}]\t' log_str += f'{lr_str}, ' - self._logged_keys.extend(['lr', 'mode', 'iter', 'epoch']) + self._logged_keys.extend([lr_key, mode_key, iter_key, epoch_key]) - if 'time' in log_dict.keys(): - self.time_sec_tot += (log_dict['time'] * self.interval) + if iter_time_key in log_dict.keys(): + self.time_sec_tot += (log_dict[iter_time_key] * self.interval) time_sec_avg = self.time_sec_tot / ( trainer.iter - self.start_iter + 1) eta_sec = time_sec_avg * (trainer.max_iters - trainer.iter - 1) eta_str = str(datetime.timedelta(seconds=int(eta_sec))) - log_str += f'eta: {eta_str}, ' - log_str += f'time: {log_dict["time"]:.3f}, data_load_time: {log_dict["data_load_time"]:.3f}, ' + log_str += f'{eta_key}: {eta_str}, ' + log_str += f'{iter_time_key}: {log_dict[iter_time_key]:.3f}, ' + log_str += f'{data_load_time_key}: {log_dict[data_load_time_key]:.3f}, ' self._logged_keys.extend([ - 'time', - 'data_load_time', + iter_time_key, + data_load_time_key, ]) else: # val/test time # here 1000 is the length of the val dataloader - # by epoch: Epoch[val] [4][1000] - # by iter: Iter[val] [1000] + # by epoch: epoch[val] [4][1000] + # by iter: iter[val] [1000] if self.by_epoch: - log_str = f'Epoch({log_dict["mode"]}) [{log_dict["epoch"]}][{log_dict["iter"]}]\t' + log_str = f'{epoch_key}({log_dict[mode_key]}) [{log_dict[epoch_key]}][{log_dict[iter_key]}]\t' else: - log_str = 
f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' - self._logged_keys.extend(['mode', 'iter', 'epoch']) + log_str = f'{iter_key}({log_dict[mode_key]}) [{log_dict[iter_key]}]\t' + self._logged_keys.extend([mode_key, iter_key, epoch_key]) log_items = [] for name, val in log_dict.items(): @@ -150,7 +160,7 @@ class TextLoggerHook(LoggerHook): # statistic memory if torch.cuda.is_available(): - log_dict['memory'] = self._get_max_memory(trainer) + log_dict[LogKeys.MEMORY] = self._get_max_memory(trainer) log_dict = dict(log_dict, **trainer.log_buffer.output) diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index 08545ffc..c29c96a1 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.trainers.lrscheduler.builder import build_lr_scheduler +from modelscope.utils.constant import LogKeys from .builder import HOOKS from .hook import Hook from .priority import Priority @@ -46,7 +47,7 @@ class LrSchedulerHook(Hook): return lr def before_train_iter(self, trainer): - trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) + trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) def before_train_epoch(self, trainer): if self.by_epoch: @@ -54,7 +55,7 @@ class LrSchedulerHook(Hook): self.warmup_lr_scheduler.step() else: trainer.lr_scheduler.step() - trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) + trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) def _get_log_lr(self, trainer): cur_lr = self.get_current_lr(trainer) diff --git a/modelscope/trainers/hooks/optimizer_hook.py b/modelscope/trainers/hooks/optimizer_hook.py index 28bd9492..32d58f40 100644 --- a/modelscope/trainers/hooks/optimizer_hook.py +++ b/modelscope/trainers/hooks/optimizer_hook.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import logging + from torch.nn.utils import clip_grad from .builder import HOOKS @@ -8,14 +10,28 @@ from .priority import Priority @HOOKS.register_module() class OptimizerHook(Hook): + """Optimizer hook + + Args: + cumulative_iters (int): interval of gradients accumulation. Default: 1 + grad_clip (dict): Default None. Containing keys: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. 
+ More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_` + loss_keys (str | list): keys list of loss + """ PRIORITY = Priority.ABOVE_NORMAL - def __init__(self, grad_clip=None, loss_keys='loss') -> None: + def __init__(self, + cumulative_iters=1, + grad_clip=None, + loss_keys='loss') -> None: if isinstance(loss_keys, str): loss_keys = [loss_keys] assert isinstance(loss_keys, (tuple, list)) self.loss_keys = loss_keys + self.cumulative_iters = cumulative_iters self.grad_clip = grad_clip def clip_grads(self, params, **clip_args): @@ -24,14 +40,163 @@ class OptimizerHook(Hook): if len(params) > 0: return clip_grad.clip_grad_norm_(params, **clip_args) - def after_train_iter(self, trainer): + def before_run(self, trainer): trainer.optimizer.zero_grad() + def after_train_iter(self, trainer): for k in self.loss_keys: + trainer.train_outputs[k] /= self.cumulative_iters trainer.train_outputs[k].backward() - clip_args = self.grad_clip - if clip_args is not None: - self.clip_grads(trainer.model.parameters(), **clip_args) + if self.every_n_iters(trainer, self.cumulative_iters): + if self.grad_clip is not None: + self.clip_grads(trainer.model.parameters(), **self.grad_clip) + + trainer.optimizer.step() + trainer.optimizer.zero_grad() + + +@HOOKS.register_module() +class TorchAMPOptimizerHook(OptimizerHook): + """Fp16 optimizer, if torch version is less than 1.6.0, + you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default + Args: + cumulative_iters (int): interval of gradients accumulation. Default: 1 + grad_clip (dict): Default None. Containing keys: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. + More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_` + loss_keys (str | list): keys list of loss + loss_scale (float | dict): grade scale config. If loss_scale is a float, + static loss scaling will be used with the specified scale. + It can also be a dict containing arguments of GradScalar. For Pytorch >= 1.6, + we use official torch.cuda.amp.GradScaler. + please refer to: https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler for the parameters. 
+ """ + + def __init__(self, + cumulative_iters=1, + grad_clip=None, + loss_keys='loss', + loss_scale={}): + + super(TorchAMPOptimizerHook, self).__init__( + grad_clip=grad_clip, loss_keys=loss_keys) + self.cumulative_iters = cumulative_iters + self._scale_update_param = None + + from torch.cuda import amp + + if isinstance(loss_scale, float): + self._scale_update_param = loss_scale + self.scaler = amp.GradScaler(init_scale=loss_scale) + elif isinstance(loss_scale, dict): + self.scaler = amp.GradScaler(**loss_scale) + else: + raise ValueError( + '`loss_scale` type must be in [float, dict], but got {loss_scale}' + ) + + def before_run(self, trainer): + logging.info('open fp16') + trainer.optimizer.zero_grad() + + if hasattr(trainer.model, 'module'): + self._ori_model_forward = trainer.model.module.forward + self._model = trainer.model.module + else: + self._ori_model_forward = trainer.model.forward + self._model = trainer.model + + self.ori_model_forward = trainer.model.forward + + def before_train_iter(self, trainer): + from torch.cuda import amp + setattr(self._model, 'forward', amp.autocast()(self._model.forward)) + + def after_train_iter(self, trainer): + for k in self.loss_keys: + trainer.train_outputs[k] /= self.cumulative_iters + + for k in self.loss_keys: + self.scaler.scale(trainer.train_outputs[k]).backward() + + if self.every_n_iters(trainer, self.cumulative_iters): + self.scaler.unscale_(trainer.optimizer) + if self.grad_clip is not None: + self.clip_grads(trainer.model.parameters(), **self.grad_clip) + + self.scaler.step(trainer.optimizer) + self.scaler.update(self._scale_update_param) + trainer.optimizer.zero_grad() + + setattr(self._model, 'forward', self._ori_model_forward) + + +@HOOKS.register_module() +class ApexAMPOptimizerHook(OptimizerHook): + """Fp16 optimizer, if torch version is less than 1.6.0, + you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default + Args: + cumulative_iters (int): interval of gradients accumulation. Default: 1 + grad_clip (dict): Default None. Containing keys: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. + More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_` + loss_keys (str | list): keys list of loss + opt_level (str): "O0" and "O3" are not true mixed precision, + but they are useful for establishing accuracy and speed baselines, respectively. + "O1" and "O2" are different implementations of mixed precision. + Try both, and see what gives the best speedup and accuracy for your model. + """ + + def __init__(self, + cumulative_iters=1, + grad_clip=None, + loss_keys='loss', + opt_level='O1'): + + super(ApexAMPOptimizerHook, self).__init__( + grad_clip=grad_clip, loss_keys=loss_keys) + self.cumulative_iters = cumulative_iters + self.opt_level = opt_level + + try: + from apex import amp + except ImportError: + raise ValueError( + 'apex not installed, please install apex from https://www.github.com/nvidia/apex.' 
+ ) + + def before_run(self, trainer): + from apex import amp + + logging.info('open fp16') + # TODO: fix it should initialze amp with model not wrapper by DDP or DP + if hasattr(trainer.model, 'module'): + trainer.model, trainer.optimizer = amp.initialize( + trainer.model.module, + trainer.optimizer, + opt_level=self.opt_level) + else: + trainer.model, trainer.optimizer = amp.initialize( + trainer.model, trainer.optimizer, opt_level=self.opt_level) + + trainer.optimizer.zero_grad() + + def after_train_iter(self, trainer): + for k in self.loss_keys: + trainer.train_outputs[k] /= self.cumulative_iters + + from apex import amp + for k in self.loss_keys: + with amp.scale_loss(trainer.train_outputs[k], + trainer.optimizer) as scaled_loss: + scaled_loss.backward() + + if self.every_n_iters(trainer, self.cumulative_iters): + if self.grad_clip is not None: + self.clip_grads(trainer.model.parameters(), **self.grad_clip) - trainer.optimizer.step() + trainer.optimizer.step() + trainer.optimizer.zero_grad() diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 6be0be29..6249c82d 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -26,14 +26,16 @@ from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.optimizer.builder import build_optimizer -from modelscope.utils.config import ConfigDict -from modelscope.utils.constant import Hubs, ModelFile, Tasks +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import (Hubs, ModeKeys, ModelFile, Tasks, + TrainerStages) from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.tensor_utils import torch_default_data_collator from modelscope.utils.torch_utils import get_dist_info from .base import BaseTrainer from .builder import TRAINERS +from .default_config import DEFAULT_CONFIG from .hooks.hook import Hook @@ -97,6 +99,10 @@ class EpochBasedTrainer(BaseTrainer): self.model = model super().__init__(cfg_file, arg_parse_fn) + + # add default config + self.cfg.merge_from_dict(self._get_default_config(), force=False) + if 'work_dir' in kwargs: self.work_dir = kwargs['work_dir'] else: @@ -112,14 +118,14 @@ class EpochBasedTrainer(BaseTrainer): self.device = int( os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None self.train_dataset = self.to_task_dataset( - train_dataset, mode='train', preprocessor=self.preprocessor) + train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) self.eval_dataset = self.to_task_dataset( - eval_dataset, mode='eval', preprocessor=self.preprocessor) + eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) self.data_collator = data_collator if data_collator is not None else torch_default_data_collator self.metrics = self.get_metrics() self.optimizers = optimizers self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) - self._mode = 'train' + self._mode = ModeKeys.TRAIN self._hooks: List[Hook] = [] self._epoch = 0 self._iter = 0 @@ -132,6 +138,8 @@ class EpochBasedTrainer(BaseTrainer): else: self._max_epochs = kwargs['max_epochs'] + self.use_fp16 = kwargs.get('use_fp16', False) + # TODO @wenmeng.zwm add seed init fn self._seed = 0 @@ -245,7 +253,7 @@ class EpochBasedTrainer(BaseTrainer): def train(self, *args, **kwargs): self.model.train() - self._mode = 'train' + self._mode 
= ModeKeys.TRAIN if self.train_dataset is None: self.train_dataloader = self.get_train_dataloader() @@ -261,7 +269,7 @@ class EpochBasedTrainer(BaseTrainer): def evaluate(self, checkpoint_path=None): self.model.eval() - self._mode = 'val' + self._mode = ModeKeys.EVAL if self.eval_dataset is None: self.eval_dataloader = self.get_eval_data_loader() @@ -329,7 +337,7 @@ class EpochBasedTrainer(BaseTrainer): # EvaluationHook will do evaluate and change mode to val, return to train mode # TODO: find more pretty way to change mode model.train() - self._mode = 'train' + self._mode = ModeKeys.TRAIN inputs = self.collate_fn(inputs) if isinstance(inputs, dict): train_outputs = model.forward(**inputs) @@ -394,7 +402,8 @@ class EpochBasedTrainer(BaseTrainer): """ train_data = self.cfg.dataset.train if self.train_dataset is None: - self.train_dataset = self.build_dataset(train_data, mode='train') + self.train_dataset = self.build_dataset( + train_data, mode=ModeKeys.TRAIN) data_loader = self._build_dataloader_with_dataset( self.train_dataset, **self.cfg.train.get('dataloader', {})) @@ -409,7 +418,8 @@ class EpochBasedTrainer(BaseTrainer): """ val_data = self.cfg.dataset.val if self.eval_dataset is None: - self.eval_dataset = self.build_dataset(val_data, mode='eval') + self.eval_dataset = self.build_dataset( + val_data, mode=ModeKeys.TRAIN) batch_size = self.cfg.evaluation.batch_size workers = self.cfg.evaluation.workers @@ -492,7 +502,10 @@ class EpochBasedTrainer(BaseTrainer): _, _, optim_options, lr_options = self.create_optimizer_and_scheduler() lr_hook = dict(type='LrSchedulerHook', **lr_options) - optim_hook = dict(type='OptimizerHook', **optim_options) + if self.use_fp16: + optim_hook = dict(type='TorchAMPOptimizerHook', **optim_options) + else: + optim_hook = dict(type='OptimizerHook', **optim_options) self.register_hook_from_cfg([lr_hook, optim_hook]) @@ -578,26 +591,26 @@ class EpochBasedTrainer(BaseTrainer): def train_loop(self, data_loader): """ Training loop used by `EpochBasedTrainer.train()` """ - self.invoke_hook('before_run') + self.invoke_hook(TrainerStages.before_run) self._epoch = 0 kwargs = {} for _ in range(self._epoch, self._max_epochs): - self.invoke_hook('before_train_epoch') + self.invoke_hook(TrainerStages.before_train_epoch) time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): self.data_batch = data_batch self._inner_iter = i - self.invoke_hook('before_train_iter') + self.invoke_hook(TrainerStages.before_train_iter) self.train_step(self.model, data_batch, **kwargs) - self.invoke_hook('after_train_iter') + self.invoke_hook(TrainerStages.after_train_iter) del self.data_batch self._iter += 1 - self.invoke_hook('after_train_epoch') + self.invoke_hook(TrainerStages.after_train_epoch) self._epoch += 1 time.sleep(1) # wait for some hooks like loggers to finish - self.invoke_hook('after_run') + self.invoke_hook(TrainerStages.after_run) def evaluation_loop(self, data_loader, checkpoint_path, metric_classes): """ Evaluation loop used by `EpochBasedTrainer.evaluate()`. 
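
As context for the trainer changes above, here is a minimal usage sketch (not part of the patch, modeled on the TorchAMPOptimizerHookTest added later in this diff) of how the new `use_fp16` kwarg makes `register_optimizers_hook` register `TorchAMPOptimizerHook` instead of the plain `OptimizerHook`. The toy model, dataset and configuration path are stand-ins, not part of the library API:

```python
# Hedged sketch: enabling the new AMP path via EpochBasedTrainer.
# DummyModel / DummyDataset mirror the toy classes used by the tests in this
# diff; 'configuration.json' is a hypothetical user config containing the
# usual `task` and `train` sections. Hooks from DEFAULT_CONFIG
# (CheckpointHook, TextLoggerHook, IterTimerHook) are merged with
# force=False, so hooks listed in the user config take precedence.
import torch
from torch import nn
from torch.utils.data import Dataset

from modelscope.trainers import build_trainer


class DummyModel(nn.Module):  # same toy model as the tests below
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(5, 4)

    def forward(self, feat, labels):
        x = self.linear(feat)
        return dict(logits=x, loss=torch.sum(x))


class DummyDataset(Dataset):  # same toy dataset as the tests below
    def __len__(self):
        return 20

    def __getitem__(self, idx):
        return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))


kwargs = dict(
    cfg_file='configuration.json',  # hypothetical config path
    model=DummyModel().cuda(),      # AMP expects the model on GPU
    train_dataset=DummyDataset(),
    max_epochs=2,
    use_fp16=True)                  # register_optimizers_hook -> TorchAMPOptimizerHook

trainer = build_trainer('EpochBasedTrainer', kwargs)
trainer.train()
```
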
@@ -693,6 +706,9 @@ class EpochBasedTrainer(BaseTrainer): stage_hook_infos.append(info) return '\n'.join(stage_hook_infos) + def _get_default_config(self): + return DEFAULT_CONFIG + def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index 4b2096c5..4a455b5e 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -20,9 +20,9 @@ def single_gpu_test(model, """Test model with a single gpu. Args: - data_collate_fn: An optional data_collate_fn before fed into the model model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. + data_collate_fn: An optional data_collate_fn before fed into the model metric_classes(List): List of Metric class that uses to collect metrics Returns: @@ -62,10 +62,10 @@ def multi_gpu_test(model, Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. - data_collate_fn: An optional data_collate_fn before fed into the model tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. + data_collate_fn: An optional data_collate_fn before fed into the model metric_classes(List): List of Metric class that uses to collect metrics Returns: diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index 79307f17..e6da6d0b 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -1,6 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import ast import copy import os import os.path as osp @@ -9,24 +8,15 @@ import shutil import sys import tempfile import types -import uuid -from importlib import import_module from pathlib import Path from typing import Dict import addict from yapf.yapflib.yapf_api import FormatCode -from modelscope.utils.import_utils import (import_modules, - import_modules_from_file, - validate_py_syntax) +from modelscope.utils.import_utils import import_modules_from_file from modelscope.utils.logger import get_logger -if platform.system() == 'Windows': - import regex as re # type: ignore -else: - import re # type: ignore - logger = get_logger() BASE_KEY = '_base_' @@ -380,8 +370,8 @@ class Config: file_format = file.split('.')[-1] return dump(cfg_dict, file=file, file_format=file_format) - def merge_from_dict(self, options, allow_list_keys=True): - """Merge list into cfg_dict. + def merge_from_dict(self, options, allow_list_keys=True, force=True): + """Merge dict into cfg_dict. Merge the dict parsed by MultipleKVAction into this cfg. @@ -392,9 +382,9 @@ class Config: >>> cfg.merge_from_dict(options) >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') >>> assert cfg_dict == dict( - ... model=dict(backbone=dict(depth=50, with_cp=True))) + ... model=dict(backbone=dict(type='ResNet', depth=50, with_cp=True))) - >>> # Merge list element + >>> # Merge list element for replace target index >>> cfg = Config(dict(pipeline=[ ... dict(type='Resize'), dict(type='RandomDistortion')])) >>> options = dict(pipeline={'0': dict(type='MyResize')}) @@ -403,12 +393,38 @@ class Config: >>> assert cfg_dict == dict(pipeline=[ ... 
dict(type='MyResize'), dict(type='RandomDistortion')]) + >>> # Merge list element for replace args and add to list, only support list of type dict with key ``type``, + >>> # if you add new list element, the list does not guarantee the order, + >>> # it is only suitable for the case where the order of the list is not concerned. + >>> cfg = Config(dict(pipeline=[ + ... dict(type='Resize', size=224), dict(type='RandomDistortion')])) + >>> options = dict(pipeline=[dict(type='Resize', size=256), dict(type='RandomFlip')]) + >>> cfg.merge_from_dict(options, allow_list_keys=True) + >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + >>> assert cfg_dict == dict(pipeline=[ + ... dict(type='Resize', size=256), dict(type='RandomDistortion'), dict(type='RandomFlip')]) + + >>> # force usage + >>> options = {'model.backbone.depth': 18, + ... 'model.backbone.with_cp':True} + >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet', depth=50)))) + >>> cfg.merge_from_dict(options, force=False) + >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + >>> assert cfg_dict == dict( + ... model=dict(backbone=dict(type='ResNet', depth=50, with_cp=True))) + Args: options (dict): dict of configs to merge from. allow_list_keys (bool): If True, int string keys (e.g. '0', '1') are allowed in ``options`` and will replace the element of the corresponding index in the config if the config is a list. + Or you can directly replace args for list or add new list element, + only support list of type dict with key ``type``, + but if you add new list element, the list does not guarantee the order, + It is only suitable for the case where the order of the list is not concerned. Default: True. + force (bool): If True, existing key-value will be replaced by new given. + If False, existing key-value will not be updated. """ option_cfg_dict = {} for full_key, v in options.items(): @@ -424,7 +440,122 @@ class Config: super(Config, self).__setattr__( '_cfg_dict', Config._merge_a_into_b( - option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys)) + option_cfg_dict, + cfg_dict, + allow_list_keys=allow_list_keys, + force=force)) + + @staticmethod + def _merge_a_into_b(a, b, allow_list_keys=False, force=True): + """merge dict ``a`` into dict ``b`` (non-inplace). + + Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid + in-place modifications. + + Args: + a (dict): The source dict to be merged into ``b``. + b (dict): The origin dict to be fetch keys from ``a``. + allow_list_keys (bool): If True, int string keys (e.g. '0', '1') + are allowed in source ``a`` and will replace the element of the + corresponding index in b if b is a list. Default: False. + force (bool): If True, existing key-value will be replaced by new given. + If False, existing key-value will not be updated. + + Returns: + dict: The modified dict of ``b`` using ``a``. + + Examples: + # Normally merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # Delete b first and merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # b is a list + >>> Config._merge_a_into_b( + ... 
{'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) + [{'a': 2}, {'b': 2}] + + # value of a and b are both list, only support list of type dict with key ``type``, + # You can directly replace args for list or add new list element, + # but if you add new list element, the list does not guarantee the order, + # it is only suitable for the case where the order of the list is not concerned. + >>> Config._merge_a_into_b( + ... {'k': [dict(a=2), dict(c=3)]}, {'k': [dict(a=1), dict(b=2)]}, True) + {'k': [dict(a=2), dict(b=2), dict(c=3)]} + + # force is False + >>> Config._merge_a_into_b( + ... dict(obj=dict(a=2, b=2)), dict(obj=dict(a=1))), True, force=False) + {'obj': {'a': 1, b=2}} + """ + b = b.copy() + for k, v in a.items(): + if allow_list_keys and k.isdigit() and isinstance(b, list): + k = int(k) + if len(b) <= k: + raise KeyError(f'Index {k} exceeds the length of list {b}') + b[k] = Config._merge_a_into_b( + v, b[k], allow_list_keys, force=force) + elif allow_list_keys and isinstance(v, list) and k in b: + if not isinstance(b[k], list): + raise ValueError( + f'type mismatch {type(v)} and {type(b[k])} between a and b for key {k}' + ) + _is_dict_with_type = True + for list_i in b[k] + v: + if not isinstance(list_i, dict) or 'type' not in list_i: + if k not in b or force: + b[k] = v + _is_dict_with_type = False + if _is_dict_with_type: + res_list = [] + added_index_bk, added_index_v = [], [] + for i, b_li in enumerate(b[k]): + for j, a_lj in enumerate(v): + if a_lj['type'] == b_li['type']: + res_list.append( + Config._merge_a_into_b( + a_lj, + b_li, + allow_list_keys, + force=force)) + added_index_v.append(j) + added_index_bk.append(i) + break + rest_bk = [ + b[k][i] for i in range(len(b[k])) + if i not in added_index_bk + ] + rest_v = [ + v[i] for i in range(len(v)) if i not in added_index_v + ] + rest = rest_bk + rest_v + res_list += [ + Config._merge_a_into_b( + rest[i], {}, allow_list_keys, force=force) + for i in range(len(rest)) + ] + b[k] = res_list + elif isinstance(v, + dict) and k in b and not v.pop(DELETE_KEY, False): + allowed_types = (dict, list) if allow_list_keys else dict + if not isinstance(b[k], allowed_types): + raise TypeError( + f'{k}={v} in child config cannot inherit from base ' + f'because {k} is a dict in the child config but is of ' + f'type {type(b[k])} in base config. 
You may set ' + f'`{DELETE_KEY}=True` to ignore the base config') + b[k] = Config._merge_a_into_b( + v, b[k], allow_list_keys, force=force) + else: + if k not in b or force: + b[k] = v + return b def to_dict(self) -> Dict: """ Convert Config object to python dict diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 0b8da090..d6afb35a 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -163,3 +163,33 @@ PYTORCH = 'pytorch' DEFAULT_MODEL_REVISION = 'master' DEFAULT_DATASET_REVISION = 'master' + + +class ModeKeys: + TRAIN = 'train' + EVAL = 'eval' + + +class LogKeys: + ITER = 'iter' + ITER_TIME = 'iter_time' + EPOCH = 'epoch' + LR = 'lr' # learning rate + MODE = 'mode' + DATA_LOAD_TIME = 'data_load_time' + ETA = 'eta' # estimated time of arrival + MEMORY = 'memory' + LOSS = 'loss' + + +class TrainerStages: + before_run = 'before_run' + before_train_epoch = 'before_train_epoch' + before_train_iter = 'before_train_iter' + after_train_iter = 'after_train_iter' + after_train_epoch = 'after_train_epoch' + before_val_epoch = 'before_val_epoch' + before_val_iter = 'before_val_iter' + after_val_iter = 'after_val_iter' + after_val_epoch = 'after_val_epoch' + after_run = 'after_run' diff --git a/tests/trainers/hooks/logger/__init__.py b/tests/trainers/hooks/logger/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/hooks/logger/test_tensorboard_hook.py b/tests/trainers/hooks/logger/test_tensorboard_hook.py new file mode 100644 index 00000000..1d3c0e76 --- /dev/null +++ b/tests/trainers/hooks/logger/test_tensorboard_hook.py @@ -0,0 +1,112 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest +from abc import ABCMeta + +import json +import torch +from torch import nn +from torch.utils.data import Dataset + +from modelscope.trainers import build_trainer +from modelscope.utils.constant import LogKeys, ModelFile + + +class DummyDataset(Dataset, metaclass=ABCMeta): + + def __len__(self): + return 20 + + def __getitem__(self, idx): + return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 4) + self.bn = nn.BatchNorm1d(4) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class TensorboardHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_tensorboard_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'SGD', + 'lr': 0.01 + }, + 'lr_scheduler': { + 'type': 'StepLR', + 'step_size': 2, + }, + 'hooks': [{ + 'type': 'TensorboardHook', + 'interval': 2 + }] + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=DummyModel(), + data_collator=None, + train_dataset=DummyDataset(), + max_epochs=2) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() 
+ tb_out_dir = os.path.join(self.tmp_dir, 'tensorboard_output') + + events_files = glob.glob( + os.path.join(tb_out_dir, 'events.out.tfevents.*')) + self.assertEqual(len(events_files), 1) + + from tensorboard.backend.event_processing.event_accumulator import EventAccumulator + ea = EventAccumulator(events_files[0]) + ea.Reload() + self.assertEqual(len(ea.Scalars(LogKeys.LOSS)), 10) + self.assertEqual(len(ea.Scalars(LogKeys.LR)), 10) + for i in range(5): + self.assertAlmostEqual( + ea.Scalars(LogKeys.LR)[i].value, 0.01, delta=0.001) + for i in range(5, 10): + self.assertAlmostEqual( + ea.Scalars(LogKeys.LR)[i].value, 0.001, delta=0.0001) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py index 4e839f0c..afb68869 100644 --- a/tests/trainers/hooks/test_checkpoint_hook.py +++ b/tests/trainers/hooks/test_checkpoint_hook.py @@ -11,7 +11,7 @@ from torch import nn from torch.utils.data import Dataset from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import LogKeys, ModelFile class DummyDataset(Dataset, metaclass=ABCMeta): @@ -100,8 +100,8 @@ class CheckpointHookTest(unittest.TestCase): trainer = build_trainer(trainer_name, kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) - self.assertIn('epoch_1.pth', results_files) - self.assertIn('epoch_2.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) if __name__ == '__main__': diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py new file mode 100644 index 00000000..4d13b2e0 --- /dev/null +++ b/tests/trainers/hooks/test_evaluation_hook.py @@ -0,0 +1,195 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest +from abc import ABCMeta + +import json +import torch +from torch import nn +from torch.utils.data import Dataset + +from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.trainers import build_trainer +from modelscope.utils.constant import LogKeys, ModelFile +from modelscope.utils.registry import default_group + +_global_iter = 0 + + +@METRICS.register_module(group_key=default_group, module_name='DummyMetric') +class DummyMetric: + + _fake_acc_by_epoch = {1: 0.1, 2: 0.5, 3: 0.2} + + def add(*args, **kwargs): + pass + + def evaluate(self): + global _global_iter + _global_iter += 1 + return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]} + + +class DummyDataset(Dataset, metaclass=ABCMeta): + + def __len__(self): + return 20 + + def __getitem__(self, idx): + return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 4) + self.bn = nn.BatchNorm1d(4) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class EvaluationHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_best_ckpt_rule_max(self): + global _global_iter + _global_iter = 0 + + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'SGD', + 'lr': 0.01, + }, + 'lr_scheduler': { + 'type': 'StepLR', + 'step_size': 2, + }, + 'hooks': [{ + 'type': 'EvaluationHook', + 'interval': 1, + 'save_best_ckpt': True, + 'monitor_key': MetricKeys.ACCURACY + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': False + }, + 'metrics': ['DummyMetric'] + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=DummyModel(), + data_collator=None, + train_dataset=DummyDataset(), + eval_dataset=DummyDataset(), + max_epochs=3) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) + self.assertIn(f'best_{LogKeys.EPOCH}2_{MetricKeys.ACCURACY}0.5.pth', + results_files) + + def test_best_ckpt_rule_min(self): + global _global_iter + _global_iter = 0 + + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'SGD', + 'lr': 0.01, + }, + 'lr_scheduler': { + 'type': 'StepLR', + 'step_size': 2, + }, + 'hooks': [{ + 'type': 'EvaluationHook', + 'interval': 1, + 'save_best_ckpt': True, + 'monitor_key': 'accuracy', + 'rule': 'min', + 'out_dir': os.path.join(self.tmp_dir, 'best_ckpt') + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': 
False + }, + 'metrics': ['DummyMetric'] + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=DummyModel(), + data_collator=None, + train_dataset=DummyDataset(), + eval_dataset=DummyDataset(), + max_epochs=3) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) + self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth', + os.listdir(os.path.join(self.tmp_dir, 'best_ckpt'))) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index 27ee000f..575edfd7 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -13,7 +13,7 @@ from torch.optim.lr_scheduler import MultiStepLR from torch.utils.data import Dataset from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages class DummyDataset(Dataset, metaclass=ABCMeta): @@ -66,7 +66,7 @@ class LrSchedulerHookTest(unittest.TestCase): } } - config_path = os.path.join(self.tmp_dir, 'config.json') + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) with open(config_path, 'w') as f: json.dump(json_cfg, f) @@ -86,23 +86,23 @@ class LrSchedulerHookTest(unittest.TestCase): trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) trainer.register_optimizers_hook() - trainer.invoke_hook('before_run') + trainer.invoke_hook(TrainerStages.before_run) log_lrs = [] optim_lrs = [] for _ in range(trainer._epoch, trainer._max_epochs): - trainer.invoke_hook('before_train_epoch') + trainer.invoke_hook(TrainerStages.before_train_epoch) for _, data_batch in enumerate(train_dataloader): - trainer.invoke_hook('before_train_iter') + trainer.invoke_hook(TrainerStages.before_train_iter) - log_lrs.append(trainer.log_buffer.output['lr']) + log_lrs.append(trainer.log_buffer.output[LogKeys.LR]) optim_lrs.append(optimizer.param_groups[0]['lr']) trainer.train_step(trainer.model, data_batch) - trainer.invoke_hook('after_train_iter') + trainer.invoke_hook(TrainerStages.after_train_iter) - trainer.invoke_hook('after_train_epoch') + trainer.invoke_hook(TrainerStages.after_train_epoch) trainer._epoch += 1 - trainer.invoke_hook('after_run') + trainer.invoke_hook(TrainerStages.after_run) iters = 5 target_lrs = [0.01] * iters * 1 + [0.001] * iters * 2 + [0.0001 @@ -157,23 +157,23 @@ class LrSchedulerHookTest(unittest.TestCase): trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) trainer.register_optimizers_hook() - trainer.invoke_hook('before_run') + trainer.invoke_hook(TrainerStages.before_run) log_lrs = [] optim_lrs = [] for _ in range(trainer._epoch, trainer._max_epochs): - trainer.invoke_hook('before_train_epoch') + trainer.invoke_hook(TrainerStages.before_train_epoch) for _, data_batch in enumerate(train_dataloader): - trainer.invoke_hook('before_train_iter') + trainer.invoke_hook(TrainerStages.before_train_iter) - log_lrs.append(round(trainer.log_buffer.output['lr'], 5)) + log_lrs.append(round(trainer.log_buffer.output[LogKeys.LR], 5)) optim_lrs.append( 
round(trainer.optimizer.param_groups[0]['lr'], 5)) trainer.train_step(trainer.model, data_batch) - trainer.invoke_hook('after_train_iter') + trainer.invoke_hook(TrainerStages.after_train_iter) - trainer.invoke_hook('after_train_epoch') - trainer.invoke_hook('after_run') + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer.invoke_hook(TrainerStages.after_run) iters = 5 target_lrs = [0.004] * iters * 1 + [0.007] * iters * 1 + [ diff --git a/tests/trainers/hooks/test_optimizer_hook.py b/tests/trainers/hooks/test_optimizer_hook.py new file mode 100644 index 00000000..98dbfef5 --- /dev/null +++ b/tests/trainers/hooks/test_optimizer_hook.py @@ -0,0 +1,184 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +import tempfile +import unittest +from abc import ABCMeta + +import json +import torch +from torch import nn +from torch.optim import SGD +from torch.optim.lr_scheduler import MultiStepLR +from torch.utils.data import Dataset + +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile, TrainerStages + + +class DummyDataset(Dataset, metaclass=ABCMeta): + """Base Dataset + """ + + def __len__(self): + return 10 + + def __getitem__(self, idx): + return dict(feat=torch.rand((2, 2)), label=torch.randint(0, 2, (1, ))) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + self.bn = nn.BatchNorm1d(2) + + def forward(self, feat, labels): + x = self.linear(feat) + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class OptimizerHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_optimizer_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + } + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = MultiStepLR(optimizer, milestones=[1, 2]) + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=DummyDataset(), + optimizers=(optimizer, lr_scheduler), + max_epochs=2) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + + trainer.invoke_hook(TrainerStages.before_run) + + for _ in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook(TrainerStages.before_train_epoch) + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook(TrainerStages.before_train_iter) + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook(TrainerStages.after_train_iter) + + self.assertEqual( + len(trainer.optimizer.param_groups[0]['params']), 4) + for i in range(4): + self.assertTrue(trainer.optimizer.param_groups[0]['params'] + [i].requires_grad) + + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer._epoch += 1 + trainer.invoke_hook(TrainerStages.after_run) + + +class TorchAMPOptimizerHookTest(unittest.TestCase): + + def setUp(self): + 
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_amp_optimizer_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + } + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel().cuda() + optimizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = MultiStepLR(optimizer, milestones=[1, 2]) + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=DummyDataset(), + optimizers=(optimizer, lr_scheduler), + max_epochs=2, + use_fp16=True) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + + trainer.invoke_hook(TrainerStages.before_run) + + for _ in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook(TrainerStages.before_train_epoch) + for _, data_batch in enumerate(train_dataloader): + for k, v in data_batch.items(): + data_batch[k] = v.cuda() + trainer.invoke_hook(TrainerStages.before_train_iter) + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook(TrainerStages.after_train_iter) + + self.assertEqual(trainer.train_outputs['logits'].dtype, + torch.float16) + + # test if `after_train_iter`, whether the model is reset to fp32 + trainer.train_step(trainer.model, data_batch) + self.assertEqual(trainer.train_outputs['logits'].dtype, + torch.float32) + + self.assertEqual( + len(trainer.optimizer.param_groups[0]['params']), 4) + for i in range(4): + self.assertTrue(trainer.optimizer.param_groups[0]['params'] + [i].requires_grad) + + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer._epoch += 1 + trainer.invoke_hook(TrainerStages.after_run) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py index b8864aef..5fafbfbb 100644 --- a/tests/trainers/hooks/test_timer_hook.py +++ b/tests/trainers/hooks/test_timer_hook.py @@ -13,7 +13,7 @@ from torch.optim.lr_scheduler import MultiStepLR from torch.utils.data import Dataset from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages class DummyDataset(Dataset, metaclass=ABCMeta): @@ -89,39 +89,43 @@ class IterTimerHookTest(unittest.TestCase): trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) trainer.register_optimizers_hook() trainer.register_hook_from_cfg(trainer.cfg.train.hooks) - - trainer.invoke_hook('before_run') + trainer.data_loader = train_dataloader + trainer.invoke_hook(TrainerStages.before_run) for i in range(trainer._epoch, trainer._max_epochs): - trainer.invoke_hook('before_train_epoch') + trainer.invoke_hook(TrainerStages.before_train_epoch) for _, data_batch in enumerate(train_dataloader): - trainer.invoke_hook('before_train_iter') + trainer.invoke_hook(TrainerStages.before_train_iter) trainer.train_step(trainer.model, data_batch) - trainer.invoke_hook('after_train_iter') + trainer.invoke_hook(TrainerStages.after_train_iter) - 
-            self.assertIn('data_load_time', trainer.log_buffer.val_history)
-            self.assertIn('time', trainer.log_buffer.val_history)
-            self.assertIn('loss', trainer.log_buffer.val_history)
+            self.assertIn(LogKeys.DATA_LOAD_TIME,
+                          trainer.log_buffer.val_history)
+            self.assertIn(LogKeys.ITER_TIME,
+                          trainer.log_buffer.val_history)
+            self.assertIn(LogKeys.LOSS, trainer.log_buffer.val_history)
 
-            trainer.invoke_hook('after_train_epoch')
+            trainer.invoke_hook(TrainerStages.after_train_epoch)
 
-            target_len = 5 * (i + 1)
+            target_len = 5
             self.assertEqual(
-                len(trainer.log_buffer.val_history['data_load_time']),
+                len(trainer.log_buffer.val_history[LogKeys.DATA_LOAD_TIME]),
                 target_len)
             self.assertEqual(
-                len(trainer.log_buffer.val_history['time']), target_len)
+                len(trainer.log_buffer.val_history[LogKeys.ITER_TIME]),
+                target_len)
             self.assertEqual(
-                len(trainer.log_buffer.val_history['loss']), target_len)
+                len(trainer.log_buffer.val_history[LogKeys.LOSS]), target_len)
             self.assertEqual(
-                len(trainer.log_buffer.n_history['data_load_time']),
+                len(trainer.log_buffer.n_history[LogKeys.DATA_LOAD_TIME]),
                 target_len)
             self.assertEqual(
-                len(trainer.log_buffer.n_history['time']), target_len)
+                len(trainer.log_buffer.n_history[LogKeys.ITER_TIME]),
+                target_len)
             self.assertEqual(
-                len(trainer.log_buffer.n_history['loss']), target_len)
+                len(trainer.log_buffer.n_history[LogKeys.LOSS]), target_len)
 
-        trainer.invoke_hook('after_run')
+        trainer.invoke_hook(TrainerStages.after_run)
 
 
 if __name__ == '__main__':
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index 22874262..a949c6ec 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -12,17 +12,12 @@ from torch.optim import SGD
 from torch.optim.lr_scheduler import StepLR
 from torch.utils.data import Dataset
 
+from modelscope.metrics.builder import MetricKeys
 from modelscope.trainers import build_trainer
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
 from modelscope.utils.test_utils import test_level
 
 
-class DummyMetric:
-
-    def __call__(self, ground_truth, predict_results):
-        return {'accuracy': 0.5}
-
-
 class DummyDataset(Dataset, metaclass=ABCMeta):
     """Base Dataset
     """
@@ -130,9 +125,9 @@ class TrainerTest(unittest.TestCase):
 
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        self.assertIn('epoch_1.pth', results_files)
-        self.assertIn('epoch_2.pth', results_files)
-        self.assertIn('epoch_3.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_train_1(self):
@@ -167,7 +162,7 @@ class TrainerTest(unittest.TestCase):
             }
         }
 
-        config_path = os.path.join(self.tmp_dir, 'config.json')
+        config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
         with open(config_path, 'w') as f:
             json.dump(json_cfg, f)
 
@@ -189,9 +184,133 @@ class TrainerTest(unittest.TestCase):
 
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        self.assertIn('epoch_1.pth', results_files)
-        self.assertIn('epoch_2.pth', results_files)
-        self.assertIn('epoch_3.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_train_with_default_config(self):
+        json_cfg = {
+            'train': {
+                'work_dir': self.tmp_dir,
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1
+                },
+                'hooks': [{
+                    'type': 'EvaluationHook',
+                    'interval': 1
+                }]
+            },
+            'evaluation': {
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1,
+                    'shuffle': False
+                },
+                'metrics': ['seq_cls_metric']
+            }
+        }
+
+        class _DummyDataset(DummyDataset):
+            """Base Dataset
+            """
+
+            def __len__(self):
+                return 40
+
+        config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
+        with open(config_path, 'w') as f:
+            json.dump(json_cfg, f)
+
+        model = DummyModel()
+        optimizer = SGD(model.parameters(), lr=0.01)
+        lr_scheduler = StepLR(optimizer, 2)
+        trainer_name = 'EpochBasedTrainer'
+        kwargs = dict(
+            cfg_file=config_path,
+            model=model,
+            data_collator=None,
+            train_dataset=_DummyDataset(),
+            eval_dataset=DummyDataset(),
+            optimizers=(optimizer, lr_scheduler),
+            max_epochs=3)
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+
+        json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json')
+        with open(json_file, 'r') as f:
+            lines = [i.strip() for i in f.readlines()]
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 10,
+                LogKeys.LR: 0.01
+            }, json.loads(lines[0]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 20,
+                LogKeys.LR: 0.01
+            }, json.loads(lines[1]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 20
+            }, json.loads(lines[2]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 10,
+                LogKeys.LR: 0.001
+            }, json.loads(lines[3]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 20,
+                LogKeys.LR: 0.001
+            }, json.loads(lines[4]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 20
+            }, json.loads(lines[5]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 3,
+                LogKeys.ITER: 10,
+                LogKeys.LR: 0.001
+            }, json.loads(lines[6]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 3,
+                LogKeys.ITER: 20,
+                LogKeys.LR: 0.001
+            }, json.loads(lines[7]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 3,
+                LogKeys.ITER: 20
+            }, json.loads(lines[8]))
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        for i in [0, 1, 3, 4, 6, 7]:
+            self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i])
+            self.assertIn(LogKeys.ITER_TIME, lines[i])
+        for i in [2, 5, 8]:
+            self.assertIn(MetricKeys.ACCURACY, lines[i])
 
 
 class DummyTrainerTest(unittest.TestCase):
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index a3770f0d..77bca8d5 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import argparse
+import copy
 import tempfile
 import unittest
 
@@ -77,6 +78,148 @@ class ConfigTest(unittest.TestCase):
         self.assertEqual(args.optimizer, 'Adam')
         self.assertEqual(args.save_checkpoint_epochs, 20)
 
+    def test_merge_from_dict(self):
+        base_cfg = copy.deepcopy(obj)
+        base_cfg.update({'dict_list': [dict(l1=1), dict(l2=2)]})
+
+        cfg = Config(base_cfg)
+
+        merge_dict = {
+            'a': 2,
+            'b.d': 'ee',
+            'b.c': [3, 3, 3],
+            'dict_list': {
+                '0': dict(l1=3)
+            },
+            'c': 'test'
+        }
+
+        cfg1 = copy.deepcopy(cfg)
+        cfg1.merge_from_dict(merge_dict)
+        self.assertDictEqual(
+            cfg1._cfg_dict, {
+                'a': 2,
+                'b': {
+                    'c': [3, 3, 3],
+                    'd': 'ee'
+                },
+                'dict_list': [dict(l1=3), dict(l2=2)],
+                'c': 'test'
+            })
+
+        cfg2 = copy.deepcopy(cfg)
+        cfg2.merge_from_dict(merge_dict, force=False)
+        self.assertDictEqual(
+            cfg2._cfg_dict, {
+                'a': 1,
+                'b': {
+                    'c': [1, 2, 3],
+                    'd': 'dd'
+                },
+                'dict_list': [dict(l1=1), dict(l2=2)],
+                'c': 'test'
+            })
+
+    def test_merge_from_dict_with_list(self):
+        base_cfg = {
+            'a':
+            1,
+            'b': {
+                'c': [1, 2, 3],
+                'd': 'dd'
+            },
+            'dict_list': [dict(type='l1', v=1),
+                          dict(type='l2', v=2)],
+            'dict_list2': [
+                dict(
+                    type='l1',
+                    v=[dict(type='l1_1', v=1),
+                       dict(type='l1_2', v=2)]),
+                dict(type='l2', v=2)
+            ]
+        }
+        cfg = Config(base_cfg)
+
+        merge_dict_for_list = {
+            'a':
+            2,
+            'b.c': [3, 3, 3],
+            'b.d':
+            'ee',
+            'dict_list': [dict(type='l1', v=8),
+                          dict(type='l3', v=8)],
+            'dict_list2': [
+                dict(
+                    type='l1',
+                    v=[
+                        dict(type='l1_1', v=8),
+                        dict(type='l1_2', v=2),
+                        dict(type='l1_3', v=8),
+                    ]),
+                dict(type='l2', v=8)
+            ],
+            'c':
+            'test'
+        }
+
+        cfg1 = copy.deepcopy(cfg)
+        cfg1.merge_from_dict(merge_dict_for_list, force=False)
+        self.assertDictEqual(
+            cfg1._cfg_dict, {
+                'a':
+                1,
+                'b': {
+                    'c': [1, 2, 3],
+                    'd': 'dd'
+                },
+                'dict_list': [
+                    dict(type='l1', v=1),
+                    dict(type='l2', v=2),
+                    dict(type='l3', v=8)
+                ],
+                'dict_list2': [
+                    dict(
+                        type='l1',
+                        v=[
+                            dict(type='l1_1', v=1),
+                            dict(type='l1_2', v=2),
+                            dict(type='l1_3', v=8),
+                        ]),
+                    dict(type='l2', v=2)
+                ],
+                'c':
+                'test'
+            })
+
+        cfg2 = copy.deepcopy(cfg)
+        cfg2.merge_from_dict(merge_dict_for_list, force=True)
+        self.assertDictEqual(
+            cfg2._cfg_dict, {
+                'a':
+                2,
+                'b': {
+                    'c': [3, 3, 3],
+                    'd': 'ee'
+                },
+                'dict_list': [
+                    dict(type='l1', v=8),
+                    dict(type='l2', v=2),
+                    dict(type='l3', v=8)
+                ],
+                'dict_list2': [
+                    dict(
+                        type='l1',
+                        v=[
+                            dict(type='l1_1', v=8),
+                            dict(type='l1_2', v=2),
+                            dict(type='l1_3', v=8),
+                        ]),
+                    dict(type='l2', v=8)
+                ],
+                'c':
+                'test'
+            })
+
 
 if __name__ == '__main__':
     unittest.main()