diff --git a/modelscope/trainers/hooks/__init__.py b/modelscope/trainers/hooks/__init__.py
index 4826c81f..6a581759 100644
--- a/modelscope/trainers/hooks/__init__.py
+++ b/modelscope/trainers/hooks/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from .builder import HOOKS, build_hook
-from .checkpoint_hook import CheckpointHook
+from .checkpoint_hook import BestCkptSaverHook, CheckpointHook
 from .evaluation_hook import EvaluationHook
 from .hook import Hook
 from .iter_timer_hook import IterTimerHook
@@ -13,5 +13,6 @@ from .priority import Priority
 __all__ = [
     'Hook', 'HOOKS', 'CheckpointHook', 'EvaluationHook', 'LrSchedulerHook',
     'OptimizerHook', 'Priority', 'build_hook', 'TextLoggerHook',
-    'IterTimerHook', 'TorchAMPOptimizerHook', 'ApexAMPOptimizerHook'
+    'IterTimerHook', 'TorchAMPOptimizerHook', 'ApexAMPOptimizerHook',
+    'BestCkptSaverHook'
 ]
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index c7ed3252..4cb08130 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -42,6 +42,9 @@ class CheckpointHook(Hook):
         if not self.save_dir:
             self.save_dir = trainer.work_dir
 
+        if not os.path.exists(self.save_dir) and is_master():
+            os.makedirs(self.save_dir)
+
         if not hasattr(trainer, 'logger'):
             self.logger = get_logger(__name__)
         else:
@@ -93,3 +96,72 @@ class CheckpointHook(Hook):
                     and check_last(trainer)):
                 return True
         return False
+
+
+@HOOKS.register_module()
+class BestCkptSaverHook(CheckpointHook):
+    """Save best checkpoints hook.
+    Args:
+        metric_key (str): Metric key used to determine the best score.
+        rule (str): Comparison rule for the best score.
+            Supports "max" and "min". If rule is "max", the checkpoint at the maximum `metric_key`
+            will be saved; if rule is "min", the checkpoint at the minimum `metric_key` will be saved.
+        by_epoch (bool): Save best checkpoints by epoch or by iteration.
+        save_optimizer (bool): Whether to save the optimizer state dict. Default: True.
+        save_dir (str): Output directory to save the best checkpoint.
+    """
+
+    PRIORITY = Priority.NORMAL
+    rule_map = {'max': lambda x, y: x > y, 'min': lambda x, y: x < y}
+
+    def __init__(self,
+                 metric_key,
+                 rule='max',
+                 by_epoch=True,
+                 save_optimizer=True,
+                 save_dir=None):
+        assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.'
+ super().__init__( + by_epoch=by_epoch, + save_optimizer=save_optimizer, + save_dir=save_dir, + ) + self.metric_key = metric_key + self.rule = rule + self._best_metric = None + self._best_ckpt_file = None + + def _should_save(self, trainer): + return self._is_best_metric(trainer.metric_values) + + def _is_best_metric(self, metric_values): + if metric_values is None: + return False + + if self.metric_key not in metric_values: + raise ValueError( + f'Not find metric_key: {self.metric_key} in {metric_values}') + + if self._best_metric is None: + self._best_metric = metric_values[self.metric_key] + return True + else: + compare_fn = self.rule_map[self.rule] + if compare_fn(metric_values[self.metric_key], self._best_metric): + self._best_metric = metric_values[self.metric_key] + return True + return False + + def _save_checkpoint(self, trainer): + if self.by_epoch: + cur_save_name = os.path.join( + self.save_dir, + f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.metric_key}{self._best_metric}.pth' + ) + else: + cur_save_name = os.path.join( + self.save_dir, + f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth' + ) + save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) + self._best_ckpt_file = cur_save_name diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py index 325606f5..aea27f2f 100644 --- a/modelscope/trainers/hooks/evaluation_hook.py +++ b/modelscope/trainers/hooks/evaluation_hook.py @@ -1,13 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os - -from modelscope.utils.checkpoint import save_checkpoint -from modelscope.utils.constant import LogKeys -from modelscope.utils.logger import get_logger -from modelscope.utils.torch_utils import get_dist_info from .builder import HOOKS from .hook import Hook -from .priority import Priority @HOOKS.register_module() @@ -18,56 +11,13 @@ class EvaluationHook(Hook): by_epoch (bool): Evaluate by epoch or by iteration. start_idx (int | None, optional): The epoch/iterations validation begins. Default: None, validate every interval epochs/iterations from scratch. - save_best_ckpt (bool): Whether save the best checkpoint during evaluation. - monitor_key (str): Monitor key to compare rule for best score, only valid when `save_best_ckpt` is true. - rule (str): Comparison rule for best score, only valid when `save_best_ckpt` is true. - Support "max" and "min". If rule is "max", the checkpoint at the maximum `monitor_key` - will be saved, If rule is "min", the checkpoint at the minimum `monitor_key` will be saved. - out_dir (str): Output directory to save best checkpoint. """ - PRIORITY = Priority.NORMAL - rule_map = {'max': lambda x, y: x > y, 'min': lambda x, y: x < y} - - def __init__(self, - interval=1, - by_epoch=True, - start_idx=None, - save_best_ckpt=False, - monitor_key=None, - rule='max', - out_dir=None): + def __init__(self, interval=1, by_epoch=True, start_idx=None): assert interval > 0, 'interval must be a positive number' - if save_best_ckpt: - assert monitor_key is not None, 'Must provide `monitor_key` when `save_best_ckpt` is True.' - assert rule in ['max', - 'min'], 'Only support "max" or "min" rule now.' 
- self.interval = interval self.start_idx = start_idx self.by_epoch = by_epoch - self.save_best_ckpt = save_best_ckpt - self.monitor_key = monitor_key - self.rule = rule - self.out_dir = out_dir - self._best_metric = None - self._best_ckpt_file = None - - def before_run(self, trainer): - if not self.out_dir: - self.out_dir = trainer.work_dir - if not os.path.exists(self.out_dir): - rank, _ = get_dist_info() - if rank == 0: - os.makedirs(self.out_dir) - - if self.save_best_ckpt: - if not hasattr(trainer, 'logger'): - self.logger = get_logger(__name__) - else: - self.logger = trainer.logger - self.logger.info( - f'Best checkpoint will be saved to {self.out_dir}') def after_train_iter(self, trainer): """Called after every training iter to evaluate the results.""" @@ -87,46 +37,6 @@ class EvaluationHook(Hook): trainer.log_buffer.ready = True - if self.save_best_ckpt and self._is_best_metric(eval_res): - # remove the previous best model and save the latest best model - if self._best_ckpt_file is not None and os.path.exists( - self._best_ckpt_file): - os.remove(self._best_ckpt_file) - self._save_checkpoint(trainer) - - def _is_best_metric(self, eval_res): - if self.monitor_key not in eval_res: - raise ValueError( - f'Not find monitor_key: {self.monitor_key} in {eval_res}') - - if self._best_metric is None: - self._best_metric = eval_res[self.monitor_key] - return True - else: - compare_fn = self.rule_map[self.rule] - if compare_fn(eval_res[self.monitor_key], self._best_metric): - self._best_metric = eval_res[self.monitor_key] - return True - return False - - def _save_checkpoint(self, trainer): - if self.by_epoch: - cur_save_name = os.path.join( - self.out_dir, - f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.monitor_key}{self._best_metric}.pth' - ) - else: - cur_save_name = os.path.join( - self.out_dir, - f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.monitor_key}{self._best_metric}.pth' - ) - - rank, _ = get_dist_info() - if rank == 0: - save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) - - self._best_ckpt_file = cur_save_name - def _should_evaluate(self, trainer): """Judge whether to perform evaluation. diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index c29c96a1..d84ca3b9 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.utils.constant import LogKeys +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import is_master from .builder import HOOKS from .hook import Hook from .priority import Priority @@ -50,12 +52,14 @@ class LrSchedulerHook(Hook): trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) def before_train_epoch(self, trainer): + trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) + + def after_train_epoch(self, trainer): if self.by_epoch: if self.warmup_lr_scheduler is not None: self.warmup_lr_scheduler.step() else: trainer.lr_scheduler.step() - trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) def _get_log_lr(self, trainer): cur_lr = self.get_current_lr(trainer) @@ -70,3 +74,44 @@ class LrSchedulerHook(Hook): lr.update({k: lr_[0]}) return lr + + +@HOOKS.register_module() +class PlateauLrSchedulerHook(LrSchedulerHook): + """Lr scheduler hook for `ReduceLROnPlateau`. 
+
+    Args:
+        metric_key (str): Key used to look up the metric in `trainer.metric_values`;
+            the corresponding value is passed to `ReduceLROnPlateau.step`.
+        by_epoch (bool): Whether the lr is adjusted by epoch.
+        warmup (dict): Warmup config.
+    """
+    PRIORITY = Priority.LOW  # should be after EvaluationHook
+
+    def __init__(self, metric_key, by_epoch=True, warmup=None) -> None:
+        super().__init__(by_epoch=by_epoch, warmup=warmup)
+        self.metric_key = metric_key
+
+    def before_run(self, trainer):
+        super().before_run(trainer)
+        if not hasattr(trainer, 'logger'):
+            self.logger = get_logger(__name__)
+        else:
+            self.logger = trainer.logger
+
+    def after_train_epoch(self, trainer):
+        # handle the case where the evaluation interval is greater than 1
+        if trainer.metric_values is None:
+            if is_master():
+                self.logger.warning(
+                    f'Current epoch {trainer.epoch} has no evaluation metric values, skip lr_scheduler.step()!'
+                )
+            return
+
+        metrics = trainer.metric_values[self.metric_key]
+
+        if self.by_epoch:
+            if self.warmup_lr_scheduler is not None:
+                self.warmup_lr_scheduler.step(metrics=metrics)
+            else:
+                trainer.lr_scheduler.step(metrics=metrics)
diff --git a/modelscope/trainers/lrscheduler/builder.py b/modelscope/trainers/lrscheduler/builder.py
index f284b1a9..dc458d56 100644
--- a/modelscope/trainers/lrscheduler/builder.py
+++ b/modelscope/trainers/lrscheduler/builder.py
@@ -40,7 +40,8 @@ def register_torch_lr_scheduler():
 
     members = inspect.getmembers(lr_scheduler)
     for name, obj in members:
-        if inspect.isclass(obj) and issubclass(obj, _LRScheduler):
+        if (inspect.isclass(obj) and issubclass(
+                obj, _LRScheduler)) or name in ['ReduceLROnPlateau']:
             LR_SCHEDULER.register_module(module_name=name, module_cls=obj)
 
 
diff --git a/modelscope/trainers/lrscheduler/warmup/base.py b/modelscope/trainers/lrscheduler/warmup/base.py
index 78e9b5fb..81497817 100644
--- a/modelscope/trainers/lrscheduler/warmup/base.py
+++ b/modelscope/trainers/lrscheduler/warmup/base.py
@@ -52,12 +52,12 @@ class BaseWarmup(_LRScheduler):
         for i, group in enumerate(self.optimizer.param_groups):
             group['lr'] *= scale_value[i]
 
-    def step(self, epoch=None):
+    def step(self, *args, **kwargs):
         """ When ``self.base_scheduler._step_count`` is less than ``self.warmup_iters``,
         multiply lr by scale
         """
         if self.base_scheduler._step_count > self.warmup_iters:
-            return self.base_scheduler.step(epoch=epoch)
+            return self.base_scheduler.step(*args, **kwargs)
 
         for group, lr in zip(self.optimizer.param_groups, self.base_lrs):
             group['lr'] = lr
@@ -66,7 +66,7 @@ class BaseWarmup(_LRScheduler):
         if self._is_init_step:
             self._is_init_step = False
         else:
-            self.base_scheduler.step(epoch=epoch)
+            self.base_scheduler.step(*args, **kwargs)
 
         self.scale()
 
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 10a2f189..e83654a2 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -7,6 +7,7 @@ from distutils.version import LooseVersion
 from functools import partial
 from typing import Callable, List, Optional, Tuple, Union
 
+import json
 import numpy as np
 import torch
 from addict import Dict
@@ -135,6 +136,7 @@ class EpochBasedTrainer(BaseTrainer):
 
         self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
         self.metrics = self.get_metrics()
+        self._metric_values = None
         self.optimizers = optimizers
         self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO'))
         self._mode = ModeKeys.TRAIN
@@ -322,17 +324,16 @@ class EpochBasedTrainer(BaseTrainer):
                 **self.cfg.evaluation.get('dataloader', {}))
self.data_loader = self.eval_dataloader metric_classes = [build_metric(metric) for metric in self.metrics] - self.evaluation_loop(self.eval_dataloader, checkpoint_path, - metric_classes) - rank, world_size = get_dist_info() - metric_values = {} - if rank == 0: - for metric_cls in metric_classes: - metric_values.update(metric_cls.evaluate()) - if world_size > 1: - metric_values = broadcast(metric_values, 0) + metric_values = self.evaluation_loop(self.eval_dataloader, + checkpoint_path, metric_classes) + + self._metric_values = metric_values return metric_values + @property + def metric_values(self): + return self._metric_values + def build_model(self) -> Union[nn.Module, TorchModel]: """ Instantiate a pytorch model and return. @@ -530,8 +531,6 @@ class EpochBasedTrainer(BaseTrainer): We provide a default implementation, if you want to customize your own optimizer and lr scheduler, you can either pass a tuple through trainer init function or subclass this class and override this method. - - """ optimizer, lr_scheduler = self.optimizers if optimizer is None: @@ -563,22 +562,38 @@ class EpochBasedTrainer(BaseTrainer): def register_optimizers_hook(self): """ Register optimizer hook and lr scheduler hook. """ - optimizer, lr_scheduler = self.optimizers - opti_error_msg = 'optimizers should be a tuple of `torch.optim.Optimizer`'\ - ' and `torch.optim.lr_scheduler._LRScheduler`' - if optimizer is not None: - assert isinstance(optimizer, torch.optim.Optimizer), opti_error_msg - if lr_scheduler is not None: - assert isinstance( - lr_scheduler, - torch.optim.lr_scheduler._LRScheduler), opti_error_msg + _, lr_scheduler, optim_options, lr_options = self.create_optimizer_and_scheduler( + ) - _, _, optim_options, lr_options = self.create_optimizer_and_scheduler() - lr_hook = dict(type='LrSchedulerHook', **lr_options) - if self.use_fp16: - optim_hook = dict(type='TorchAMPOptimizerHook', **optim_options) - else: - optim_hook = dict(type='OptimizerHook', **optim_options) + optim_hook = self.cfg.train.get('optimizer_hook', None) + lr_hook = self.cfg.train.get('lr_scheduler_hook', None) + + # adapt to `ReduceLROnPlateau` + from torch.optim.lr_scheduler import ReduceLROnPlateau + if isinstance(lr_scheduler, ReduceLROnPlateau) and lr_hook is None: + plateau_cfg = { + 'train': { + 'lr_scheduler_hook': { + 'type': 'PlateauLrSchedulerHook', + 'metric_key': + 'Metric Key used for PlateauLrSchedulerHook' + } + } + } + plateau_cfg = json.dumps( + plateau_cfg, sort_keys=False, indent=4, separators=(',', ':')) + raise ValueError( + 'Must add `lr_scheduler_hook` to configuration for `ReduceLROnPlateau` lr scheduler as follows:' + + '\n' + plateau_cfg) + + if lr_hook is None: + lr_hook = dict(type='LrSchedulerHook', **lr_options) + if optim_hook is None: + if self.use_fp16: + optim_hook = dict( + type='TorchAMPOptimizerHook', **optim_options) + else: + optim_hook = dict(type='OptimizerHook', **optim_options) self.register_hook_from_cfg([lr_hook, optim_hook]) @@ -692,7 +707,7 @@ class EpochBasedTrainer(BaseTrainer): """ if self._dist: from modelscope.trainers.utils.inference import multi_gpu_test - multi_gpu_test( + metric_values = multi_gpu_test( self.model, data_loader, tmpdir=None, @@ -701,12 +716,14 @@ class EpochBasedTrainer(BaseTrainer): metric_classes=metric_classes) else: from modelscope.trainers.utils.inference import single_gpu_test - single_gpu_test( + metric_values = single_gpu_test( self.model, data_loader, data_collate_fn=self.collate_fn, metric_classes=metric_classes) + return metric_values + def 
register_hook(self, hook: Hook) -> None: """Register a hook into the hook list. diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index 173c25f8..c30d1d15 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -10,7 +10,8 @@ import torch from torch import distributed as dist from tqdm import tqdm -from modelscope.utils.torch_utils import get_dist_info, is_master, make_tmp_dir +from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, + make_tmp_dir) from modelscope.utils.utils import if_func_receive_dict_inputs @@ -51,6 +52,12 @@ def single_gpu_test(model, for _ in range(batch_size): pbar.update() + metric_values = {} + for metric_cls in metric_classes: + metric_values.update(metric_cls.evaluate()) + + return metric_values + def multi_gpu_test(model, data_loader, @@ -132,6 +139,15 @@ def multi_gpu_test(model, for metric_cls in metric_classes: metric_cls.add(results[i], data_list[i]) + metric_values = {} + if rank == 0: + for metric_cls in metric_classes: + metric_values.update(metric_cls.evaluate()) + if world_size > 1: + metric_values = broadcast(metric_values, 0) + + return metric_values + def collect_results_cpu(result_part, size, tmpdir=None): """Collect results under cpu mode. diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 511e21f6..3cf88114 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -56,7 +56,8 @@ class Registry(object): def _register_module(self, group_key=default_group, module_name=None, - module_cls=None): + module_cls=None, + force=False): assert isinstance(group_key, str), 'group_key is required and must be str' @@ -69,7 +70,7 @@ class Registry(object): if module_name is None: module_name = module_cls.__name__ - if module_name in self._modules[group_key]: + if module_name in self._modules[group_key] and not force: raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') self._modules[group_key][module_name] = module_cls @@ -78,7 +79,8 @@ class Registry(object): def register_module(self, group_key: str = default_group, module_name: str = None, - module_cls: type = None): + module_cls: type = None, + force=False): """ Register module Example: @@ -102,6 +104,8 @@ class Registry(object): default group name is 'default' module_name: Module name module_cls: Module class object + force (bool, optional): Whether to override an existing class with + the same name. Default: False. 
""" if not (module_name is None or isinstance(module_name, str)): @@ -111,7 +115,8 @@ class Registry(object): self._register_module( group_key=group_key, module_name=module_name, - module_cls=module_cls) + module_cls=module_cls, + force=force) return module_cls # if module_cls is None, should return a decorator function @@ -119,7 +124,8 @@ class Registry(object): self._register_module( group_key=group_key, module_name=module_name, - module_cls=module_cls) + module_cls=module_cls, + force=force) return module_cls return _register diff --git a/tests/trainers/hooks/logger/test_tensorboard_hook.py b/tests/trainers/hooks/logger/test_tensorboard_hook.py index dc4a5e83..6c08f160 100644 --- a/tests/trainers/hooks/logger/test_tensorboard_hook.py +++ b/tests/trainers/hooks/logger/test_tensorboard_hook.py @@ -99,7 +99,7 @@ class TensorboardHookTest(unittest.TestCase): ea.Scalars(LogKeys.LR)[i].value, 0.01, delta=0.001) for i in range(5, 10): self.assertAlmostEqual( - ea.Scalars(LogKeys.LR)[i].value, 0.001, delta=0.0001) + ea.Scalars(LogKeys.LR)[i].value, 0.01, delta=0.0001) if __name__ == '__main__': diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py index 8375a001..8dbd0130 100644 --- a/tests/trainers/hooks/test_checkpoint_hook.py +++ b/tests/trainers/hooks/test_checkpoint_hook.py @@ -9,10 +9,31 @@ import numpy as np import torch from torch import nn +from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModelFile +from modelscope.utils.registry import default_group from modelscope.utils.test_utils import create_dummy_test_dataset + +def create_dummy_metric(): + _global_iter = 0 + + @METRICS.register_module( + group_key=default_group, module_name='DummyMetric', force=True) + class DummyMetric: + + _fake_acc_by_epoch = {1: 0.1, 2: 0.5, 3: 0.2} + + def add(*args, **kwargs): + pass + + def evaluate(self): + global _global_iter + _global_iter += 1 + return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]} + + dummy_dataset = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) @@ -39,12 +60,16 @@ class CheckpointHookTest(unittest.TestCase): self.tmp_dir = tempfile.TemporaryDirectory().name if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) + create_dummy_metric() def tearDown(self): super().tearDown() shutil.rmtree(self.tmp_dir) def test_checkpoint_hook(self): + global _global_iter + _global_iter = 0 + json_cfg = { 'task': 'image_classification', 'train': { @@ -98,5 +123,80 @@ class CheckpointHookTest(unittest.TestCase): self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) +class BestCkptSaverHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + create_dummy_metric() + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_best_checkpoint_hook(self): + global _global_iter + _global_iter = 0 + + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'SGD', + 'lr': 0.01 + }, + 'lr_scheduler': { + 'type': 'StepLR', + 'step_size': 2 + }, + 'hooks': [{ + 'type': 'BestCkptSaverHook', + 'metric_key': MetricKeys.ACCURACY, + 'rule': 'min' + }, { + 
'type': 'EvaluationHook', + 'interval': 1, + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': False + }, + 'metrics': ['DummyMetric'] + } + } + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=DummyModel(), + data_collator=None, + train_dataset=dummy_dataset, + eval_dataset=dummy_dataset, + max_epochs=3) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) + self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth', + results_files) + + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py index ad225aed..2927db47 100644 --- a/tests/trainers/hooks/test_evaluation_hook.py +++ b/tests/trainers/hooks/test_evaluation_hook.py @@ -15,21 +15,18 @@ from modelscope.utils.constant import LogKeys, ModelFile from modelscope.utils.registry import default_group from modelscope.utils.test_utils import create_dummy_test_dataset -_global_iter = 0 +def create_dummy_metric(): -@METRICS.register_module(group_key=default_group, module_name='DummyMetric') -class DummyMetric: + @METRICS.register_module( + group_key=default_group, module_name='DummyMetric', force=True) + class DummyMetric: - _fake_acc_by_epoch = {1: 0.1, 2: 0.5, 3: 0.2} + def add(*args, **kwargs): + pass - def add(*args, **kwargs): - pass - - def evaluate(self): - global _global_iter - _global_iter += 1 - return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]} + def evaluate(self): + return {MetricKeys.ACCURACY: 0.5} dummy_dataset = create_dummy_test_dataset( @@ -58,80 +55,17 @@ class EvaluationHookTest(unittest.TestCase): self.tmp_dir = tempfile.TemporaryDirectory().name if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) + create_dummy_metric() def tearDown(self): super().tearDown() shutil.rmtree(self.tmp_dir) - def test_best_ckpt_rule_max(self): - global _global_iter - _global_iter = 0 - - json_cfg = { - 'task': 'image_classification', - 'train': { - 'work_dir': - self.tmp_dir, - 'dataloader': { - 'batch_size_per_gpu': 2, - 'workers_per_gpu': 1 - }, - 'optimizer': { - 'type': 'SGD', - 'lr': 0.01, - }, - 'lr_scheduler': { - 'type': 'StepLR', - 'step_size': 2, - }, - 'hooks': [{ - 'type': 'EvaluationHook', - 'interval': 1, - 'save_best_ckpt': True, - 'monitor_key': MetricKeys.ACCURACY - }] - }, - 'evaluation': { - 'dataloader': { - 'batch_size_per_gpu': 2, - 'workers_per_gpu': 1, - 'shuffle': False - }, - 'metrics': ['DummyMetric'] - } - } - - config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) - with open(config_path, 'w') as f: - json.dump(json_cfg, f) - - trainer_name = 'EpochBasedTrainer' - kwargs = dict( - cfg_file=config_path, - model=DummyModel(), - data_collator=None, - train_dataset=dummy_dataset, - eval_dataset=dummy_dataset, - max_epochs=3) - - trainer = build_trainer(trainer_name, kwargs) - trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) - 
self.assertIn(f'best_{LogKeys.EPOCH}2_{MetricKeys.ACCURACY}0.5.pth', - results_files) - - def test_best_ckpt_rule_min(self): - global _global_iter - _global_iter = 0 - + def test_evaluation_hook(self): json_cfg = { 'task': 'image_classification', 'train': { - 'work_dir': - self.tmp_dir, + 'work_dir': self.tmp_dir, 'dataloader': { 'batch_size_per_gpu': 2, 'workers_per_gpu': 1 @@ -147,10 +81,6 @@ class EvaluationHookTest(unittest.TestCase): 'hooks': [{ 'type': 'EvaluationHook', 'interval': 1, - 'save_best_ckpt': True, - 'monitor_key': 'accuracy', - 'rule': 'min', - 'out_dir': os.path.join(self.tmp_dir, 'best_ckpt') }] }, 'evaluation': { @@ -174,16 +104,11 @@ class EvaluationHookTest(unittest.TestCase): data_collator=None, train_dataset=dummy_dataset, eval_dataset=dummy_dataset, - max_epochs=3) + max_epochs=1) trainer = build_trainer(trainer_name, kwargs) trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) - self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth', - os.listdir(os.path.join(self.tmp_dir, 'best_ckpt'))) + self.assertDictEqual(trainer.metric_values, {'accuracy': 0.5}) if __name__ == '__main__': diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index ddf4b3fd..afb887a4 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -9,16 +9,36 @@ import numpy as np import torch from torch import nn from torch.optim import SGD -from torch.optim.lr_scheduler import MultiStepLR +from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau +from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages +from modelscope.utils.registry import default_group from modelscope.utils.test_utils import create_dummy_test_dataset dummy_dataset = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10) +def create_dummy_metric(): + _global_iter = 0 + + @METRICS.register_module( + group_key=default_group, module_name='DummyMetric', force=True) + class DummyMetric: + + _fake_acc_by_epoch = {1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0.3} + + def add(*args, **kwargs): + pass + + def evaluate(self): + global _global_iter + _global_iter += 1 + return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]} + + class DummyModel(nn.Module): def __init__(self): @@ -41,12 +61,16 @@ class LrSchedulerHookTest(unittest.TestCase): self.tmp_dir = tempfile.TemporaryDirectory().name if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) + create_dummy_metric() def tearDown(self): super().tearDown() shutil.rmtree(self.tmp_dir) def test_lr_scheduler_hook(self): + global _global_iter + _global_iter = 0 + json_cfg = { 'task': 'image_classification', 'train': { @@ -85,25 +109,26 @@ class LrSchedulerHookTest(unittest.TestCase): trainer.invoke_hook(TrainerStages.before_train_epoch) for _, data_batch in enumerate(train_dataloader): trainer.invoke_hook(TrainerStages.before_train_iter) + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook(TrainerStages.after_train_iter) log_lrs.append(trainer.log_buffer.output[LogKeys.LR]) optim_lrs.append(optimizer.param_groups[0]['lr']) - trainer.train_step(trainer.model, data_batch) - 
trainer.invoke_hook(TrainerStages.after_train_iter) - trainer.invoke_hook(TrainerStages.after_train_epoch) trainer._epoch += 1 trainer.invoke_hook(TrainerStages.after_run) iters = 5 - target_lrs = [0.01] * iters * 1 + [0.001] * iters * 2 + [0.0001 - ] * iters * 2 - + target_lrs = [0.01] * iters * 2 + [0.001] * iters * 2 + [0.0001 + ] * iters * 1 self.assertListEqual(log_lrs, target_lrs) self.assertListEqual(optim_lrs, target_lrs) def test_warmup_lr_scheduler_hook(self): + global _global_iter + _global_iter = 0 + json_cfg = { 'task': 'image_classification', 'train': { @@ -156,22 +181,118 @@ class LrSchedulerHookTest(unittest.TestCase): trainer.invoke_hook(TrainerStages.before_train_epoch) for _, data_batch in enumerate(train_dataloader): trainer.invoke_hook(TrainerStages.before_train_iter) + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook(TrainerStages.after_train_iter) log_lrs.append(round(trainer.log_buffer.output[LogKeys.LR], 5)) optim_lrs.append( round(trainer.optimizer.param_groups[0]['lr'], 5)) + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer.invoke_hook(TrainerStages.after_run) + + iters = 5 + target_lrs = [0.001] * iters * 1 + [0.004] * iters * 1 + [ + 0.007 + ] * iters * 1 + [0.01] * iters * 1 + [0.001] * iters * 2 + [ + 0.0001 + ] * iters * 1 + + self.assertListEqual(log_lrs, target_lrs) + self.assertListEqual(optim_lrs, target_lrs) + + +class PlateauLrSchedulerHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + create_dummy_metric() + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_plateau_lr_scheduler_hook(self): + global _global_iter + _global_iter = 0 + + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'lr_scheduler': { + 'type': 'ReduceLROnPlateau', + 'mode': 'max', + 'factor': 0.1, + 'patience': 2, + }, + 'lr_scheduler_hook': { + 'type': 'PlateauLrSchedulerHook', + 'metric_key': MetricKeys.ACCURACY + }, + 'hooks': [{ + 'type': 'EvaluationHook', + 'interval': 1 + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': False + }, + 'metrics': ['DummyMetric'] + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimizer = SGD(model.parameters(), lr=0.01) + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=dummy_dataset, + eval_dataset=dummy_dataset, + optimizers=(optimizer, None), + max_epochs=5) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.data_loader = train_dataloader + trainer.register_optimizers_hook() + trainer.register_hook_from_cfg(trainer.cfg.train.hooks) + + trainer.invoke_hook(TrainerStages.before_run) + log_lrs = [] + optim_lrs = [] + for _ in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook(TrainerStages.before_train_epoch) + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook(TrainerStages.before_train_iter) trainer.train_step(trainer.model, data_batch) 
trainer.invoke_hook(TrainerStages.after_train_iter) + log_lrs.append(trainer.log_buffer.output[LogKeys.LR]) + optim_lrs.append(optimizer.param_groups[0]['lr']) + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer._epoch += 1 trainer.invoke_hook(TrainerStages.after_run) iters = 5 - target_lrs = [0.004] * iters * 1 + [0.007] * iters * 1 + [ - 0.01 - ] * iters * 1 + [0.001] * iters * 2 + [0.0001] * iters * 2 - + target_lrs = [0.01] * iters * 4 + [0.001] * iters * 1 self.assertListEqual(log_lrs, target_lrs) self.assertListEqual(optim_lrs, target_lrs) diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index 97cb94b8..9d4e79df 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -19,13 +19,6 @@ from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.test_utils import create_dummy_test_dataset, test_level - -class DummyMetric: - - def __call__(self, ground_truth, predict_results): - return {'accuracy': 0.5} - - dummy_dataset_small = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) @@ -265,14 +258,14 @@ class TrainerTest(unittest.TestCase): LogKeys.MODE: ModeKeys.TRAIN, LogKeys.EPOCH: 2, LogKeys.ITER: 10, - LogKeys.LR: 0.001 + LogKeys.LR: 0.01 }, json.loads(lines[3])) self.assertDictContainsSubset( { LogKeys.MODE: ModeKeys.TRAIN, LogKeys.EPOCH: 2, LogKeys.ITER: 20, - LogKeys.LR: 0.001 + LogKeys.LR: 0.01 }, json.loads(lines[4])) self.assertDictContainsSubset( { diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py index 00d3ac13..3a36dc02 100644 --- a/tests/trainers/test_trainer_gpu.py +++ b/tests/trainers/test_trainer_gpu.py @@ -18,13 +18,6 @@ from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.test_utils import (DistributedTestCase, create_dummy_test_dataset, test_level) - -class DummyMetric: - - def __call__(self, ground_truth, predict_results): - return {'accuracy': 0.5} - - dummy_dataset_small = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) @@ -141,14 +134,14 @@ class TrainerTestSingleGpu(unittest.TestCase): LogKeys.MODE: ModeKeys.TRAIN, LogKeys.EPOCH: 2, LogKeys.ITER: 10, - LogKeys.LR: 0.001 + LogKeys.LR: 0.01 }, json.loads(lines[3])) self.assertDictContainsSubset( { LogKeys.MODE: ModeKeys.TRAIN, LogKeys.EPOCH: 2, LogKeys.ITER: 20, - LogKeys.LR: 0.001 + LogKeys.LR: 0.01 }, json.loads(lines[4])) self.assertDictContainsSubset( { @@ -229,7 +222,7 @@ class TrainerTestMultiGpus(DistributedTestCase): LogKeys.MODE: ModeKeys.TRAIN, LogKeys.EPOCH: 2, LogKeys.ITER: 10, - LogKeys.LR: 0.001 + LogKeys.LR: 0.01 }, json.loads(lines[2])) self.assertDictContainsSubset( {
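
Usage sketch (illustrative, not part of the patch): the unit tests above imply a trainer configuration along the following lines when combining the new BestCkptSaverHook with a ReduceLROnPlateau scheduler driven through PlateauLrSchedulerHook. The work_dir path, the metric name 'accuracy', and every hyperparameter value below are placeholders.

# Hypothetical configuration sketch; key names mirror the test configs in this
# diff, all concrete values are placeholders.
import json

cfg = {
    'task': 'image_classification',
    'train': {
        'work_dir': './work_dir',
        'dataloader': {
            'batch_size_per_gpu': 2,
            'workers_per_gpu': 1
        },
        'optimizer': {
            'type': 'SGD',
            'lr': 0.01
        },
        # ReduceLROnPlateau is now registered in LR_SCHEDULER (builder.py);
        # register_optimizers_hook requires an explicit lr_scheduler_hook
        # entry whenever it is used.
        'lr_scheduler': {
            'type': 'ReduceLROnPlateau',
            'mode': 'max',
            'factor': 0.1,
            'patience': 2
        },
        'lr_scheduler_hook': {
            'type': 'PlateauLrSchedulerHook',
            'metric_key': 'accuracy'
        },
        'hooks': [
            {
                'type': 'EvaluationHook',
                'interval': 1
            },
            {
                # Saves a 'best_...' checkpoint named after the epoch/iter and
                # metric value whenever the monitored metric improves.
                'type': 'BestCkptSaverHook',
                'metric_key': 'accuracy',
                'rule': 'max'
            }
        ]
    },
    'evaluation': {
        'dataloader': {
            'batch_size_per_gpu': 2,
            'workers_per_gpu': 1,
            'shuffle': False
        },
        'metrics': ['accuracy']  # placeholder metric name
    }
}

# The tests persist such a dict as the configuration file that
# EpochBasedTrainer consumes via build_trainer(cfg_file=...).
with open('configuration.json', 'w') as f:
    json.dump(cfg, f, indent=4)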
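
A second sketch, covering the new `force` flag on Registry.register_module: the dummy-metric fixtures in the tests above re-register a module under an existing name, which previously raised KeyError. The class names below are hypothetical; METRICS and default_group are the objects already imported in the tests.

# Hypothetical example of overriding an existing registration with force=True.
from modelscope.metrics.builder import METRICS
from modelscope.utils.registry import default_group


@METRICS.register_module(group_key=default_group, module_name='DummyMetric')
class DummyMetric:

    def add(self, *args, **kwargs):
        pass

    def evaluate(self):
        return {'accuracy': 0.5}


# Re-registering the same module_name without force=True raises KeyError;
# with force=True the earlier class is replaced in the registry.
@METRICS.register_module(
    group_key=default_group, module_name='DummyMetric', force=True)
class AnotherDummyMetric(DummyMetric):

    def evaluate(self):
        return {'accuracy': 0.9}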