diff --git a/modelscope/trainers/base.py b/modelscope/trainers/base.py
index a2b655ed..98f97859 100644
--- a/modelscope/trainers/base.py
+++ b/modelscope/trainers/base.py
@@ -33,6 +33,7 @@ class BaseTrainer(ABC):
         else:
             self.args = None
         self.log_buffer = LogBuffer()
+        self.visualization_buffer = LogBuffer()
         self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())

     def get_or_download_model_dir(self, model, model_revision=None):
diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py
index a02478b9..7b2e339a 100644
--- a/modelscope/trainers/default_config.py
+++ b/modelscope/trainers/default_config.py
@@ -4,15 +4,23 @@
 from modelscope.utils.config import Config

 DEFAULT_CONFIG = {
     'train': {
-        'hooks': [{
-            'type': 'CheckpointHook',
-            'interval': 1
-        }, {
-            'type': 'TextLoggerHook',
-            'interval': 10
-        }, {
-            'type': 'IterTimerHook'
-        }]
+        'hooks': [
+            {
+                'type': 'CheckpointHook',
+                'interval': 1
+            },
+            {
+                'type': 'TextLoggerHook',
+                'interval': 10
+            },
+            {
+                'type': 'IterTimerHook'
+            },
+            {
+                'type': 'TensorboardHook',
+                'interval': 10
+            },
+        ]
     }
 }
diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py
index 4479fa23..331b8f04 100644
--- a/modelscope/trainers/hooks/evaluation_hook.py
+++ b/modelscope/trainers/hooks/evaluation_hook.py
@@ -1,4 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from collections import OrderedDict
+
 from modelscope.metainfo import Hooks
 from .builder import HOOKS
 from .hook import Hook
@@ -30,11 +32,19 @@ class EvaluationHook(Hook):
         if self.by_epoch and self._should_evaluate(trainer):
             self.do_evaluate(trainer)

+    def add_visualization_info(self, trainer, results):
+        if trainer.visualization_buffer.output.get('eval_results',
+                                                   None) is None:
+            trainer.visualization_buffer.output['eval_results'] = OrderedDict()
+
+        trainer.visualization_buffer.output['eval_results'].update(
+            trainer.visualize(results))
+
     def do_evaluate(self, trainer):
         """Evaluate the results."""
         eval_res = trainer.evaluate()
         for name, val in eval_res.items():
-            trainer.log_buffer.output[name] = val
+            trainer.log_buffer.output['evaluation/' + name] = val
         trainer.log_buffer.ready = True
diff --git a/modelscope/trainers/hooks/logger/tensorboard_hook.py b/modelscope/trainers/hooks/logger/tensorboard_hook.py
index a12f7ae7..31bef4f0 100644
--- a/modelscope/trainers/hooks/logger/tensorboard_hook.py
+++ b/modelscope/trainers/hooks/logger/tensorboard_hook.py
@@ -1,6 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os

+import numpy as np
+import torch
+
 from modelscope.metainfo import Hooks
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.utils.constant import LogKeys
@@ -50,10 +53,14 @@ class TensorboardHook(LoggerHook):
         if self.out_dir is None:
             self.out_dir = os.path.join(trainer.work_dir,
                                         'tensorboard_output')
+            trainer.logger.info(
+                f'tensorboard files will be saved to {self.out_dir}')
         self.writer = SummaryWriter(self.out_dir)

     @master_only
     def log(self, trainer):
+        if len(trainer.visualization_buffer.output) > 0:
+            self.visualization_log(trainer)
         for key, val in trainer.log_buffer.output.items():
             if key in self.skip_keys:
                 continue
@@ -63,6 +70,45 @@ class TensorboardHook(LoggerHook):
                 self.writer.add_scalar(key, val, self.get_iter(trainer))
             else:
                 pass
+        self.writer.flush()
+
+    def visualization_log(self, trainer):
+        """Images visualization.
+        `visualization_buffer` is a dictionary containing:
+            images (list): list of visualized images.
+            filenames (list of str, optional): image filenames.
+        """
+        visual_results = trainer.visualization_buffer.output
+        for vis_key, vis_result in visual_results.items():
+            images = vis_result.get('images', [])
+            filenames = vis_result.get('filenames', None)
+            if filenames is not None:
+                assert len(images) == len(
+                    filenames
+                ), 'Output `images` and `filenames` must keep the same length!'
+
+            for i, img in enumerate(images):
+                if isinstance(img, np.ndarray):
+                    img = torch.from_numpy(img)
+                else:
+                    assert isinstance(
+                        img, torch.Tensor
+                    ), f'Only support np.ndarray and torch.Tensor type! Got {type(img)} for img {filenames[i]}'
+
+                default_name = 'image_%i' % i
+                filename = filenames[
+                    i] if filenames is not None else default_name
+                self.writer.add_image(
+                    f'{vis_key}/{filename}',
+                    img,
+                    self.get_iter(trainer),
+                    dataformats='HWC')
+
+    def after_train_iter(self, trainer):
+        super(TensorboardHook, self).after_train_iter(trainer)
+        # Clear visualization_buffer after each iter so each image is written only once,
+        # instead of being re-written every self.interval iterations.
+        trainer.visualization_buffer.clear_output()

     @master_only
     def after_run(self, trainer):
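For context on the data contract introduced above: `TensorboardHook.visualization_log` walks `trainer.visualization_buffer.output`, treating each top-level key as a TensorBoard group and each value as a dict with an `images` list (HWC `np.ndarray` or `torch.Tensor`) and an optional `filenames` list of the same length. Below is a minimal sketch of populating that buffer; the helper name `push_demo_images` and the dummy image are illustrative only, not part of this change.

    import numpy as np

    def push_demo_images(trainer):
        # Build the payload shape that TensorboardHook.visualization_log expects.
        dummy_image = np.zeros((224, 224, 3), dtype=np.uint8)  # HWC, matching dataformats='HWC'
        trainer.visualization_buffer.output['eval_vis'] = {
            'images': [dummy_image],      # np.ndarray or torch.Tensor entries are accepted
            'filenames': ['sample_0'],    # optional; must match len(images) when provided
        }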
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 1c76fc2e..649cb96a 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -37,7 +37,8 @@ from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
 from modelscope.utils.torch_utils import (get_dist_info, get_local_rank,
-                                          init_dist, set_random_seed)
+                                          init_dist, is_master,
+                                          set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
 from .default_config import merge_cfg
@@ -940,27 +941,73 @@ class EpochBasedTrainer(BaseTrainer):
         """
         if self._dist and self.cfg.model.get('model_parallel_size', 1) == 1:
             from modelscope.trainers.utils.inference import multi_gpu_test
-            metric_values = multi_gpu_test(
+            # lists of batched results and data samples
+            results, data_list = multi_gpu_test(
                 self,
                 data_loader,
                 device=self.device,
                 tmpdir=None,
                 gpu_collect=False,
-                metric_classes=metric_classes,
                 data_loader_iters_per_gpu=self._eval_iters_per_epoch)
         else:
             from modelscope.trainers.utils.inference import single_gpu_test
-            metric_values = single_gpu_test(
+            results, data_list = single_gpu_test(
                 self,
                 data_loader,
                 device=self.device,
-                metric_classes=metric_classes,
                 data_loader_iters=self._eval_iters_per_epoch)

         self._inner_iter = self.iters_per_epoch - 1  # start from index 0

+        # evaluation result processing
+        if hasattr(self.cfg.evaluation, 'visualization'):
+            flatten_results = []
+            for r in results:
+                flatten_results.extend(r)
+            vis_cfg = self.cfg.evaluation.visualization
+            self.visualization(results, self.eval_dataset, **vis_cfg)
+
+        # do evaluation on rank0
+        metric_values = {}
+        if not self._dist or is_master():
+            assert len(data_list) == len(
+                results), f'size mismatch {len(data_list)} and {len(results)}'
+            for metric_cls in metric_classes:
+                for idx in range(len(data_list)):
+                    metric_cls.add(results[idx], data_list[idx])
+
+            for metric_cls in metric_classes:
+                metric_values.update(metric_cls.evaluate())
+
         return metric_values

+    def visualization(self, results, dataset, **kwargs):
+        """Visualization function for evaluation results.
+
+        Args:
+            results (list(dict)): a list of result dicts.
+            dataset (:obj:`Dataset`): torch dataset object used to access the original data.
+
+        Implementation Examples:
+        ```python
+            # draw list of images as numpy array
+            images = draw_images(num_of_visualization)
+
+            # set displayed name for each image
+            filenames = get_image_display_names()
+            vis_results = {
+                'images': images,
+                'filenames': filenames
+            }
+
+            # visualization results will be displayed in group named 'eval_vis'
+            self.visualization_buffer.output['eval_vis'] = vis_results
+        ```
+        """
+        # TODO @wenmeng.zwm add visualization support for cv evaluation
+        raise NotImplementedError(
+            'visualization for evaluation will be supported in the future')
+
     def register_hook(self, hook: Hook) -> None:
         """Register a hook into the hook list.
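Putting the trainer change together with the hook changes above: `EpochBasedTrainer.evaluate` only calls `self.visualization(...)` when `cfg.evaluation.visualization` is present, forwarding its contents as keyword arguments, and the collected images only reach TensorBoard if a `TensorboardHook` is registered. The fragment below is a hedged sketch of the relevant configuration keys (taken from the defaults above and the test further down), not a complete configuration:

    cfg_fragment = {
        'train': {
            'hooks': [{
                'type': 'TensorboardHook',
                'interval': 10
            }],
        },
        'evaluation': {
            # The presence of this key is what evaluate() checks via
            # hasattr(self.cfg.evaluation, 'visualization'); its contents are passed to
            # self.visualization(results, dataset, **vis_cfg), which subclasses must
            # override since the base implementation raises NotImplementedError.
            'visualization': {},
        },
    }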
diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py
index 87e0abc7..4ea34d59 100644
--- a/modelscope/trainers/utils/inference.py
+++ b/modelscope/trainers/utils/inference.py
@@ -15,18 +15,13 @@ from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
                                           make_tmp_dir)


-def single_gpu_test(trainer,
-                    data_loader,
-                    device,
-                    metric_classes=None,
-                    data_loader_iters=None):
+def single_gpu_test(trainer, data_loader, device, data_loader_iters=None):
     """Test model in EpochBasedTrainer with a single gpu.

     Args:
         trainer (modelscope.trainers.EpochBasedTrainer): Trainer to be tested.
         data_loader (nn.Dataloader): Pytorch data loader.
         device (str | torch.device): The target device for the data.
-        metric_classes (List): List of Metric class that uses to collect metrics
         data_loader_iters (int): Used when dataset has no attribute __len__ or only load part of dataset.

     Returns:
@@ -48,13 +43,14 @@
         data_len = data_loader_iters
         desc = 'Test iterations'

+    results = []
+    data_lists = []
     with tqdm(total=data_len, desc=desc) as pbar:
         for i, data in enumerate(data_loader):
             data = to_device(data, device)
             result = trainer.evaluation_step(data)
-            if metric_classes is not None:
-                for metric_cls in metric_classes:
-                    metric_cls.add(result, data)
+            results.append(result)
+            data_lists.append(data)

             if progress_with_iters:
                 batch_size = 1  # iteration count
@@ -75,11 +71,7 @@
             if progress_with_iters and (i + 1) >= data_len:
                 break

-    metric_values = {}
-    for metric_cls in metric_classes:
-        metric_values.update(metric_cls.evaluate())
-
-    return metric_values
+    return results, data_lists


 def multi_gpu_test(trainer,
@@ -87,7 +79,6 @@
                    data_loader,
                    device,
                    tmpdir=None,
                    gpu_collect=False,
-                   metric_classes=None,
                    data_loader_iters_per_gpu=None):
     """Test model in EpochBasedTrainer with multiple gpus.
@@ -104,7 +95,6 @@
         tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode.
         gpu_collect (bool): Option to use either gpu or cpu to collect results.
-        metric_classes(List): List of Metric class that uses to collect metrics
         data_loader_iters_per_gpu (int): Used when dataset has no attribute __len__ or only load part of dataset.

     Returns:
         list: The prediction results.
@@ -180,22 +170,7 @@ def multi_gpu_test(trainer,
         data_list = collect_results_cpu(data_list, total_samples,
                                         os.path.join(tmpdir, 'groundtruth'))

-    if is_master():
-        assert len(data_list) == len(
-            results), f'size mismatch {len(data_list)} and {len(results)}'
-        if metric_classes is not None:
-            for i in range(len(data_list)):
-                for metric_cls in metric_classes:
-                    metric_cls.add(results[i], data_list[i])
-
-    metric_values = {}
-    if rank == 0:
-        for metric_cls in metric_classes:
-            metric_values.update(metric_cls.evaluate())
-    if world_size > 1:
-        metric_values = broadcast(metric_values, 0)
-
-    return metric_values
+    return results, data_list


 def collect_results_cpu(result_part, size, tmpdir=None):
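With metric handling removed from these helpers, `single_gpu_test`/`multi_gpu_test` now return parallel lists of batched predictions and the corresponding input batches, and accumulation happens in the caller (see `EpochBasedTrainer.evaluate` above). A rough sketch of the new contract, assuming a metric object exposing the `add`/`evaluate` interface used in this patch (the helper name is hypothetical):

    from modelscope.trainers.utils.inference import single_gpu_test

    def evaluate_on_single_gpu(trainer, data_loader, device, metric_classes):
        # single_gpu_test no longer takes metric_classes; it returns the raw pairs instead.
        results, data_lists = single_gpu_test(trainer, data_loader, device)
        assert len(results) == len(data_lists)
        metric_values = {}
        for metric_cls in metric_classes:
            for prediction, batch in zip(results, data_lists):
                metric_cls.add(prediction, batch)
            metric_values.update(metric_cls.evaluate())
        return metric_values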
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index 5d466ee0..660355bc 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -1,9 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
 import os
 import shutil
 import tempfile
 import unittest

+import cv2
 import json
 import numpy as np
 import torch
@@ -17,6 +19,8 @@
 from modelscope.metrics.builder import MetricKeys
 from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.trainers.base import DummyTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.trainer import EpochBasedTrainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
@@ -52,6 +56,21 @@ class DummyModel(nn.Module, Model):
         return dict(logits=x, loss=loss)


+@TRAINERS.register_module(module_name='test_vis')
+class VisTrainer(EpochBasedTrainer):
+
+    def visualization(self, results, dataset, **kwargs):
+        num_image = 5
+        f = 'data/test/images/bird.JPEG'
+        filenames = [f for _ in range(num_image)]
+        imgs = [cv2.imread(f) for f in filenames]
+        filenames = [f + str(i) for i in range(num_image)]
+        vis_results = {'images': imgs, 'filenames': filenames}
+
+        # visualization results will be displayed in group named 'eval_vis'
+        self.visualization_buffer.output['eval_vis'] = vis_results
+
+
 class TrainerTest(unittest.TestCase):

     def setUp(self):
@@ -105,6 +124,9 @@
             }, {
                 'type': 'EvaluationHook',
                 'interval': 1
+            }, {
+                'type': 'TensorboardHook',
+                'interval': 1
             }]
         },
         'evaluation': {
@@ -113,7 +135,7 @@
                 'workers_per_gpu': 1,
                 'shuffle': False
             },
-            'metrics': [Metrics.seq_cls_metric]
+            'metrics': [Metrics.seq_cls_metric],
         }
     }
         config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
@@ -138,6 +160,88 @@
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertIn('tensorboard_output', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_train_visualization(self):
+        json_cfg = {
+            'task': Tasks.image_classification,
+            'train': {
+                'work_dir':
+                self.tmp_dir,
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1
+                },
+                'optimizer': {
+                    'type': 'SGD',
+                    'lr': 0.01,
+                    'options': {
+                        'grad_clip': {
+                            'max_norm': 2.0
+                        }
+                    }
+                },
+                'lr_scheduler': {
+                    'type': 'StepLR',
+                    'step_size': 2,
+                    'options': {
+                        'warmup': {
+                            'type': 'LinearWarmup',
+                            'warmup_iters': 2
+                        }
+                    }
+                },
+                'hooks': [{
+                    'type': 'CheckpointHook',
+                    'interval': 1
+                }, {
+                    'type': 'TextLoggerHook',
+                    'interval': 1
+                }, {
+                    'type': 'IterTimerHook'
+                }, {
+                    'type': 'EvaluationHook',
+                    'interval': 1
+                }, {
+                    'type': 'TensorboardHook',
+                    'interval': 1
+                }]
+            },
+            'evaluation': {
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1,
+                    'shuffle': False
+                },
+                'metrics': [Metrics.seq_cls_metric],
+                'visualization': {},
+            }
+        }
+        config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
+        with open(config_path, 'w') as f:
+            json.dump(json_cfg, f)
+
+        trainer_name = 'test_vis'
+        kwargs = dict(
+            cfg_file=config_path,
+            model=DummyModel(),
+            data_collator=None,
+            train_dataset=dummy_dataset_small,
+            eval_dataset=dummy_dataset_small,
+            max_epochs=3,
+            device='cpu')
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_1(self):
@@ -199,6 +303,7 @@
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_with_default_config(self):
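As a quick sanity check mirroring the assertions above: when `out_dir` is not set, `TensorboardHook` writes to `<work_dir>/tensorboard_output`, and the run can be inspected with `tensorboard --logdir <work_dir>/tensorboard_output`. The small helper below is a hypothetical convenience, not part of this patch:

    import glob
    import os

    def has_tensorboard_events(work_dir):
        # TensorboardHook defaults to <work_dir>/tensorboard_output when out_dir is None.
        out_dir = os.path.join(work_dir, 'tensorboard_output')
        return len(glob.glob(os.path.join(out_dir, '*events*'))) > 0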