This PR makes three changes:

1. Add a TensorboardHook to the default training config.
2. Add image visualization support to the TensorboardHook and trainer.
3. Move evaluation logic out of `single_gpu_test` and `multi_gpu_test` so that prediction results are available for further processing, such as result saving and visualization.

Visualization results are shown at: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10894813
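For orientation before the diffs, here is a minimal sketch of how the pieces are meant to fit together. The subclass name, dummy image data, and group key are illustrative assumptions, not part of this change:

```python
import numpy as np

from modelscope.trainers.trainer import EpochBasedTrainer


class MyVisTrainer(EpochBasedTrainer):
    """Hypothetical subclass: override `visualization` to fill the buffer."""

    def visualization(self, results, dataset, **kwargs):
        # render predictions from `results` however the task requires;
        # dummy HWC uint8 images stand in for real drawings here
        images = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(4)]
        filenames = [f'sample_{i}' for i in range(4)]
        # the TensorboardHook writes these under the `eval_vis` image group
        self.visualization_buffer.output['eval_vis'] = {
            'images': images,
            'filenames': filenames,
        }
```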
@@ -33,6 +33,7 @@ class BaseTrainer(ABC):
         else:
             self.args = None
         self.log_buffer = LogBuffer()
+        self.visualization_buffer = LogBuffer()
         self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())

     def get_or_download_model_dir(self, model, model_revision=None):
@@ -4,15 +4,23 @@ from modelscope.utils.config import Config

 DEFAULT_CONFIG = {
     'train': {
-        'hooks': [{
-            'type': 'CheckpointHook',
-            'interval': 1
-        }, {
-            'type': 'TextLoggerHook',
-            'interval': 10
-        }, {
-            'type': 'IterTimerHook'
-        }]
+        'hooks': [
+            {
+                'type': 'CheckpointHook',
+                'interval': 1
+            },
+            {
+                'type': 'TextLoggerHook',
+                'interval': 10
+            },
+            {
+                'type': 'IterTimerHook'
+            },
+            {
+                'type': 'TensorboardHook',
+                'interval': 10
+            },
+        ]
     }
 }
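Since TensorboardHook now ships in DEFAULT_CONFIG, a user configuration only needs an `evaluation.visualization` section to opt into image visualization. A hedged sketch of the relevant fragment:

```python
user_cfg = {
    'evaluation': {
        # the presence of this key makes the trainer call visualization()
        # after each evaluation; its contents are forwarded as **kwargs
        'visualization': {}
    }
}
```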
@@ -1,4 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from collections import OrderedDict
+
 from modelscope.metainfo import Hooks
 from .builder import HOOKS
 from .hook import Hook
@@ -30,11 +32,19 @@ class EvaluationHook(Hook):
         if self.by_epoch and self._should_evaluate(trainer):
             self.do_evaluate(trainer)

+    def add_visualization_info(self, trainer, results):
+        if trainer.visualization_buffer.output.get('eval_results',
+                                                   None) is None:
+            trainer.visualization_buffer.output['eval_results'] = OrderedDict()
+
+        trainer.visualization_buffer.output['eval_results'].update(
+            trainer.visualize(results))
+
     def do_evaluate(self, trainer):
         """Evaluate the results."""
         eval_res = trainer.evaluate()
         for name, val in eval_res.items():
-            trainer.log_buffer.output[name] = val
+            trainer.log_buffer.output['evaluation/' + name] = val
         trainer.log_buffer.ready = True
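Effect of the key rename: a metric named `accuracy` is now logged under the scalar tag `evaluation/accuracy`, so all evaluation metrics land in a single TensorBoard group.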
@@ -1,6 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os

+import numpy as np
+import torch
+
 from modelscope.metainfo import Hooks
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.utils.constant import LogKeys
@@ -50,10 +53,14 @@ class TensorboardHook(LoggerHook):
         if self.out_dir is None:
             self.out_dir = os.path.join(trainer.work_dir, 'tensorboard_output')
         trainer.logger.info(
             f'tensorboard files will be saved to {self.out_dir}')
         self.writer = SummaryWriter(self.out_dir)

     @master_only
     def log(self, trainer):
+        if len(trainer.visualization_buffer.output) > 0:
+            self.visualization_log(trainer)
         for key, val in trainer.log_buffer.output.items():
             if key in self.skip_keys:
                 continue
@@ -63,6 +70,45 @@ class TensorboardHook(LoggerHook):
                 self.writer.add_scalar(key, val, self.get_iter(trainer))
             else:
                 pass
+        self.writer.flush()
+
+    def visualization_log(self, trainer):
+        """Image visualization.
+
+        Each entry of `visualization_buffer` is a dictionary containing:
+            images (list): list of visualized images.
+            filenames (list of str, optional): image filenames.
+        """
+        visual_results = trainer.visualization_buffer.output
+        for vis_key, vis_result in visual_results.items():
+            images = vis_result.get('images', [])
+            filenames = vis_result.get('filenames', None)
+            if filenames is not None:
+                assert len(images) == len(
+                    filenames
+                ), 'Output `images` and `filenames` must keep the same length!'
+            for i, img in enumerate(images):
+                if isinstance(img, np.ndarray):
+                    img = torch.from_numpy(img)
+                else:
+                    assert isinstance(
+                        img, torch.Tensor
+                    ), f'Only np.ndarray and torch.Tensor types are supported! Got {type(img)} for image {i}'
+                default_name = 'image_%i' % i
+                filename = filenames[
+                    i] if filenames is not None else default_name
+                self.writer.add_image(
+                    f'{vis_key}/{filename}',
+                    img,
+                    self.get_iter(trainer),
+                    dataformats='HWC')
+
+    def after_train_iter(self, trainer):
+        super(TensorboardHook, self).after_train_iter(trainer)
+        # Clear visualization_buffer after each iteration so each image
+        # buffer is written only once, instead of being rewritten every
+        # self.interval iterations.
+        trainer.visualization_buffer.clear_output()
+
     @master_only
     def after_run(self, trainer):
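For reference, a standalone sketch of the write that `visualization_log` performs for each buffered image; the output directory and tag names here are made up:

```python
import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('/tmp/tensorboard_output')  # hypothetical out_dir
img = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)  # HWC image
# equivalent to buffering {'images': [img], 'filenames': ['bird_0']} under
# the group key 'eval_vis' in trainer.visualization_buffer
writer.add_image('eval_vis/bird_0', img, global_step=0, dataformats='HWC')
writer.flush()
```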
@@ -37,7 +37,8 @@ from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
 from modelscope.utils.torch_utils import (get_dist_info, get_local_rank,
-                                          init_dist, set_random_seed)
+                                          init_dist, is_master,
+                                          set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
 from .default_config import merge_cfg
@@ -940,27 +941,73 @@ class EpochBasedTrainer(BaseTrainer):
         """
         if self._dist and self.cfg.model.get('model_parallel_size', 1) == 1:
             from modelscope.trainers.utils.inference import multi_gpu_test
-            metric_values = multi_gpu_test(
+            # lists of batched results and data samples
+            results, data_list = multi_gpu_test(
                 self,
                 data_loader,
                 device=self.device,
                 tmpdir=None,
                 gpu_collect=False,
-                metric_classes=metric_classes,
                 data_loader_iters_per_gpu=self._eval_iters_per_epoch)
         else:
             from modelscope.trainers.utils.inference import single_gpu_test
-            metric_values = single_gpu_test(
+            results, data_list = single_gpu_test(
                 self,
                 data_loader,
                 device=self.device,
-                metric_classes=metric_classes,
                 data_loader_iters=self._eval_iters_per_epoch)

         self._inner_iter = self.iters_per_epoch - 1  # start from index 0

+        # evaluation result processing
+        if hasattr(self.cfg.evaluation, 'visualization'):
+            flatten_results = []
+            for r in results:
+                flatten_results.extend(r)
+            vis_cfg = self.cfg.evaluation.visualization
+            self.visualization(flatten_results, self.eval_dataset, **vis_cfg)
+
+        # do evaluation on rank0
+        metric_values = {}
+        if not self._dist or is_master():
+            assert len(data_list) == len(
+                results), f'size mismatch {len(data_list)} and {len(results)}'
+            for metric_cls in metric_classes:
+                for idx in range(len(data_list)):
+                    metric_cls.add(results[idx], data_list[idx])
+            for metric_cls in metric_classes:
+                metric_values.update(metric_cls.evaluate())
+
         return metric_values

+    def visualization(self, results, dataset, **kwargs):
+        """Visualization function for evaluation results.
+
+        Args:
+            results (list[dict]): a list of result dicts.
+            dataset (:obj:`Dataset`): torch dataset object used to access
+                the original data.
+
+        Implementation example:
+
+        ```python
+        # draw a list of images as numpy arrays
+        images = draw_images(num_of_visualization)
+
+        # set the displayed name for each image
+        filenames = get_image_display_names()
+        vis_results = {
+            'images': images,
+            'filenames': filenames
+        }
+
+        # visualization results will be displayed in a group named `eval_vis`
+        self.visualization_buffer.output['eval_vis'] = vis_results
+        ```
+        """
+        # TODO @wenmeng.zwm add visualization support for cv evaluation
+        raise NotImplementedError(
+            'visualization for evaluation will be supported in the future')
+
     def register_hook(self, hook: Hook) -> None:
         """Register a hook into the hook list.
@@ -15,18 +15,13 @@ from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
                                           make_tmp_dir)


-def single_gpu_test(trainer,
-                    data_loader,
-                    device,
-                    metric_classes=None,
-                    data_loader_iters=None):
+def single_gpu_test(trainer, data_loader, device, data_loader_iters=None):
     """Test model in EpochBasedTrainer with a single gpu.

     Args:
         trainer (modelscope.trainers.EpochBasedTrainer): Trainer to be tested.
         data_loader (nn.Dataloader): PyTorch data loader.
         device (str | torch.device): The target device for the data.
-        metric_classes (List): List of Metric classes used to collect metrics.
         data_loader_iters (int): Used when the dataset has no __len__
             attribute, or when only part of the dataset should be loaded.

     Returns:
@@ -48,13 +43,14 @@ def single_gpu_test(trainer,
         data_len = data_loader_iters
         desc = 'Test iterations'

     results = []
+    data_lists = []
     with tqdm(total=data_len, desc=desc) as pbar:
         for i, data in enumerate(data_loader):
             data = to_device(data, device)
             result = trainer.evaluation_step(data)
-            if metric_classes is not None:
-                for metric_cls in metric_classes:
-                    metric_cls.add(result, data)
             results.append(result)
+            data_lists.append(data)

             if progress_with_iters:
                 batch_size = 1  # iteration count
@@ -75,11 +71,7 @@ def single_gpu_test(trainer,
         if progress_with_iters and (i + 1) >= data_len:
             break

-    metric_values = {}
-    for metric_cls in metric_classes:
-        metric_values.update(metric_cls.evaluate())
-
-    return metric_values
+    return results, data_lists

def multi_gpu_test(trainer,
@@ -87,7 +79,6 @@ def multi_gpu_test(trainer,
                    device,
                    tmpdir=None,
                    gpu_collect=False,
-                   metric_classes=None,
                    data_loader_iters_per_gpu=None):
     """Test model in EpochBasedTrainer with multiple gpus.

@@ -104,7 +95,6 @@ def multi_gpu_test(trainer,
         tmpdir (str): Path of a directory to save the temporary results from
             different gpus under cpu mode.
         gpu_collect (bool): Option to use either gpu or cpu to collect results.
-        metric_classes (List): List of Metric classes used to collect metrics.
         data_loader_iters_per_gpu (int): Used when the dataset has no __len__
             attribute, or when only part of the dataset should be loaded.

     Returns:
         list: The prediction results.
@@ -180,22 +170,7 @@ def multi_gpu_test(trainer,
         data_list = collect_results_cpu(data_list, total_samples,
                                         os.path.join(tmpdir, 'groundtruth'))

-    if is_master():
-        assert len(data_list) == len(
-            results), f'size mismatch {len(data_list)} and {len(results)}'
-        if metric_classes is not None:
-            for i in range(len(data_list)):
-                for metric_cls in metric_classes:
-                    metric_cls.add(results[i], data_list[i])
-
-    metric_values = {}
-    if rank == 0:
-        for metric_cls in metric_classes:
-            metric_values.update(metric_cls.evaluate())
-    if world_size > 1:
-        metric_values = broadcast(metric_values, 0)
-
-    return metric_values
+    return results, data_list


def collect_results_cpu(result_part, size, tmpdir=None):
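With metric handling lifted out of the test helpers, callers follow this pattern, mirroring `EpochBasedTrainer.evaluate` above. A sketch assuming `trainer`, `data_loader`, and `metric_classes` are already constructed:

```python
from modelscope.trainers.utils.inference import single_gpu_test

results, data_list = single_gpu_test(trainer, data_loader, device='cpu')

# metrics are now accumulated by the caller, one batch at a time
for metric_cls in metric_classes:
    for batch_result, batch_data in zip(results, data_list):
        metric_cls.add(batch_result, batch_data)

metric_values = {}
for metric_cls in metric_classes:
    metric_values.update(metric_cls.evaluate())
```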
@@ -1,9 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
 import os
 import shutil
 import tempfile
 import unittest

+import cv2
 import json
 import numpy as np
 import torch
@@ -17,6 +19,8 @@ from modelscope.metrics.builder import MetricKeys
 from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.trainers.base import DummyTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.trainer import EpochBasedTrainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
@@ -52,6 +56,21 @@ class DummyModel(nn.Module, Model):
         return dict(logits=x, loss=loss)


+@TRAINERS.register_module(module_name='test_vis')
+class VisTrainer(EpochBasedTrainer):
+
+    def visualization(self, results, dataset, **kwargs):
+        num_image = 5
+        f = 'data/test/images/bird.JPEG'
+        filenames = [f for _ in range(num_image)]
+        imgs = [cv2.imread(f) for f in filenames]
+        filenames = [f + str(i) for i in range(num_image)]
+        vis_results = {'images': imgs, 'filenames': filenames}
+        # visualization results will be displayed in a group named `eval_vis`
+        self.visualization_buffer.output['eval_vis'] = vis_results
+
+
 class TrainerTest(unittest.TestCase):

     def setUp(self):
@@ -105,6 +124,9 @@ class TrainerTest(unittest.TestCase):
                 }, {
                     'type': 'EvaluationHook',
                     'interval': 1
+                }, {
+                    'type': 'TensorboardHook',
+                    'interval': 1
                 }]
             },
             'evaluation': {
@@ -113,7 +135,7 @@ class TrainerTest(unittest.TestCase):
                     'workers_per_gpu': 1,
                     'shuffle': False
                 },
-                'metrics': [Metrics.seq_cls_metric]
+                'metrics': [Metrics.seq_cls_metric],
             }
         }
         config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
@@ -138,6 +160,88 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertIn('tensorboard_output', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_train_visualization(self):
+        json_cfg = {
+            'task': Tasks.image_classification,
+            'train': {
+                'work_dir':
+                self.tmp_dir,
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1
+                },
+                'optimizer': {
+                    'type': 'SGD',
+                    'lr': 0.01,
+                    'options': {
+                        'grad_clip': {
+                            'max_norm': 2.0
+                        }
+                    }
+                },
+                'lr_scheduler': {
+                    'type': 'StepLR',
+                    'step_size': 2,
+                    'options': {
+                        'warmup': {
+                            'type': 'LinearWarmup',
+                            'warmup_iters': 2
+                        }
+                    }
+                },
+                'hooks': [{
+                    'type': 'CheckpointHook',
+                    'interval': 1
+                }, {
+                    'type': 'TextLoggerHook',
+                    'interval': 1
+                }, {
+                    'type': 'IterTimerHook'
+                }, {
+                    'type': 'EvaluationHook',
+                    'interval': 1
+                }, {
+                    'type': 'TensorboardHook',
+                    'interval': 1
+                }]
+            },
+            'evaluation': {
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1,
+                    'shuffle': False
+                },
+                'metrics': [Metrics.seq_cls_metric],
+                'visualization': {},
+            }
+        }
+
+        config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
+        with open(config_path, 'w') as f:
+            json.dump(json_cfg, f)
+
+        trainer_name = 'test_vis'
+        kwargs = dict(
+            cfg_file=config_path,
+            model=DummyModel(),
+            data_collator=None,
+            train_dataset=dummy_dataset_small,
+            eval_dataset=dummy_dataset_small,
+            max_epochs=3,
+            device='cpu')
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_1(self):

@@ -199,6 +303,7 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_with_default_config(self):