
add tensorboard hook for visualization

1. add tensorboard hook to default config
2. add image visualization support to tensorboard hook and trainer
3. move evaluation logic out of single_gpu_test and multi_gpu_test so that prediction results are available for further processing, such as result saving and visualization.

visualization results are as follows:
![image.png](https://cn-hangzhou.oss-cdn.aliyun-inc.com/git/force/uploads/comment/29212/38448470860386707/image.png)
![image.png](https://cn-hangzhou.oss-cdn.aliyun-inc.com/git/force/uploads/comment/29212/38437794200606734/image.png)
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10894813
wenmeng.zwm, 3 years ago · commit c9a6b887a2 (master^2)
7 changed files with 240 additions and 48 deletions
  1. modelscope/trainers/base.py (+1, -0)
  2. modelscope/trainers/default_config.py (+17, -9)
  3. modelscope/trainers/hooks/evaluation_hook.py (+11, -1)
  4. modelscope/trainers/hooks/logger/tensorboard_hook.py (+46, -0)
  5. modelscope/trainers/trainer.py (+52, -5)
  6. modelscope/trainers/utils/inference.py (+7, -32)
  7. tests/trainers/test_trainer.py (+106, -1)

modelscope/trainers/base.py (+1, -0)

@@ -33,6 +33,7 @@ class BaseTrainer(ABC):
         else:
             self.args = None
         self.log_buffer = LogBuffer()
+        self.visualization_buffer = LogBuffer()
         self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
 
     def get_or_download_model_dir(self, model, model_revision=None):


modelscope/trainers/default_config.py (+17, -9)

@@ -4,15 +4,23 @@ from modelscope.utils.config import Config


 DEFAULT_CONFIG = {
     'train': {
-        'hooks': [{
-            'type': 'CheckpointHook',
-            'interval': 1
-        }, {
-            'type': 'TextLoggerHook',
-            'interval': 10
-        }, {
-            'type': 'IterTimerHook'
-        }]
+        'hooks': [
+            {
+                'type': 'CheckpointHook',
+                'interval': 1
+            },
+            {
+                'type': 'TextLoggerHook',
+                'interval': 10
+            },
+            {
+                'type': 'IterTimerHook'
+            },
+            {
+                'type': 'TensorboardHook',
+                'interval': 10
+            },
+        ]
     }
 }




modelscope/trainers/hooks/evaluation_hook.py (+11, -1)

@@ -1,4 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from collections import OrderedDict
+
 from modelscope.metainfo import Hooks
 from .builder import HOOKS
 from .hook import Hook
@@ -30,11 +32,19 @@ class EvaluationHook(Hook):
         if self.by_epoch and self._should_evaluate(trainer):
             self.do_evaluate(trainer)
 
+    def add_visualization_info(self, trainer, results):
+        if trainer.visualization_buffer.output.get('eval_results',
+                                                   None) is None:
+            trainer.visualization_buffer.output['eval_results'] = OrderedDict()
+
+        trainer.visualization_buffer.output['eval_results'].update(
+            trainer.visualize(results))
+
     def do_evaluate(self, trainer):
         """Evaluate the results."""
         eval_res = trainer.evaluate()
         for name, val in eval_res.items():
-            trainer.log_buffer.output[name] = val
+            trainer.log_buffer.output['evaluation/' + name] = val
 
         trainer.log_buffer.ready = True
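Two notes on this hunk: the `'evaluation/' + name` prefix means scalar metrics are now logged under an `evaluation/` namespace (e.g. `evaluation/accuracy`), and `add_visualization_info` relies on a `trainer.visualize(results)` method that is not part of this diff. Below is a hedged sketch of what such a method could return so that the buffer update above and `TensorboardHook.visualization_log` fit together; the class name, image shapes, and filenames are illustrative assumptions, not part of this commit.

```python
import numpy as np

from modelscope.trainers.trainer import EpochBasedTrainer


class MyEvalVisTrainer(EpochBasedTrainer):  # hypothetical subclass, not in this PR

    def visualize(self, results):
        # Whatever is returned here gets merged into
        # visualization_buffer.output['eval_results'], so TensorboardHook will
        # read 'images'/'filenames' and log them under the 'eval_results' group.
        images = [np.zeros((32, 32, 3), dtype=np.uint8) for _ in results[:2]]
        filenames = [f'eval_{i}' for i in range(len(images))]
        return {'images': images, 'filenames': filenames}
```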




modelscope/trainers/hooks/logger/tensorboard_hook.py (+46, -0)

@@ -1,6 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 
+import numpy as np
+import torch
+
 from modelscope.metainfo import Hooks
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.utils.constant import LogKeys
@@ -50,10 +53,14 @@ class TensorboardHook(LoggerHook):


         if self.out_dir is None:
             self.out_dir = os.path.join(trainer.work_dir, 'tensorboard_output')
+            trainer.logger.info(
+                f'tensorboard files will be saved to {self.out_dir}')
         self.writer = SummaryWriter(self.out_dir)
 
     @master_only
     def log(self, trainer):
+        if len(trainer.visualization_buffer.output) > 0:
+            self.visualization_log(trainer)
         for key, val in trainer.log_buffer.output.items():
             if key in self.skip_keys:
                 continue
@@ -63,6 +70,45 @@ class TensorboardHook(LoggerHook):
                 self.writer.add_scalar(key, val, self.get_iter(trainer))
             else:
                 pass
+        self.writer.flush()
+
+    def visualization_log(self, trainer):
+        """Images visualization.
+        `visualization_buffer` is a dictionary containing:
+            images (list): list of visualized images.
+            filenames (list of str, optional): image filenames.
+        """
+        visual_results = trainer.visualization_buffer.output
+        for vis_key, vis_result in visual_results.items():
+            images = vis_result.get('images', [])
+            filenames = vis_result.get('filenames', None)
+            if filenames is not None:
+                assert len(images) == len(
+                    filenames
+                ), 'Output `images` and `filenames` must keep the same length!'
+
+            for i, img in enumerate(images):
+                if isinstance(img, np.ndarray):
+                    img = torch.from_numpy(img)
+                else:
+                    assert isinstance(
+                        img, torch.Tensor
+                    ), f'Only np.ndarray and torch.Tensor are supported! Got {type(img)} for img {filenames[i]}'
+
+                default_name = 'image_%i' % i
+                filename = filenames[
+                    i] if filenames is not None else default_name
+                self.writer.add_image(
+                    f'{vis_key}/{filename}',
+                    img,
+                    self.get_iter(trainer),
+                    dataformats='HWC')
+
+    def after_train_iter(self, trainer):
+        super(TensorboardHook, self).after_train_iter(trainer)
+        # Clear visualization_buffer after each iter so each image buffer is
+        # only written once, instead of being re-written every self.interval.
+        trainer.visualization_buffer.clear_output()
+
     @master_only
     def after_run(self, trainer):
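For reference, this is roughly the writer call `visualization_log` ends up making for each buffered image; a standalone sketch (output directory and tag are illustrative), showing why `dataformats='HWC'` fits images loaded with cv2/NumPy, where channels come last:

```python
import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('/tmp/tensorboard_output')  # output dir is illustrative
img = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)  # HWC uint8 image
# Tag follows the '<group>/<filename>' pattern used by the hook above.
writer.add_image('eval_vis/sample_0', img, global_step=0, dataformats='HWC')
writer.flush()
writer.close()
```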


modelscope/trainers/trainer.py (+52, -5)

@@ -37,7 +37,8 @@ from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
 from modelscope.utils.torch_utils import (get_dist_info, get_local_rank,
-                                          init_dist, set_random_seed)
+                                          init_dist, is_master,
+                                          set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
 from .default_config import merge_cfg
@@ -940,27 +941,73 @@ class EpochBasedTrainer(BaseTrainer):
""" """
if self._dist and self.cfg.model.get('model_parallel_size', 1) == 1: if self._dist and self.cfg.model.get('model_parallel_size', 1) == 1:
from modelscope.trainers.utils.inference import multi_gpu_test from modelscope.trainers.utils.inference import multi_gpu_test
metric_values = multi_gpu_test(
# list of batched result and data samples
results, data_list = multi_gpu_test(
self, self,
data_loader, data_loader,
device=self.device, device=self.device,
tmpdir=None, tmpdir=None,
gpu_collect=False, gpu_collect=False,
metric_classes=metric_classes,
data_loader_iters_per_gpu=self._eval_iters_per_epoch) data_loader_iters_per_gpu=self._eval_iters_per_epoch)
else: else:
from modelscope.trainers.utils.inference import single_gpu_test from modelscope.trainers.utils.inference import single_gpu_test
metric_values = single_gpu_test(
results, data_list = single_gpu_test(
self, self,
data_loader, data_loader,
device=self.device, device=self.device,
metric_classes=metric_classes,
data_loader_iters=self._eval_iters_per_epoch) data_loader_iters=self._eval_iters_per_epoch)


self._inner_iter = self.iters_per_epoch - 1 # start from index 0 self._inner_iter = self.iters_per_epoch - 1 # start from index 0


# evaluation result processing
if hasattr(self.cfg.evaluation, 'visualization'):
flatten_results = []
for r in results:
flatten_results.extend(r)
vis_cfg = self.cfg.evaluation.visualization
self.visualization(results, self.eval_dataset, **vis_cfg)

# do evaluation on rank0
metric_values = {}
if not self._dist or is_master():
assert len(data_list) == len(
results), f'size mismatch {len(data_list)} and {len(results)}'
for metric_cls in metric_classes:
for idx in range(len(data_list)):
metric_cls.add(results[idx], data_list[idx])

for metric_cls in metric_classes:
metric_values.update(metric_cls.evaluate())

return metric_values return metric_values


def visualization(self, results, dataset, **kwargs):
""" visualization function for evaluation results.

Args:
results (list(dict)): a list of result dict.
dataset (:obj:`Dataset`): torch dataset object to access original data.

Implementation Examples:
```python
# draw list of images as numpy array
images = draw_images(num_of_visualization)

# set displayed name for each image
filenames = get_image_display_names()
vis_results = {
'images': images,
'filenames' : filenames
}

# visualization results will be displayed in group named eva_vis
self.visualization_buffer.output['eval_vis'] = vis_results
```
"""
# TODO @wenmeng.zwm add visualization support for cv evaluation
raise NotImplementedError(
'visualization for evaluation will be supported in the future')

def register_hook(self, hook: Hook) -> None: def register_hook(self, hook: Hook) -> None:
"""Register a hook into the hook list. """Register a hook into the hook list.




modelscope/trainers/utils/inference.py (+7, -32)

@@ -15,18 +15,13 @@ from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
                                           make_tmp_dir)
 
 
-def single_gpu_test(trainer,
-                    data_loader,
-                    device,
-                    metric_classes=None,
-                    data_loader_iters=None):
+def single_gpu_test(trainer, data_loader, device, data_loader_iters=None):
     """Test model in EpochBasedTrainer with a single gpu.
 
     Args:
         trainer (modelscope.trainers.EpochBasedTrainer): Trainer to be tested.
         data_loader (nn.Dataloader): Pytorch data loader.
         device (str | torch.device): The target device for the data.
-        metric_classes (List): List of Metric class that uses to collect metrics
         data_loader_iters (int): Used when dataset has no attribute __len__ or only load part of dataset.
 
     Returns:
@@ -48,13 +43,14 @@ def single_gpu_test(trainer,
         data_len = data_loader_iters
         desc = 'Test iterations'
 
+    results = []
+    data_lists = []
     with tqdm(total=data_len, desc=desc) as pbar:
         for i, data in enumerate(data_loader):
             data = to_device(data, device)
             result = trainer.evaluation_step(data)
-            if metric_classes is not None:
-                for metric_cls in metric_classes:
-                    metric_cls.add(result, data)
+            results.append(result)
+            data_lists.append(data)
 
             if progress_with_iters:
                 batch_size = 1  # iteration count
@@ -75,11 +71,7 @@ def single_gpu_test(trainer,
             if progress_with_iters and (i + 1) >= data_len:
                 break
 
-    metric_values = {}
-    for metric_cls in metric_classes:
-        metric_values.update(metric_cls.evaluate())
-
-    return metric_values
+    return results, data_lists
 
 
 def multi_gpu_test(trainer,
@@ -87,7 +79,6 @@ def multi_gpu_test(trainer,
                    device,
                    tmpdir=None,
                    gpu_collect=False,
-                   metric_classes=None,
                    data_loader_iters_per_gpu=None):
     """Test model in EpochBasedTrainer with multiple gpus.
 
@@ -104,7 +95,6 @@ def multi_gpu_test(trainer,
         tmpdir (str): Path of directory to save the temporary results from
             different gpus under cpu mode.
         gpu_collect (bool): Option to use either gpu or cpu to collect results.
-        metric_classes(List): List of Metric class that uses to collect metrics
         data_loader_iters_per_gpu (int): Used when dataset has no attribute __len__ or only load part of dataset.
     Returns:
         list: The prediction results.
@@ -180,22 +170,7 @@ def multi_gpu_test(trainer,
     data_list = collect_results_cpu(data_list, total_samples,
                                     os.path.join(tmpdir, 'groundtruth'))
 
-    if is_master():
-        assert len(data_list) == len(
-            results), f'size mismatch {len(data_list)} and {len(results)}'
-        if metric_classes is not None:
-            for i in range(len(data_list)):
-                for metric_cls in metric_classes:
-                    metric_cls.add(results[i], data_list[i])
-
-    metric_values = {}
-    if rank == 0:
-        for metric_cls in metric_classes:
-            metric_values.update(metric_cls.evaluate())
-    if world_size > 1:
-        metric_values = broadcast(metric_values, 0)
-
-    return metric_values
+    return results, data_list
 
 
 def collect_results_cpu(result_part, size, tmpdir=None):
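Since metric bookkeeping moved out of these helpers, the caller now aggregates metrics itself; a hedged sketch of the new calling pattern, mirroring the `evaluation_loop` change above (`trainer`, `data_loader` and `metric_classes` are assumed to already exist):

```python
from modelscope.trainers.utils.inference import single_gpu_test

# single_gpu_test now returns the raw batched predictions and inputs.
results, data_list = single_gpu_test(trainer, data_loader, device='cpu')

# Rank-0 style aggregation, as done in EpochBasedTrainer.evaluation_loop.
metric_values = {}
for metric_cls in metric_classes:
    for batch_result, batch_data in zip(results, data_list):
        metric_cls.add(batch_result, batch_data)
for metric_cls in metric_classes:
    metric_values.update(metric_cls.evaluate())
```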


tests/trainers/test_trainer.py (+106, -1)

@@ -1,9 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
 import os
 import shutil
 import tempfile
 import unittest
 
+import cv2
 import json
 import numpy as np
 import torch
@@ -17,6 +19,8 @@ from modelscope.metrics.builder import MetricKeys
 from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.trainers.base import DummyTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.trainer import EpochBasedTrainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level

@@ -52,6 +56,21 @@ class DummyModel(nn.Module, Model):
         return dict(logits=x, loss=loss)
 
 
+@TRAINERS.register_module(module_name='test_vis')
+class VisTrainer(EpochBasedTrainer):
+
+    def visualization(self, results, dataset, **kwargs):
+        num_image = 5
+        f = 'data/test/images/bird.JPEG'
+        filenames = [f for _ in range(num_image)]
+        imgs = [cv2.imread(f) for f in filenames]
+        filenames = [f + str(i) for i in range(num_image)]
+        vis_results = {'images': imgs, 'filenames': filenames}
+
+        # visualization results will be displayed in a group named eval_vis
+        self.visualization_buffer.output['eval_vis'] = vis_results
+
+
 class TrainerTest(unittest.TestCase):
 
     def setUp(self):
@@ -105,6 +124,9 @@ class TrainerTest(unittest.TestCase):
             }, {
                 'type': 'EvaluationHook',
                 'interval': 1
+            }, {
+                'type': 'TensorboardHook',
+                'interval': 1
             }]
         },
         'evaluation': {
@@ -113,7 +135,7 @@ class TrainerTest(unittest.TestCase):
                 'workers_per_gpu': 1,
                 'shuffle': False
             },
-            'metrics': [Metrics.seq_cls_metric]
+            'metrics': [Metrics.seq_cls_metric],
         }
     }
     config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
@@ -138,6 +160,88 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertIn('tensorboard_output', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_train_visualization(self):
+        json_cfg = {
+            'task': Tasks.image_classification,
+            'train': {
+                'work_dir':
+                self.tmp_dir,
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1
+                },
+                'optimizer': {
+                    'type': 'SGD',
+                    'lr': 0.01,
+                    'options': {
+                        'grad_clip': {
+                            'max_norm': 2.0
+                        }
+                    }
+                },
+                'lr_scheduler': {
+                    'type': 'StepLR',
+                    'step_size': 2,
+                    'options': {
+                        'warmup': {
+                            'type': 'LinearWarmup',
+                            'warmup_iters': 2
+                        }
+                    }
+                },
+                'hooks': [{
+                    'type': 'CheckpointHook',
+                    'interval': 1
+                }, {
+                    'type': 'TextLoggerHook',
+                    'interval': 1
+                }, {
+                    'type': 'IterTimerHook'
+                }, {
+                    'type': 'EvaluationHook',
+                    'interval': 1
+                }, {
+                    'type': 'TensorboardHook',
+                    'interval': 1
+                }]
+            },
+            'evaluation': {
+                'dataloader': {
+                    'batch_size_per_gpu': 2,
+                    'workers_per_gpu': 1,
+                    'shuffle': False
+                },
+                'metrics': [Metrics.seq_cls_metric],
+                'visualization': {},
+            }
+        }
+        config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
+        with open(config_path, 'w') as f:
+            json.dump(json_cfg, f)
+
+        trainer_name = 'test_vis'
+        kwargs = dict(
+            cfg_file=config_path,
+            model=DummyModel(),
+            data_collator=None,
+            train_dataset=dummy_dataset_small,
+            eval_dataset=dummy_dataset_small,
+            max_epochs=3,
+            device='cpu')
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_1(self):
@@ -199,6 +303,7 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
+        self.assertTrue(len(glob.glob(f'{self.tmp_dir}/*/*events*')) > 0)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_with_default_config(self):

