Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9644184 * fix distributed training and evalmaster
| @@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel): | |||||
| model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | ||||
| self.model = NAFNet(**self.config.model.network_g) | self.model = NAFNet(**self.config.model.network_g) | ||||
| self.loss = PSNRLoss() | self.loss = PSNRLoss() | ||||
| if torch.cuda.is_available(): | |||||
| self._device = torch.device('cuda') | |||||
| else: | |||||
| self._device = torch.device('cpu') | |||||
| self.model = self.model.to(self._device) | |||||
| self.model = self._load_pretrained(self.model, model_path) | self.model = self._load_pretrained(self.model, model_path) | ||||
| if self.training: | |||||
| self.model.train() | |||||
| else: | |||||
| self.model.eval() | |||||
| def _load_pretrained(self, | def _load_pretrained(self, | ||||
| net, | net, | ||||
| load_path, | load_path, | ||||
| @@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel): | |||||
| Returns: | Returns: | ||||
| Dict[str, Tensor]: results | Dict[str, Tensor]: results | ||||
| """ | """ | ||||
| for key, value in inputs.items(): | |||||
| inputs[key] = inputs[key].to(self._device) | |||||
| if self.training: | if self.training: | ||||
| return self._train_forward(**inputs) | return self._train_forward(**inputs) | ||||
| elif 'target' in inputs: | elif 'target' in inputs: | ||||
| @@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||
| from .base import Preprocessor | from .base import Preprocessor | ||||
| from .builder import PREPROCESSORS, build_preprocessor | from .builder import PREPROCESSORS, build_preprocessor | ||||
| from .common import Compose | |||||
| from .common import Compose, ToTensor, Filter | |||||
| from .asr import WavToScp | from .asr import WavToScp | ||||
| from .audio import LinearAECAndFbank | from .audio import LinearAECAndFbank | ||||
| from .image import (LoadImage, load_image, | from .image import (LoadImage, load_image, | ||||
| @@ -33,7 +33,7 @@ else: | |||||
| _import_structure = { | _import_structure = { | ||||
| 'base': ['Preprocessor'], | 'base': ['Preprocessor'], | ||||
| 'builder': ['PREPROCESSORS', 'build_preprocessor'], | 'builder': ['PREPROCESSORS', 'build_preprocessor'], | ||||
| 'common': ['Compose'], | |||||
| 'common': ['Compose', 'ToTensor', 'Filter'], | |||||
| 'audio': ['LinearAECAndFbank'], | 'audio': ['LinearAECAndFbank'], | ||||
| 'asr': ['WavToScp'], | 'asr': ['WavToScp'], | ||||
| 'video': ['ReadVideoData'], | 'video': ['ReadVideoData'], | ||||
| @@ -2,6 +2,10 @@ | |||||
| import time | import time | ||||
| from collections.abc import Sequence | from collections.abc import Sequence | ||||
| from typing import Mapping | |||||
| import numpy as np | |||||
| import torch | |||||
| from .builder import PREPROCESSORS, build_preprocessor | from .builder import PREPROCESSORS, build_preprocessor | ||||
| @@ -25,12 +29,18 @@ class Compose(object): | |||||
| if isinstance(transform, dict): | if isinstance(transform, dict): | ||||
| if self.field_name is None: | if self.field_name is None: | ||||
| transform = build_preprocessor(transform, field_name) | transform = build_preprocessor(transform, field_name) | ||||
| self.transforms.append(transform) | |||||
| else: | |||||
| # if not found key in field_name, try field_name=None(default_group) | |||||
| try: | |||||
| transform = build_preprocessor(transform, field_name) | |||||
| except KeyError: | |||||
| transform = build_preprocessor(transform, None) | |||||
| elif callable(transform): | elif callable(transform): | ||||
| self.transforms.append(transform) | |||||
| pass | |||||
| else: | else: | ||||
| raise TypeError('transform must be callable or a dict, but got' | raise TypeError('transform must be callable or a dict, but got' | ||||
| f' {type(transform)}') | f' {type(transform)}') | ||||
| self.transforms.append(transform) | |||||
| def __call__(self, data): | def __call__(self, data): | ||||
| for t in self.transforms: | for t in self.transforms: | ||||
| @@ -52,3 +62,82 @@ class Compose(object): | |||||
| format_string += f'\n {t}' | format_string += f'\n {t}' | ||||
| format_string += '\n)' | format_string += '\n)' | ||||
| return format_string | return format_string | ||||
def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence` (except :class:`str`), :class:`int`, :class:`float`
    and NumPy integer/floating scalars.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.

    Returns:
        torch.Tensor: ``data`` itself if it already is a tensor, otherwise a
            new tensor. Scalars become 1-element tensors.

    Raises:
        TypeError: If ``data`` is of an unsupported type (e.g. ``str``).
    """
    if isinstance(data, torch.Tensor):
        return data
    if isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    if isinstance(data, Sequence) and not isinstance(data, str):
        return torch.tensor(data)
    if isinstance(data, (np.integer, np.floating)):
        # NumPy scalars (np.int64, np.float32, ...) are generally not
        # subclasses of int/float; unwrap them so they take the same
        # scalar path as native Python numbers instead of raising.
        data = data.item()
    if isinstance(data, int):
        return torch.LongTensor([data])
    if isinstance(data, float):
        return torch.FloatTensor([data])
    raise TypeError(f'type {type(data)} cannot be converted to tensor.')
@PREPROCESSORS.register_module()
class ToTensor(object):
    """Convert target object to tensor.

    Args:
        keys (Sequence[str]): Key of data to be converted to Tensor.
            Only valid when data is type of `Mapping`. If `keys` is None,
            all values of keys will be converted to tensor by default.
    """

    def __init__(self, keys=None):
        self.keys = keys

    def __call__(self, data):
        """Convert ``data`` (or the selected values of a mapping) to tensors.

        Args:
            data: A mapping whose values are converted in place, or any
                single object accepted by ``to_tensor``.

        Returns:
            The same mapping with converted values, or the converted tensor.
        """
        if isinstance(data, Mapping):
            # Resolve the keys per call instead of caching them on the
            # instance: with ``keys=None`` every sample must have *its own*
            # keys converted, even when samples differ in their key sets.
            keys = list(data.keys()) if self.keys is None else self.keys
            for key in keys:
                data[key] = to_tensor(data[key])
        else:
            data = to_tensor(data)
        return data

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.keys})'
@PREPROCESSORS.register_module()
class Filter(object):
    """This is usually the last stage of the dataloader transform.

    Only data of reserved keys will be kept and passed directly to the model,
    others will be removed.

    Args:
        reserved_keys (Sequence[str]): Keys of data to be reserved, others
            will be removed.
    """

    def __init__(self, reserved_keys):
        self.reserved_keys = reserved_keys

    def __call__(self, data):
        """Return a new dict holding only the reserved keys of ``data``.

        Args:
            data (Mapping): The sample to filter.

        Returns:
            dict: ``{key: data[key]}`` for every key in ``reserved_keys``.

        Raises:
            TypeError: If ``data`` is not a mapping.
            KeyError: If a reserved key is missing from ``data``.
        """
        # Explicit check rather than ``assert`` so that the validation is
        # not stripped when Python runs with ``-O``.
        if not isinstance(data, Mapping):
            raise TypeError(
                f'Filter expects a Mapping, but got {type(data)}')
        return {key: data[key] for key in self.reserved_keys}

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.reserved_keys})'
| @@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor): | |||||
| super().__init__(*args, **kwargs) | super().__init__(*args, **kwargs) | ||||
| self.model_dir: str = model_dir | self.model_dir: str = model_dir | ||||
| from .common import Filter | |||||
| # TODO: `Filter` should be moved to configuration file of each model | |||||
| self._transforms = [Filter(reserved_keys=['input', 'target'])] | |||||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | ||||
| """process the raw input data | """process the raw input data | ||||
| @@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor): | |||||
| Returns: | Returns: | ||||
| Dict[str, Any]: the preprocessed data | Dict[str, Any]: the preprocessed data | ||||
| """ | """ | ||||
| for t in self._transforms: | |||||
| data = t(data) | |||||
| return data | return data | ||||
| @@ -4,6 +4,7 @@ import os.path as osp | |||||
| import uuid | import uuid | ||||
| from typing import Any, Dict, Iterable, Optional, Tuple, Union | from typing import Any, Dict, Iterable, Optional, Tuple, Union | ||||
| import numpy as np | |||||
| from transformers import AutoTokenizer | from transformers import AutoTokenizer | ||||
| from modelscope.metainfo import Models, Preprocessors | from modelscope.metainfo import Models, Preprocessors | ||||
| @@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor): | |||||
| text_b, | text_b, | ||||
| return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, | return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, | ||||
| **self.tokenize_kwargs) | **self.tokenize_kwargs) | ||||
| output = { | |||||
| k: np.array(v) if isinstance(v, list) else v | |||||
| for k, v in output.items() | |||||
| } | |||||
| self.labels_to_id(labels, output) | self.labels_to_id(labels, output) | ||||
| return output | return output | ||||
| @@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor): | |||||
| if labels is not None: | if labels is not None: | ||||
| if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ | if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ | ||||
| and self.label2id is not None: | and self.label2id is not None: | ||||
| output[OutputKeys.LABEL] = [ | |||||
| output[OutputKeys.LABELS] = [ | |||||
| self.label2id[str(label)] for label in labels | self.label2id[str(label)] for label in labels | ||||
| ] | ] | ||||
| elif label_can_be_mapped(labels) and self.label2id is not None: | elif label_can_be_mapped(labels) and self.label2id is not None: | ||||
| output[OutputKeys.LABEL] = self.label2id[str(labels)] | |||||
| output[OutputKeys.LABELS] = self.label2id[str(labels)] | |||||
| else: | else: | ||||
| output[OutputKeys.LABEL] = labels | |||||
| output[OutputKeys.LABELS] = labels | |||||
| @PREPROCESSORS.register_module( | @PREPROCESSORS.register_module( | ||||
| @@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer): | |||||
| train_outputs = dict() | train_outputs = dict() | ||||
| self._mode = ModeKeys.TRAIN | self._mode = ModeKeys.TRAIN | ||||
| inputs = self.collate_fn(inputs) | |||||
| # call model forward but not __call__ to skip postprocess | # call model forward but not __call__ to skip postprocess | ||||
| if isinstance(inputs, Mapping): | if isinstance(inputs, Mapping): | ||||
| d_loss = model._train_forward_d(**inputs) | d_loss = model._train_forward_d(**inputs) | ||||
| @@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||||
| self.train_keys = build_dataset_keys( | self.train_keys = build_dataset_keys( | ||||
| self.cfg.dataset.train if hasattr(self.cfg, 'dataset') | self.cfg.dataset.train if hasattr(self.cfg, 'dataset') | ||||
| and hasattr(self.cfg.dataset, 'train') else None) | and hasattr(self.cfg.dataset, 'train') else None) | ||||
| # TODO eval may has special keys, which is now not supported. | |||||
| # because there is only one preprocessor in the trainer, and it only supports one group of keys. | |||||
| self.eval_keys = self.train_keys | |||||
| self.eval_keys = build_dataset_keys( | |||||
| self.cfg.dataset.val if hasattr(self.cfg, 'dataset') | |||||
| and hasattr(self.cfg.dataset, 'val') else None) | |||||
| if len(self.eval_keys) == 0: | |||||
| self.eval_keys = self.train_keys | |||||
| super().__init__( | super().__init__( | ||||
| model=model_dir, | model=model_dir, | ||||
| @@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||||
| elif isinstance(model, nn.Module): | elif isinstance(model, nn.Module): | ||||
| return model | return model | ||||
| def build_preprocessor(self) -> Preprocessor: | |||||
| def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: | |||||
| """Build the preprocessor. | """Build the preprocessor. | ||||
| User can override this method to implement custom logits. | User can override this method to implement custom logits. | ||||
| @@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||||
| model_args = {} if self.label2id is None else { | model_args = {} if self.label2id is None else { | ||||
| 'label2id': self.label2id | 'label2id': self.label2id | ||||
| } | } | ||||
| cfg = ConfigDict({ | |||||
| **getattr(self.cfg, 'preprocessor'), | |||||
| 'model_dir': | |||||
| self.model_dir, | |||||
| **model_args, | |||||
| 'mode': | |||||
| ModeKeys.TRAIN, | |||||
| **self.train_keys, | |||||
| }) | |||||
| return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||||
| field_name = Tasks.find_field_by_task(self.cfg.task) | |||||
| train_preprocessor, eval_preprocessor = None, None | |||||
| _train_cfg, _eval_cfg = {}, {} | |||||
| if 'type' not in self.cfg.preprocessor and ( | |||||
| 'train' in self.cfg.preprocessor | |||||
| or 'val' in self.cfg.preprocessor): | |||||
| if 'train' in self.cfg.preprocessor: | |||||
| _train_cfg = self.cfg.preprocessor.train | |||||
| if 'val' in self.cfg.preprocessor: | |||||
| _eval_cfg = self.cfg.preprocessor.val | |||||
| else: | |||||
| _train_cfg = self.cfg.preprocessor | |||||
| _eval_cfg = self.cfg.preprocessor | |||||
| if len(_train_cfg): | |||||
| _train_cfg.update({ | |||||
| 'model_dir': self.model_dir, | |||||
| **model_args, | |||||
| **self.train_keys, 'mode': ModeKeys.TRAIN | |||||
| }) | |||||
| train_preprocessor = build_preprocessor(_train_cfg, field_name) | |||||
| if len(_eval_cfg): | |||||
| _eval_cfg.update({ | |||||
| 'model_dir': self.model_dir, | |||||
| **model_args, | |||||
| **self.eval_keys, 'mode': ModeKeys.EVAL | |||||
| }) | |||||
| eval_preprocessor = build_preprocessor(_eval_cfg, field_name) | |||||
| return train_preprocessor, eval_preprocessor | |||||
| @TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) | @TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) | ||||
| @@ -5,15 +5,15 @@ import time | |||||
| from collections.abc import Mapping | from collections.abc import Mapping | ||||
| from distutils.version import LooseVersion | from distutils.version import LooseVersion | ||||
| from functools import partial | from functools import partial | ||||
| from typing import Callable, List, Optional, Tuple, Union | |||||
| from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union | |||||
| import json | import json | ||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| from addict import Dict | |||||
| from torch import distributed as dist | from torch import distributed as dist | ||||
| from torch import nn | from torch import nn | ||||
| from torch.utils.data import DataLoader, Dataset | from torch.utils.data import DataLoader, Dataset | ||||
| from torch.utils.data.dataloader import default_collate | |||||
| from torch.utils.data.distributed import DistributedSampler | from torch.utils.data.distributed import DistributedSampler | ||||
| from modelscope.hub.snapshot_download import snapshot_download | from modelscope.hub.snapshot_download import snapshot_download | ||||
| @@ -21,8 +21,9 @@ from modelscope.metainfo import Trainers | |||||
| from modelscope.metrics import build_metric, task_default_metrics | from modelscope.metrics import build_metric, task_default_metrics | ||||
| from modelscope.models.base import Model, TorchModel | from modelscope.models.base import Model, TorchModel | ||||
| from modelscope.msdatasets.ms_dataset import MsDataset | from modelscope.msdatasets.ms_dataset import MsDataset | ||||
| from modelscope.preprocessors import build_preprocessor | |||||
| from modelscope.preprocessors.base import Preprocessor | from modelscope.preprocessors.base import Preprocessor | ||||
| from modelscope.preprocessors.builder import build_preprocessor | |||||
| from modelscope.preprocessors.common import Compose | |||||
| from modelscope.task_datasets.builder import build_task_dataset | from modelscope.task_datasets.builder import build_task_dataset | ||||
| from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset | from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset | ||||
| from modelscope.trainers.hooks.builder import HOOKS | from modelscope.trainers.hooks.builder import HOOKS | ||||
| @@ -30,14 +31,15 @@ from modelscope.trainers.hooks.priority import Priority, get_priority | |||||
| from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | ||||
| from modelscope.trainers.optimizer.builder import build_optimizer | from modelscope.trainers.optimizer.builder import build_optimizer | ||||
| from modelscope.utils.config import Config, ConfigDict | from modelscope.utils.config import Config, ConfigDict | ||||
| from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, | |||||
| ModelFile, Tasks, TrainerStages) | |||||
| from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, | |||||
| ConfigKeys, Hubs, ModeKeys, ModelFile, | |||||
| Tasks, TrainerStages) | |||||
| from modelscope.utils.data_utils import to_device | |||||
| from modelscope.utils.file_utils import func_receive_dict_inputs | from modelscope.utils.file_utils import func_receive_dict_inputs | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from modelscope.utils.registry import build_from_cfg | from modelscope.utils.registry import build_from_cfg | ||||
| from modelscope.utils.tensor_utils import torch_default_data_collator | |||||
| from modelscope.utils.torch_utils import (broadcast, create_device, | |||||
| get_dist_info, init_dist) | |||||
| from modelscope.utils.torch_utils import (create_device, get_dist_info, | |||||
| init_dist) | |||||
| from .base import BaseTrainer | from .base import BaseTrainer | ||||
| from .builder import TRAINERS | from .builder import TRAINERS | ||||
| from .default_config import DEFAULT_CONFIG | from .default_config import DEFAULT_CONFIG | ||||
| @@ -83,7 +85,8 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| data_collator: Optional[Callable] = None, | data_collator: Optional[Callable] = None, | ||||
| train_dataset: Optional[Union[MsDataset, Dataset]] = None, | train_dataset: Optional[Union[MsDataset, Dataset]] = None, | ||||
| eval_dataset: Optional[Union[MsDataset, Dataset]] = None, | eval_dataset: Optional[Union[MsDataset, Dataset]] = None, | ||||
| preprocessor: Optional[Preprocessor] = None, | |||||
| preprocessor: Optional[Union[Preprocessor, | |||||
| Dict[str, Preprocessor]]] = None, | |||||
| optimizers: Tuple[torch.optim.Optimizer, | optimizers: Tuple[torch.optim.Optimizer, | ||||
| torch.optim.lr_scheduler._LRScheduler] = (None, | torch.optim.lr_scheduler._LRScheduler] = (None, | ||||
| None), | None), | ||||
| @@ -120,24 +123,46 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| else: | else: | ||||
| self.work_dir = self.cfg.train.get('work_dir', './work_dir') | self.work_dir = self.cfg.train.get('work_dir', './work_dir') | ||||
| self.preprocessor = None | |||||
| self.train_preprocessor, self.eval_preprocessor = None, None | |||||
| if isinstance(preprocessor, Preprocessor): | if isinstance(preprocessor, Preprocessor): | ||||
| self.preprocessor = preprocessor | |||||
| elif hasattr(self.cfg, 'preprocessor'): | |||||
| self.preprocessor = self.build_preprocessor() | |||||
| if self.preprocessor is not None: | |||||
| self.preprocessor.mode = ModeKeys.TRAIN | |||||
| self.train_preprocessor = preprocessor | |||||
| self.eval_preprocessor = preprocessor | |||||
| elif isinstance(preprocessor, Mapping): | |||||
| if not (ConfigKeys.train in preprocessor | |||||
| or ConfigKeys.val in preprocessor): | |||||
| raise ValueError( | |||||
| f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' | |||||
| ) | |||||
| if ConfigKeys.train in preprocessor: | |||||
| assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) | |||||
| self.train_preprocessor = preprocessor[ConfigKeys.train] | |||||
| if ConfigKeys.val in preprocessor: | |||||
| assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) | |||||
| self.eval_preprocessor = preprocessor[ConfigKeys.val] | |||||
| elif hasattr(self.cfg, ConfigFields.preprocessor): | |||||
| self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( | |||||
| ) | |||||
| if self.train_preprocessor is not None: | |||||
| self.train_preprocessor.mode = ModeKeys.TRAIN | |||||
| if self.eval_preprocessor is not None: | |||||
| self.eval_preprocessor.mode = ModeKeys.EVAL | |||||
| device_name = kwargs.get('device', 'gpu') | device_name = kwargs.get('device', 'gpu') | ||||
| assert device_name in ['gpu', | assert device_name in ['gpu', | ||||
| 'cpu'], 'device should be either cpu or gpu.' | 'cpu'], 'device should be either cpu or gpu.' | ||||
| self.device = create_device(device_name == 'cpu') | self.device = create_device(device_name == 'cpu') | ||||
| self.train_dataset = self.to_task_dataset( | self.train_dataset = self.to_task_dataset( | ||||
| train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) | |||||
| train_dataset, | |||||
| mode=ModeKeys.TRAIN, | |||||
| preprocessor=self.train_preprocessor) | |||||
| self.eval_dataset = self.to_task_dataset( | self.eval_dataset = self.to_task_dataset( | ||||
| eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) | |||||
| eval_dataset, | |||||
| mode=ModeKeys.EVAL, | |||||
| preprocessor=self.eval_preprocessor) | |||||
| self.data_collator = data_collator if data_collator is not None else torch_default_data_collator | |||||
| self.data_collator = data_collator if data_collator is not None else default_collate | |||||
| self.metrics = self.get_metrics() | self.metrics = self.get_metrics() | ||||
| self._metric_values = None | self._metric_values = None | ||||
| self.optimizers = optimizers | self.optimizers = optimizers | ||||
| @@ -229,12 +254,12 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| return datasets | return datasets | ||||
| elif isinstance(datasets, MsDataset): | elif isinstance(datasets, MsDataset): | ||||
| datasets = datasets.to_torch_dataset( | datasets = datasets.to_torch_dataset( | ||||
| preprocessors=self.preprocessor) | |||||
| preprocessors=preprocessor) | |||||
| return datasets | return datasets | ||||
| elif isinstance(datasets, List) and isinstance( | elif isinstance(datasets, List) and isinstance( | ||||
| datasets[0], MsDataset): | datasets[0], MsDataset): | ||||
| datasets = [ | datasets = [ | ||||
| d.to_torch_dataset(preprocessor=self.preprocessor) | |||||
| d.to_torch_dataset(preprocessor=preprocessor) | |||||
| for d in datasets | for d in datasets | ||||
| ] | ] | ||||
| cfg = ConfigDict( | cfg = ConfigDict( | ||||
| @@ -258,24 +283,44 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| else: | else: | ||||
| return datasets | return datasets | ||||
def build_preprocessor(
        self) -> Tuple[Optional[Preprocessor], Optional[Preprocessor]]:
    """Build train and eval preprocessor.

    The ``preprocessor`` section of the trainer config is either a single
    preprocessor config (it has a ``type`` key, or neither ``train`` nor
    ``val``), which is then used for both training and evaluation, or a
    config split into ``train`` and/or ``val`` sub-configs.

    User can override this method to implement custom logic.

    Returns: The train preprocessor and eval preprocessor instance. Either
        one is None when no (non-empty) config is available for it.

    Raises:
        NotImplementedError: If a preprocessor config is a sequence.
    """
    field_name = Tasks.find_field_by_task(self.cfg.task)
    default_args = {'model_dir': self.model_dir}
    preprocessor_cfg = self.cfg.preprocessor

    train_cfg, eval_cfg = {}, {}
    if 'type' not in preprocessor_cfg and (
            'train' in preprocessor_cfg or 'val' in preprocessor_cfg):
        if 'train' in preprocessor_cfg:
            train_cfg = preprocessor_cfg.train
        if 'val' in preprocessor_cfg:
            eval_cfg = preprocessor_cfg.val
    else:
        train_cfg = preprocessor_cfg
        eval_cfg = preprocessor_cfg

    def _build(cfg):
        """Build one preprocessor from ``cfg``; return None if it is empty."""
        if not len(cfg):
            return None
        if isinstance(cfg, Sequence):
            # TODO: for Sequence, need adapt to `mode` and `model_dir` args,
            # and add mode for Compose or other plans
            raise NotImplementedError('Not supported yet!')
        # Merge into a fresh config instead of calling ``cfg.update``: the
        # trainer config must not be modified as a side effect, and in the
        # shared case train and eval refer to the very same object.
        merged_cfg = ConfigDict({**cfg, **default_args})
        return build_preprocessor(merged_cfg, field_name)

    train_preprocessor = _build(train_cfg)
    eval_preprocessor = _build(eval_cfg)
    return train_preprocessor, eval_preprocessor
| def get_metrics(self) -> List[str]: | def get_metrics(self) -> List[str]: | ||||
| """Get the metric class types. | """Get the metric class types. | ||||
| @@ -373,34 +418,6 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| return build_parallel(dp_cfg) | return build_parallel(dp_cfg) | ||||
| def collate_fn(self, data): | |||||
| """Prepare the input just before the forward function. | |||||
| This method will move the tensors to the right device. | |||||
| Usually this method does not need to be overridden. | |||||
| Args: | |||||
| data: The data out of the dataloader. | |||||
| Returns: The processed data. | |||||
| """ | |||||
| from torch.utils.data.dataloader import default_collate | |||||
| if isinstance(data, dict) or isinstance(data, Mapping): | |||||
| return type(data)({k: self.collate_fn(v) for k, v in data.items()}) | |||||
| elif isinstance(data, (tuple, list)): | |||||
| if isinstance(data[0], (int, float)): | |||||
| return default_collate(data).to(self.device) | |||||
| else: | |||||
| return type(data)(self.collate_fn(v) for v in data) | |||||
| elif isinstance(data, np.ndarray): | |||||
| return self.collate_fn(torch.from_numpy(data)) | |||||
| elif isinstance(data, torch.Tensor): | |||||
| return data.to(self.device) | |||||
| elif isinstance(data, (str, int, float, bool)): | |||||
| return data | |||||
| else: | |||||
| raise ValueError(f'Unsupported data type {type(data)}') | |||||
| def train_step(self, model, inputs): | def train_step(self, model, inputs): | ||||
| """ Perform a training step on a batch of inputs. | """ Perform a training step on a batch of inputs. | ||||
| @@ -421,7 +438,6 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| # TODO: find more pretty way to change mode | # TODO: find more pretty way to change mode | ||||
| model.train() | model.train() | ||||
| self._mode = ModeKeys.TRAIN | self._mode = ModeKeys.TRAIN | ||||
| inputs = self.collate_fn(inputs) | |||||
| # call model forward but not __call__ to skip postprocess | # call model forward but not __call__ to skip postprocess | ||||
| if isinstance(inputs, | if isinstance(inputs, | ||||
| Mapping) and not func_receive_dict_inputs(model.forward): | Mapping) and not func_receive_dict_inputs(model.forward): | ||||
| @@ -486,7 +502,9 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| if self.train_dataset is None: | if self.train_dataset is None: | ||||
| train_data = self.cfg.dataset.train | train_data = self.cfg.dataset.train | ||||
| self.train_dataset = self.build_dataset( | self.train_dataset = self.build_dataset( | ||||
| train_data, mode=ModeKeys.TRAIN) | |||||
| train_data, | |||||
| mode=ModeKeys.TRAIN, | |||||
| preprocessor=self.train_preprocessor) | |||||
| data_loader = self._build_dataloader_with_dataset( | data_loader = self._build_dataloader_with_dataset( | ||||
| self.train_dataset, | self.train_dataset, | ||||
| @@ -505,7 +523,9 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| if self.eval_dataset is None: | if self.eval_dataset is None: | ||||
| val_data = self.cfg.dataset.val | val_data = self.cfg.dataset.val | ||||
| self.eval_dataset = self.build_dataset( | self.eval_dataset = self.build_dataset( | ||||
| val_data, mode=ModeKeys.EVAL) | |||||
| val_data, | |||||
| mode=ModeKeys.EVAL, | |||||
| preprocessor=self.eval_preprocessor) | |||||
| batch_size = self.cfg.evaluation.batch_size | batch_size = self.cfg.evaluation.batch_size | ||||
| workers = self.cfg.evaluation.workers | workers = self.cfg.evaluation.workers | ||||
| @@ -521,7 +541,7 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| ) | ) | ||||
| return data_loader | return data_loader | ||||
| def build_dataset(self, data_cfg, mode): | |||||
| def build_dataset(self, data_cfg, mode, preprocessor=None): | |||||
| """ Build torch dataset object using data config | """ Build torch dataset object using data config | ||||
| """ | """ | ||||
| dataset = MsDataset.load( | dataset = MsDataset.load( | ||||
| @@ -531,8 +551,7 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| data_cfg, 'subset_name') else None, | data_cfg, 'subset_name') else None, | ||||
| hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | ||||
| ) | ) | ||||
| torch_dataset = dataset.to_torch_dataset( | |||||
| preprocessors=self.preprocessor, ) | |||||
| torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor) | |||||
| dataset = self.to_task_dataset(torch_dataset, mode) | dataset = self.to_task_dataset(torch_dataset, mode) | ||||
| return dataset | return dataset | ||||
| @@ -698,6 +717,7 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| self.invoke_hook(TrainerStages.before_train_epoch) | self.invoke_hook(TrainerStages.before_train_epoch) | ||||
| time.sleep(2) # Prevent possible deadlock during epoch transition | time.sleep(2) # Prevent possible deadlock during epoch transition | ||||
| for i, data_batch in enumerate(data_loader): | for i, data_batch in enumerate(data_loader): | ||||
| data_batch = to_device(data_batch, self.device) | |||||
| self.data_batch = data_batch | self.data_batch = data_batch | ||||
| self._inner_iter = i | self._inner_iter = i | ||||
| self.invoke_hook(TrainerStages.before_train_iter) | self.invoke_hook(TrainerStages.before_train_iter) | ||||
| @@ -721,16 +741,16 @@ class EpochBasedTrainer(BaseTrainer): | |||||
| metric_values = multi_gpu_test( | metric_values = multi_gpu_test( | ||||
| self.model, | self.model, | ||||
| data_loader, | data_loader, | ||||
| device=self.device, | |||||
| tmpdir=None, | tmpdir=None, | ||||
| gpu_collect=False, | gpu_collect=False, | ||||
| data_collate_fn=self.collate_fn, | |||||
| metric_classes=metric_classes) | metric_classes=metric_classes) | ||||
| else: | else: | ||||
| from modelscope.trainers.utils.inference import single_gpu_test | from modelscope.trainers.utils.inference import single_gpu_test | ||||
| metric_values = single_gpu_test( | metric_values = single_gpu_test( | ||||
| self.model, | self.model, | ||||
| data_loader, | data_loader, | ||||
| data_collate_fn=self.collate_fn, | |||||
| device=self.device, | |||||
| metric_classes=metric_classes) | metric_classes=metric_classes) | ||||
| return metric_values | return metric_values | ||||
| @@ -10,21 +10,19 @@ import torch | |||||
| from torch import distributed as dist | from torch import distributed as dist | ||||
| from tqdm import tqdm | from tqdm import tqdm | ||||
| from modelscope.utils.data_utils import to_device | |||||
| from modelscope.utils.file_utils import func_receive_dict_inputs | from modelscope.utils.file_utils import func_receive_dict_inputs | ||||
| from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, | from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, | ||||
| make_tmp_dir) | make_tmp_dir) | ||||
| def single_gpu_test(model, | |||||
| data_loader, | |||||
| data_collate_fn=None, | |||||
| metric_classes=None): | |||||
| def single_gpu_test(model, data_loader, device, metric_classes=None): | |||||
| """Test model with a single gpu. | """Test model with a single gpu. | ||||
| Args: | Args: | ||||
| model (nn.Module): Model to be tested. | model (nn.Module): Model to be tested. | ||||
| data_loader (nn.Dataloader): Pytorch data loader. | data_loader (nn.Dataloader): Pytorch data loader. | ||||
| data_collate_fn: An optional data_collate_fn before fed into the model | |||||
| device: (str | torch.device): The target device for the data. | |||||
| metric_classes(List): List of Metric class that uses to collect metrics | metric_classes(List): List of Metric class that uses to collect metrics | ||||
| Returns: | Returns: | ||||
| @@ -34,8 +32,7 @@ def single_gpu_test(model, | |||||
| dataset = data_loader.dataset | dataset = data_loader.dataset | ||||
| with tqdm(total=len(dataset), desc='test samples') as pbar: | with tqdm(total=len(dataset), desc='test samples') as pbar: | ||||
| for data in data_loader: | for data in data_loader: | ||||
| if data_collate_fn is not None: | |||||
| data = data_collate_fn(data) | |||||
| data = to_device(data, device) | |||||
| with torch.no_grad(): | with torch.no_grad(): | ||||
| if isinstance(data, Mapping) and not func_receive_dict_inputs( | if isinstance(data, Mapping) and not func_receive_dict_inputs( | ||||
| model.forward): | model.forward): | ||||
| @@ -62,9 +59,9 @@ def single_gpu_test(model, | |||||
| def multi_gpu_test(model, | def multi_gpu_test(model, | ||||
| data_loader, | data_loader, | ||||
| device, | |||||
| tmpdir=None, | tmpdir=None, | ||||
| gpu_collect=False, | gpu_collect=False, | ||||
| data_collate_fn=None, | |||||
| metric_classes=None): | metric_classes=None): | ||||
| """Test model with multiple gpus. | """Test model with multiple gpus. | ||||
| @@ -77,10 +74,10 @@ def multi_gpu_test(model, | |||||
| Args: | Args: | ||||
| model (nn.Module): Model to be tested. | model (nn.Module): Model to be tested. | ||||
| data_loader (nn.Dataloader): Pytorch data loader. | data_loader (nn.Dataloader): Pytorch data loader. | ||||
| device: (str | torch.device): The target device for the data. | |||||
| tmpdir (str): Path of directory to save the temporary results from | tmpdir (str): Path of directory to save the temporary results from | ||||
| different gpus under cpu mode. | different gpus under cpu mode. | ||||
| gpu_collect (bool): Option to use either gpu or cpu to collect results. | gpu_collect (bool): Option to use either gpu or cpu to collect results. | ||||
| data_collate_fn: An optional data_collate_fn before fed into the model | |||||
| metric_classes(List): List of Metric class that uses to collect metrics | metric_classes(List): List of Metric class that uses to collect metrics | ||||
| Returns: | Returns: | ||||
| @@ -98,8 +95,7 @@ def multi_gpu_test(model, | |||||
| count = 0 | count = 0 | ||||
| with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: | with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: | ||||
| for _, data in enumerate(data_loader): | for _, data in enumerate(data_loader): | ||||
| if data_collate_fn is not None: | |||||
| data = data_collate_fn(data) | |||||
| data = to_device(data, device) | |||||
| data_list.append(data) | data_list.append(data) | ||||
| with torch.no_grad(): | with torch.no_grad(): | ||||
| if isinstance(data, Mapping) and not func_receive_dict_inputs( | if isinstance(data, Mapping) and not func_receive_dict_inputs( | ||||
| @@ -219,6 +219,12 @@ class ConfigFields(object): | |||||
| evaluation = 'evaluation' | evaluation = 'evaluation' | ||||
class ConfigKeys(object):
    """Keywords whose spelling is fixed in the configuration file."""

    # Each attribute holds the literal keyword string.
    train = 'train'
    val = 'val'
| class Requirements(object): | class Requirements(object): | ||||
| """Requirement names for each module | """Requirement names for each module | ||||
| """ | """ | ||||
| @@ -0,0 +1,23 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from collections.abc import Mapping | |||||
| import torch | |||||
def to_device(batch, device, non_blocking=False):
    """Recursively move every tensor found in ``batch`` onto ``device``.

    Mappings, tuples and lists are rebuilt with their own container type and
    their values are processed recursively. Any other non-tensor object is
    returned unchanged.

    Args:
        batch: The batch data out of the dataloader.
        device (str | torch.device): The target device for the data.
        non_blocking (bool): Forwarded to ``Tensor.to`` for every tensor in
            the batch, including tensors nested inside containers.

    Returns:
        The data with the same structure, with its tensors moved to the
        target device.
    """
    # dict is itself a Mapping, so one check covers both.
    if isinstance(batch, Mapping):
        return type(batch)({
            k: to_device(v, device, non_blocking)
            for k, v in batch.items()
        })
    if isinstance(batch, (tuple, list)):
        moved = [to_device(v, device, non_blocking) for v in batch]
        if hasattr(batch, '_fields'):
            # namedtuple constructors take their fields positionally and
            # cannot be built from a single iterable argument.
            return type(batch)(*moved)
        return type(batch)(moved)
    if isinstance(batch, torch.Tensor):
        return batch.to(device, non_blocking=non_blocking)
    return batch
| @@ -24,65 +24,3 @@ def torch_nested_detach(tensors): | |||||
| if isinstance(tensors, torch.Tensor): | if isinstance(tensors, torch.Tensor): | ||||
| return tensors.detach() | return tensors.detach() | ||||
| return tensors | return tensors | ||||
| def torch_default_data_collator(features): | |||||
| # TODO @jiangnana.jnn refine this default data collator | |||||
| import torch | |||||
| first = features[0] | |||||
| if isinstance(first, Mapping): | |||||
| batch = {} | |||||
| # Special handling for labels. | |||||
| # Ensure that tensor is created with the correct type | |||||
| # (it should be automatically the case, but let's make sure of it.) | |||||
| if 'label' in first and first['label'] is not None: | |||||
| label = first['label'].item() if isinstance( | |||||
| first['label'], torch.Tensor) else first['label'] | |||||
| # the msdataset return a 0-dimension np.array with a single value, the following part handle this. | |||||
| if isinstance(label, np.ndarray): | |||||
| src_dtype = label[()].dtype | |||||
| dtype = torch.long if label[( | |||||
| )].dtype == np.int64 else torch.float | |||||
| else: | |||||
| src_dtype = type(label) | |||||
| dtype = torch.long if isinstance(label, int) else torch.float | |||||
| # add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_" | |||||
| batch['labels'] = torch.tensor( | |||||
| np.array([f['label'] for f in features], dtype=src_dtype), | |||||
| dtype=dtype) | |||||
| elif 'label_ids' in first and first['label_ids'] is not None: | |||||
| if isinstance(first['label_ids'], torch.Tensor): | |||||
| batch['labels'] = torch.stack( | |||||
| [f['label_ids'] for f in features]) | |||||
| else: | |||||
| dtype = torch.long if type( | |||||
| first['label_ids'][0]) is int else torch.float | |||||
| batch['labels'] = torch.tensor( | |||||
| [f['label_ids'] for f in features], dtype=dtype) | |||||
| # Handling of all other possible keys. | |||||
| # Again, we will use the first element to figure out which key/values are not None for this model. | |||||
| for k, v in first.items(): | |||||
| if k not in ('label', 'label_ids' | |||||
| ) and v is not None and not isinstance(v, str): | |||||
| if isinstance(v, torch.Tensor): | |||||
| batch[k] = torch.stack([f[k] for f in features]) | |||||
| elif isinstance(v, list) and isinstance(v[0], torch.Tensor): | |||||
| batch[k] = torch.stack([d for f in features for d in f[k]]) | |||||
| else: | |||||
| batch[k] = torch.tensor(np.array([f[k] for f in features])) | |||||
| elif isinstance(first, tuple): | |||||
| batch = [] | |||||
| for idx in range(len(first)): | |||||
| if isinstance(first[idx], torch.Tensor): | |||||
| batch.append(torch.stack([f[idx] for f in features])) | |||||
| else: | |||||
| batch.append(torch.tensor([f[idx] for f in features])) | |||||
| else: | |||||
| if isinstance(first, torch.Tensor): | |||||
| batch = torch.stack(features) | |||||
| else: | |||||
| batch = torch.tensor(features) | |||||
| return batch | |||||
| @@ -50,7 +50,7 @@ def set_test_level(level: int): | |||||
| def create_dummy_test_dataset(feat, label, num): | def create_dummy_test_dataset(feat, label, num): | ||||
| return MsDataset.from_hf_dataset( | return MsDataset.from_hf_dataset( | ||||
| Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num))) | |||||
| Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num))) | |||||
| def download_and_untar(fpath, furl, dst) -> str: | def download_and_untar(fpath, furl, dst) -> str: | ||||
| @@ -2,7 +2,10 @@ | |||||
| import unittest | import unittest | ||||
| from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor | |||||
| import torch | |||||
| from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter, | |||||
| Preprocessor, ToTensor) | |||||
| class ComposeTest(unittest.TestCase): | class ComposeTest(unittest.TestCase): | ||||
| @@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase): | |||||
| self.assertEqual(output['tmp2'], 'tmp2') | self.assertEqual(output['tmp2'], 'tmp2') | ||||
class ToTensorTest(unittest.TestCase):
    """Exercise ``ToTensor`` on a dict holding a list, an int and a string."""

    def test_totensor(self):
        converter = ToTensor(keys=['img'])
        sample = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}
        result = converter(sample)
        # 'img' is the only key handed to the op
        self.assertIsInstance(result['img'], torch.Tensor)
        self.assertEqual(result['label'], 1)
        self.assertEqual(result['path'], 'test.jpg')
class FilterTest(unittest.TestCase):
    """Exercise ``Filter`` with two reserved keys and one extra key."""

    def test_filter(self):
        keep_op = Filter(reserved_keys=['img', 'label'])
        sample = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}
        result = keep_op(sample)
        # reserved keys must survive, the remaining key must be gone
        self.assertIn('img', result)
        self.assertIn('label', result)
        self.assertNotIn('path', result)
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| unittest.main() | unittest.main() | ||||
| @@ -12,7 +12,7 @@ from torch import nn | |||||
| from modelscope.metainfo import Trainers | from modelscope.metainfo import Trainers | ||||
| from modelscope.metrics.builder import METRICS, MetricKeys | from modelscope.metrics.builder import METRICS, MetricKeys | ||||
| from modelscope.trainers import build_trainer | from modelscope.trainers import build_trainer | ||||
| from modelscope.utils.constant import LogKeys, ModelFile | |||||
| from modelscope.utils.constant import ModelFile | |||||
| from modelscope.utils.registry import default_group | from modelscope.utils.registry import default_group | ||||
| from modelscope.utils.test_utils import create_dummy_test_dataset | from modelscope.utils.test_utils import create_dummy_test_dataset | ||||
| @@ -9,7 +9,7 @@ import numpy as np | |||||
| import torch | import torch | ||||
| from torch import nn | from torch import nn | ||||
| from torch.optim import SGD | from torch.optim import SGD | ||||
| from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau | |||||
| from torch.optim.lr_scheduler import MultiStepLR | |||||
| from modelscope.metainfo import Trainers | from modelscope.metainfo import Trainers | ||||
| from modelscope.metrics.builder import METRICS, MetricKeys | from modelscope.metrics.builder import METRICS, MetricKeys | ||||
| @@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase): | |||||
| model=model, | model=model, | ||||
| train_dataset=dummy_dataset, | train_dataset=dummy_dataset, | ||||
| optimizers=(optimizer, lr_scheduler), | optimizers=(optimizer, lr_scheduler), | ||||
| max_epochs=5) | |||||
| max_epochs=5, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| train_dataloader = trainer._build_dataloader_with_dataset( | train_dataloader = trainer._build_dataloader_with_dataset( | ||||
| @@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase): | |||||
| json.dump(json_cfg, f) | json.dump(json_cfg, f) | ||||
| model = DummyModel() | model = DummyModel() | ||||
| # optimmizer = SGD(model.parameters(), lr=0.01) | |||||
| # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4]) | |||||
| trainer_name = Trainers.default | trainer_name = Trainers.default | ||||
| kwargs = dict( | kwargs = dict( | ||||
| cfg_file=config_path, | cfg_file=config_path, | ||||
| model=model, | model=model, | ||||
| train_dataset=dummy_dataset, | train_dataset=dummy_dataset, | ||||
| # optimizers=(optimmizer, lr_scheduler), | |||||
| max_epochs=7) | |||||
| max_epochs=7, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| train_dataloader = trainer._build_dataloader_with_dataset( | train_dataloader = trainer._build_dataloader_with_dataset( | ||||
| @@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase): | |||||
| train_dataset=dummy_dataset, | train_dataset=dummy_dataset, | ||||
| eval_dataset=dummy_dataset, | eval_dataset=dummy_dataset, | ||||
| optimizers=(optimizer, None), | optimizers=(optimizer, None), | ||||
| max_epochs=5) | |||||
| max_epochs=5, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| train_dataloader = trainer._build_dataloader_with_dataset( | train_dataloader = trainer._build_dataloader_with_dataset( | ||||
| @@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages | |||||
| from modelscope.utils.test_utils import create_dummy_test_dataset | from modelscope.utils.test_utils import create_dummy_test_dataset | ||||
| dummy_dataset = create_dummy_test_dataset( | dummy_dataset = create_dummy_test_dataset( | ||||
| np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10) | |||||
| np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10) | |||||
| class DummyModel(nn.Module): | class DummyModel(nn.Module): | ||||
| @@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase): | |||||
| model=model, | model=model, | ||||
| train_dataset=dummy_dataset, | train_dataset=dummy_dataset, | ||||
| optimizers=(optimizer, lr_scheduler), | optimizers=(optimizer, lr_scheduler), | ||||
| max_epochs=2) | |||||
| max_epochs=2, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| train_dataloader = trainer._build_dataloader_with_dataset( | train_dataloader = trainer._build_dataloader_with_dataset( | ||||
| @@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase): | |||||
| model=model, | model=model, | ||||
| train_dataset=dummy_dataset, | train_dataset=dummy_dataset, | ||||
| optimizers=(optimizer, lr_scheduler), | optimizers=(optimizer, lr_scheduler), | ||||
| max_epochs=5) | |||||
| max_epochs=5, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| train_dataloader = trainer._build_dataloader_with_dataset( | train_dataloader = trainer._build_dataloader_with_dataset( | ||||
| @@ -3,19 +3,16 @@ import os | |||||
| import shutil | import shutil | ||||
| import tempfile | import tempfile | ||||
| import unittest | import unittest | ||||
| from abc import ABCMeta | |||||
| import json | import json | ||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| from datasets import Dataset | |||||
| from torch import nn | from torch import nn | ||||
| from torch.optim import SGD | from torch.optim import SGD | ||||
| from torch.optim.lr_scheduler import StepLR | from torch.optim.lr_scheduler import StepLR | ||||
| from modelscope.metainfo import Metrics, Trainers | from modelscope.metainfo import Metrics, Trainers | ||||
| from modelscope.metrics.builder import MetricKeys | from modelscope.metrics.builder import MetricKeys | ||||
| from modelscope.msdatasets import MsDataset | |||||
| from modelscope.trainers import build_trainer | from modelscope.trainers import build_trainer | ||||
| from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile | from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile | ||||
| from modelscope.utils.test_utils import create_dummy_test_dataset, test_level | from modelscope.utils.test_utils import create_dummy_test_dataset, test_level | ||||
| @@ -116,7 +113,8 @@ class TrainerTest(unittest.TestCase): | |||||
| data_collator=None, | data_collator=None, | ||||
| train_dataset=dummy_dataset_small, | train_dataset=dummy_dataset_small, | ||||
| eval_dataset=dummy_dataset_small, | eval_dataset=dummy_dataset_small, | ||||
| max_epochs=3) | |||||
| max_epochs=3, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| trainer.train() | trainer.train() | ||||
| @@ -175,7 +173,8 @@ class TrainerTest(unittest.TestCase): | |||||
| train_dataset=dummy_dataset_small, | train_dataset=dummy_dataset_small, | ||||
| eval_dataset=dummy_dataset_small, | eval_dataset=dummy_dataset_small, | ||||
| optimizers=(optimmizer, lr_scheduler), | optimizers=(optimmizer, lr_scheduler), | ||||
| max_epochs=3) | |||||
| max_epochs=3, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| trainer.train() | trainer.train() | ||||
| @@ -225,7 +224,8 @@ class TrainerTest(unittest.TestCase): | |||||
| train_dataset=dummy_dataset_big, | train_dataset=dummy_dataset_big, | ||||
| eval_dataset=dummy_dataset_small, | eval_dataset=dummy_dataset_small, | ||||
| optimizers=(optimmizer, lr_scheduler), | optimizers=(optimmizer, lr_scheduler), | ||||
| max_epochs=3) | |||||
| max_epochs=3, | |||||
| device='cpu') | |||||
| trainer = build_trainer(trainer_name, kwargs) | trainer = build_trainer(trainer_name, kwargs) | ||||
| trainer.train() | trainer.train() | ||||
| @@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase): | |||||
| model=model_id, | model=model_id, | ||||
| train_dataset=self.dataset, | train_dataset=self.dataset, | ||||
| eval_dataset=self.dataset, | eval_dataset=self.dataset, | ||||
| work_dir=self.tmp_dir) | |||||
| work_dir=self.tmp_dir, | |||||
| model_revision='beta') | |||||
| trainer = build_trainer(default_args=kwargs) | trainer = build_trainer(default_args=kwargs) | ||||
| trainer.train() | trainer.train() | ||||
| @@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase): | |||||
| model=model_id, | model=model_id, | ||||
| train_dataset=self.dataset, | train_dataset=self.dataset, | ||||
| eval_dataset=self.dataset, | eval_dataset=self.dataset, | ||||
| work_dir=self.tmp_dir) | |||||
| work_dir=self.tmp_dir, | |||||
| model_revision='beta') | |||||
| trainer = build_trainer(default_args=kwargs) | trainer = build_trainer(default_args=kwargs) | ||||
| trainer.train() | trainer.train() | ||||
| @@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase): | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | ||||
| def test_trainer_with_user_defined_config(self): | def test_trainer_with_user_defined_config(self): | ||||
| model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' | model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' | ||||
| cfg = read_config(model_id) | |||||
| cfg = read_config(model_id, revision='beta') | |||||
| cfg.train.max_epochs = 20 | cfg.train.max_epochs = 20 | ||||
| cfg.train.work_dir = self.tmp_dir | cfg.train.work_dir = self.tmp_dir | ||||
| cfg_file = os.path.join(self.tmp_dir, 'config.json') | cfg_file = os.path.join(self.tmp_dir, 'config.json') | ||||
| @@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase): | |||||
| model=model_id, | model=model_id, | ||||
| train_dataset=self.dataset, | train_dataset=self.dataset, | ||||
| eval_dataset=self.dataset, | eval_dataset=self.dataset, | ||||
| cfg_file=cfg_file) | |||||
| cfg_file=cfg_file, | |||||
| model_revision='beta') | |||||
| trainer = build_trainer(default_args=kwargs) | trainer = build_trainer(default_args=kwargs) | ||||
| trainer.train() | trainer.train() | ||||
| @@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase): | |||||
| os.makedirs(tmp_dir) | os.makedirs(tmp_dir) | ||||
| model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | ||||
| cache_path = snapshot_download(model_id) | |||||
| cache_path = snapshot_download(model_id, revision='beta') | |||||
| model = SbertForSequenceClassification.from_pretrained(cache_path) | model = SbertForSequenceClassification.from_pretrained(cache_path) | ||||
| kwargs = dict( | kwargs = dict( | ||||
| cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | ||||
| @@ -0,0 +1,116 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| import shutil | |||||
| import tempfile | |||||
| import unittest | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.utils.data import DataLoader | |||||
| from modelscope.metrics.builder import MetricKeys | |||||
| from modelscope.metrics.sequence_classification_metric import \ | |||||
| SequenceClassificationMetric | |||||
| from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test | |||||
| from modelscope.utils.test_utils import (DistributedTestCase, | |||||
| create_dummy_test_dataset, test_level) | |||||
| from modelscope.utils.torch_utils import get_dist_info, init_dist | |||||
| dummy_dataset = create_dummy_test_dataset( | |||||
| torch.rand((5, )), torch.randint(0, 4, (1, )), 20) | |||||
class DummyModel(nn.Module):
    """Linear layer followed by batch norm, used as a stand-in model."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(5, 4)
        self.bn = nn.BatchNorm1d(4)

    def forward(self, feat, labels):
        # ``labels`` is accepted but does not take part in the computation.
        logits = self.bn(self.linear(feat))
        return dict(logits=logits, loss=torch.sum(logits))
def test_func(dist=False):
    """Evaluate a ``DummyModel`` on the dummy dataset and return the metrics.

    Args:
        dist (bool): When true, ``init_dist(launcher='pytorch')`` is called
            before the rank and world size are queried.

    Returns:
        The result of ``multi_gpu_test`` when the world size is above one,
        otherwise the result of ``single_gpu_test``.
    """
    dummy_model = DummyModel()
    dummy_loader = DataLoader(dummy_dataset.to_torch_dataset(), batch_size=2)
    metric_class = SequenceClassificationMetric()

    if dist:
        init_dist(launcher='pytorch')

    # The rank lookup and device selection run in both modes.
    rank, world_size = get_dist_info()
    device = torch.device(f'cuda:{rank}')
    dummy_model.cuda()

    if world_size > 1:
        from torch.nn.parallel.distributed import DistributedDataParallel
        dummy_model = DistributedDataParallel(
            dummy_model, device_ids=[torch.cuda.current_device()])
        run_eval = multi_gpu_test
    else:
        run_eval = single_gpu_test

    return run_eval(
        dummy_model,
        dummy_loader,
        device=device,
        metric_classes=[metric_class])
@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
class SingleGpuTestTest(unittest.TestCase):
    """Run ``test_func`` in its default, non-distributed mode."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # mkdtemp creates the directory and leaves it in place until tearDown
        # removes it. TemporaryDirectory().name relied on the temporary
        # object being finalized (which deletes the directory) and on the
        # directory then being re-created by hand.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_single_gpu_test(self):
        metric_results = test_func()
        self.assertIn(MetricKeys.ACCURACY, metric_results)
@unittest.skipIf(not torch.cuda.is_available()
                 or torch.cuda.device_count() <= 1, 'distributed unittest')
class MultiGpuTestTest(DistributedTestCase):
    """Launch ``test_func`` with ``dist=True`` on two GPUs."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # mkdtemp creates the directory and leaves it in place until tearDown
        # removes it. TemporaryDirectory().name relied on the temporary
        # object being finalized (which deletes the directory) and on the
        # directory then being re-created by hand.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_multi_gpu_test(self):
        # The callback receives what the launched function returned.
        self.start(
            test_func,
            num_gpus=2,
            assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x),
            dist=True)
| if __name__ == '__main__': | |||||
| unittest.main() | |||||