co-contributed with 夕陌&雨泓 * add torch epoch based trainer and dis utils * add hooks including optimizer, lrscheduler, logging, checkpoint, evaluation, time profiling * add torch mdoel base and test * add optimizer and lrscheduler module * add sbert for text classification example * add task_dataset for dataset-level processor Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9338412master
| @@ -0,0 +1,175 @@ | |||||
| { | |||||
| "framework": "pytorch", | |||||
| "task": "image_classification", | |||||
| "work_dir": "./work_dir", | |||||
| "model": { | |||||
| "type": "classification", | |||||
| "pretrained": null, | |||||
| "backbone": { | |||||
| "type": "ResNet", | |||||
| "depth": 50, | |||||
| "out_indices": [ | |||||
| 4 | |||||
| ], | |||||
| "norm_cfg": { | |||||
| "type": "BN" | |||||
| } | |||||
| }, | |||||
| "head": { | |||||
| "type": "ClsHead", | |||||
| "with_avg_pool": true, | |||||
| "in_channels": 2048, | |||||
| "loss_config": { | |||||
| "type": "CrossEntropyLossWithLabelSmooth", | |||||
| "label_smooth": 0 | |||||
| }, | |||||
| "num_classes": 1000 | |||||
| } | |||||
| }, | |||||
| "dataset": { | |||||
| "train": { | |||||
| "type": "ClsDataset", | |||||
| "data_source": { | |||||
| "list_file": "data/imagenet_raw/meta/train_labeled.txt", | |||||
| "root": "data/imagenet_raw/train/", | |||||
| "type": "ClsSourceImageList" | |||||
| } | |||||
| }, | |||||
| "val": { | |||||
| "type": "ClsDataset", | |||||
| "data_source": { | |||||
| "list_file": "data/imagenet_raw/meta/val_labeled.txt", | |||||
| "root": "data/imagenet_raw/validation/", | |||||
| "type": "ClsSourceImageList" | |||||
| } | |||||
| }, | |||||
| "test": {} | |||||
| }, | |||||
| "preprocessor":{ | |||||
| "train": [ | |||||
| { | |||||
| "type": "RandomResizedCrop", | |||||
| "size": 224 | |||||
| }, | |||||
| { | |||||
| "type": "RandomHorizontalFlip" | |||||
| }, | |||||
| { | |||||
| "type": "ToTensor" | |||||
| }, | |||||
| { | |||||
| "type": "Normalize", | |||||
| "mean": [ | |||||
| 0.485, | |||||
| 0.456, | |||||
| 0.406 | |||||
| ], | |||||
| "std": [ | |||||
| 0.229, | |||||
| 0.224, | |||||
| 0.225 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "type": "Collect", | |||||
| "keys": [ | |||||
| "img", | |||||
| "gt_labels" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "val": [ | |||||
| { | |||||
| "type": "Resize", | |||||
| "size": 256 | |||||
| }, | |||||
| { | |||||
| "type": "CenterCrop", | |||||
| "size": 224 | |||||
| }, | |||||
| { | |||||
| "type": "ToTensor" | |||||
| }, | |||||
| { | |||||
| "type": "Normalize", | |||||
| "mean": [ | |||||
| 0.485, | |||||
| 0.456, | |||||
| 0.406 | |||||
| ], | |||||
| "std": [ | |||||
| 0.229, | |||||
| 0.224, | |||||
| 0.225 | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "type": "Collect", | |||||
| "keys": [ | |||||
| "img", | |||||
| "gt_labels" | |||||
| ] | |||||
| } | |||||
| ] | |||||
| }, | |||||
| "train": { | |||||
| "dataloader": { | |||||
| "batch_size_per_gpu": 2, | |||||
| "workers_per_gpu": 1 | |||||
| }, | |||||
| "optimizer": { | |||||
| "type": "SGD", | |||||
| "lr": 0.01, | |||||
| "options": { | |||||
| "grad_clip": { | |||||
| "max_norm": 2.0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "lr_scheduler": { | |||||
| "type": "StepLR", | |||||
| "step_size": 2, | |||||
| "options": { | |||||
| "warmup": { | |||||
| "type": "LinearWarmup", | |||||
| "warmup_iters": 2 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "hooks": | |||||
| [ | |||||
| { | |||||
| "type": "CheckpointHook", | |||||
| "interval": 2 | |||||
| }, | |||||
| { | |||||
| "type": "TextLoggerHook", | |||||
| "interval": 1 | |||||
| }, | |||||
| { | |||||
| "type": "IterTimerHook" | |||||
| }, | |||||
| { | |||||
| "type": "EvaluationHook", | |||||
| "interval": 1 | |||||
| } | |||||
| ] | |||||
| }, | |||||
| "evaluation": { | |||||
| "dataloader": { | |||||
| "batch_size_per_gpu": 2, | |||||
| "workers_per_gpu": 1, | |||||
| "shuffle": false | |||||
| }, | |||||
| "metrics": ["accuracy", "precision", "recall"] | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,77 @@ | |||||
| { | |||||
| "task": "sentence-similarity", | |||||
| "preprocessor": { | |||||
| "type": "bert-seq-cls-tokenizer-finetune", | |||||
| "first_sequence": "sentence1", | |||||
| "second_sequence": "sentence2" | |||||
| }, | |||||
| "model": { | |||||
| "type": "structbert", | |||||
| "attention_probs_dropout_prob": 0.1, | |||||
| "easynlp_version": "0.0.3", | |||||
| "gradient_checkpointing": false, | |||||
| "hidden_act": "gelu", | |||||
| "hidden_dropout_prob": 0.1, | |||||
| "hidden_size": 768, | |||||
| "initializer_range": 0.02, | |||||
| "intermediate_size": 3072, | |||||
| "layer_norm_eps": 1e-12, | |||||
| "max_position_embeddings": 512, | |||||
| "num_attention_heads": 12, | |||||
| "num_hidden_layers": 12, | |||||
| "pad_token_id": 0, | |||||
| "position_embedding_type": "absolute", | |||||
| "transformers_version": "4.6.0.dev0", | |||||
| "type_vocab_size": 2, | |||||
| "use_cache": true, | |||||
| "vocab_size": 30522 | |||||
| }, | |||||
| "pipeline": { | |||||
| "type": "sentence-similarity" | |||||
| }, | |||||
| "work_dir": "/tmp", | |||||
| "train": { | |||||
| "dataloader": { | |||||
| "batch_size_per_gpu": 2, | |||||
| "workers_per_gpu": 1 | |||||
| }, | |||||
| "optimizer": { | |||||
| "type": "SGD", | |||||
| "lr": 0.01, | |||||
| "options": { | |||||
| "grad_clip": { | |||||
| "max_norm": 2.0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "lr_scheduler": { | |||||
| "type": "StepLR", | |||||
| "step_size": 2, | |||||
| "options": { | |||||
| "warmup": { | |||||
| "type": "LinearWarmup", | |||||
| "warmup_iters": 2 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "hooks": [{ | |||||
| "type": "CheckpointHook", | |||||
| "interval": 1 | |||||
| }, { | |||||
| "type": "TextLoggerHook", | |||||
| "interval": 1 | |||||
| }, { | |||||
| "type": "IterTimerHook" | |||||
| }, { | |||||
| "type": "EvaluationHook", | |||||
| "interval": 1 | |||||
| }] | |||||
| }, | |||||
| "evaluation": { | |||||
| "dataloader": { | |||||
| "batch_size_per_gpu": 2, | |||||
| "workers_per_gpu": 1, | |||||
| "shuffle": false | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -36,10 +36,10 @@ modelscope.pipelines.base module | |||||
| :undoc-members: | :undoc-members: | ||||
| :show-inheritance: | :show-inheritance: | ||||
| modelscope.pipelines.outputs module | |||||
| modelscope.outputs module | |||||
| ----------------------------------- | ----------------------------------- | ||||
| .. automodule:: modelscope.pipelines.outputs | |||||
| .. automodule:: modelscope.outputs | |||||
| :members: | :members: | ||||
| :undoc-members: | :undoc-members: | ||||
| :show-inheritance: | :show-inheritance: | ||||
| @@ -127,3 +127,16 @@ class Preprocessors(object): | |||||
| # multi-modal | # multi-modal | ||||
| ofa_image_caption = 'ofa-image-caption' | ofa_image_caption = 'ofa-image-caption' | ||||
| mplug_visual_question_answering = 'mplug-visual-question-answering' | mplug_visual_question_answering = 'mplug-visual-question-answering' | ||||
| class Metrics(object): | |||||
| """ Names for different metrics. | |||||
| """ | |||||
| # accuracy | |||||
| accuracy = 'accuracy' | |||||
| # metrics for sequence classification task | |||||
| seq_cls_metric = 'seq_cls_metric' | |||||
| # metrics for token-classification task | |||||
| token_cls_metric = 'token-cls-metric' | |||||
| @@ -0,0 +1,3 @@ | |||||
| from .base import Metric | |||||
| from .builder import METRICS, build_metric, task_default_metrics | |||||
| from .sequence_classification_metric import SequenceClassificationMetric | |||||
| @@ -0,0 +1,37 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from abc import ABC, abstractmethod | |||||
| from typing import Dict | |||||
| class Metric(ABC): | |||||
| """The metric base class for computing metrics. | |||||
| The subclasses can either compute a single metric like 'accuracy', or compute the | |||||
| complex metrics for a specific task with or without other Metric subclasses. | |||||
| """ | |||||
| @abstractmethod | |||||
| def add(self, outputs: Dict, inputs: Dict): | |||||
| """ Append logits and labels within an eval loop. | |||||
| Will be called after every batch finished to gather the model predictions and the labels. | |||||
| Args: | |||||
| outputs: The model prediction outputs. | |||||
| inputs: The mini batch inputs from the dataloader. | |||||
| Returns: None | |||||
| """ | |||||
| pass | |||||
| @abstractmethod | |||||
| def evaluate(self): | |||||
| """Evaluate the metrics after the eval finished. | |||||
| Will be called after the whole validation finished. | |||||
| Returns: The actual metric dict with standard names. | |||||
| """ | |||||
| pass | |||||
| @@ -0,0 +1,35 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from ..metainfo import Metrics | |||||
| from ..utils.config import ConfigDict | |||||
| from ..utils.constant import Tasks | |||||
| from ..utils.registry import Registry, build_from_cfg, default_group | |||||
| METRICS = Registry('metrics') | |||||
| class MetricKeys(object): | |||||
| ACCURACY = 'accuracy' | |||||
| F1 = 'f1' | |||||
| PRECISION = 'precision' | |||||
| RECALL = 'recall' | |||||
| task_default_metrics = { | |||||
| Tasks.sentence_similarity: [Metrics.seq_cls_metric], | |||||
| } | |||||
| def build_metric(metric_name: str, | |||||
| field: str = default_group, | |||||
| default_args: dict = None): | |||||
| """ Build metric given metric_name and field. | |||||
| Args: | |||||
| metric_name (:obj:`str`): The metric name. | |||||
| field (str, optional): The field of this metric, default value: 'default' for all fields. | |||||
| default_args (dict, optional): Default initialization arguments. | |||||
| """ | |||||
| cfg = ConfigDict({'type': metric_name}) | |||||
| return build_from_cfg( | |||||
| cfg, METRICS, group_key=field, default_args=default_args) | |||||
| @@ -0,0 +1,40 @@ | |||||
| from typing import Dict, List, Union | |||||
| import numpy as np | |||||
| from modelscope.outputs import OutputKeys | |||||
| from ..metainfo import Metrics | |||||
| from ..utils.registry import default_group | |||||
| from ..utils.tensor_utils import torch_nested_detach, torch_nested_numpify | |||||
| from .base import Metric | |||||
| from .builder import METRICS, MetricKeys | |||||
| @METRICS.register_module( | |||||
| group_key=default_group, module_name=Metrics.seq_cls_metric) | |||||
| class SequenceClassificationMetric(Metric): | |||||
| """The metric computation class for sequence classification classes. | |||||
| """ | |||||
| label_name = 'labels' | |||||
| def __init__(self): | |||||
| self.preds = [] | |||||
| self.labels = [] | |||||
| def add(self, outputs: Dict, inputs: Dict): | |||||
| ground_truths = inputs[SequenceClassificationMetric.label_name] | |||||
| eval_results = outputs[OutputKeys.LOGITS] | |||||
| self.preds.append( | |||||
| torch_nested_numpify(torch_nested_detach(eval_results))) | |||||
| self.labels.append( | |||||
| torch_nested_numpify(torch_nested_detach(ground_truths))) | |||||
| def evaluate(self): | |||||
| preds = np.concatenate(self.preds, axis=0) | |||||
| labels = np.concatenate(self.labels, axis=0) | |||||
| preds = np.argmax(preds, axis=1) | |||||
| return { | |||||
| MetricKeys.ACCURACY: | |||||
| (preds == labels).astype(np.float32).mean().item() | |||||
| } | |||||
| @@ -0,0 +1,23 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import Dict | |||||
| import torch | |||||
| from .base import Model | |||||
| class TorchModel(Model, torch.nn.Module): | |||||
| """ Base model interface for pytorch | |||||
| """ | |||||
| def __init__(self, model_dir=None, *args, **kwargs): | |||||
| # init reference: https://stackoverflow.com/questions\ | |||||
| # /9575409/calling-parent-class-init-with-multiple-inheritance-whats-the-right-way | |||||
| super().__init__(model_dir) | |||||
| super(Model, self).__init__() | |||||
| def forward(self, inputs: Dict[str, | |||||
| torch.Tensor]) -> Dict[str, torch.Tensor]: | |||||
| raise NotImplementedError | |||||
| @@ -108,7 +108,7 @@ class CLIPForMultiModalEmbedding(Model): | |||||
| return text_ids_tensor, text_mask_tensor | return text_ids_tensor, text_mask_tensor | ||||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.outputs import OutputKeys | |||||
| output = { | output = { | ||||
| OutputKeys.IMG_EMBEDDING: None, | OutputKeys.IMG_EMBEDDING: None, | ||||
| OutputKeys.TEXT_EMBEDDING: None | OutputKeys.TEXT_EMBEDDING: None | ||||
| @@ -134,7 +134,7 @@ class CLIPForMultiModalEmbedding(Model): | |||||
| img_embedding = self.clip_model( | img_embedding = self.clip_model( | ||||
| input_data=img_tensor, input_type='img') | input_data=img_tensor, input_type='img') | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.outputs import OutputKeys | |||||
| output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy() | output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy() | ||||
| if 'text' in input and input['text'] is not None: | if 'text' in input and input['text'] is not None: | ||||
| @@ -76,7 +76,7 @@ class OfaForImageCaptioning(Model): | |||||
| input = fairseq.utils.move_to_cuda(input, device=self._device) | input = fairseq.utils.move_to_cuda(input, device=self._device) | ||||
| results, _ = self.eval_caption(self.task, self.generator, self.models, | results, _ = self.eval_caption(self.task, self.generator, self.models, | ||||
| input) | input) | ||||
| from ...pipelines.outputs import OutputKeys | |||||
| from modelscope.outputs import OutputKeys | |||||
| return { | return { | ||||
| 'image_id': results[0]['image_id'], | 'image_id': results[0]['image_id'], | ||||
| OutputKeys.CAPTION: results[0][OutputKeys.CAPTION] | OutputKeys.CAPTION: results[0][OutputKeys.CAPTION] | ||||
| @@ -7,7 +7,10 @@ import torch | |||||
| from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel | from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel | ||||
| from torch import nn | from torch import nn | ||||
| from modelscope.metainfo import Models | |||||
| from modelscope.utils.constant import Tasks | |||||
| from ..base import Model | from ..base import Model | ||||
| from ..builder import MODELS | |||||
| class SbertTextClassfier(SbertPreTrainedModel): | class SbertTextClassfier(SbertPreTrainedModel): | ||||
| @@ -20,7 +23,11 @@ class SbertTextClassfier(SbertPreTrainedModel): | |||||
| self.dropout = nn.Dropout(config.hidden_dropout_prob) | self.dropout = nn.Dropout(config.hidden_dropout_prob) | ||||
| self.classifier = nn.Linear(config.hidden_size, config.num_labels) | self.classifier = nn.Linear(config.hidden_size, config.num_labels) | ||||
| def forward(self, input_ids=None, token_type_ids=None): | |||||
| def forward(self, | |||||
| input_ids=None, | |||||
| token_type_ids=None, | |||||
| labels=None, | |||||
| **kwargs): | |||||
| outputs = self.encoder( | outputs = self.encoder( | ||||
| input_ids, | input_ids, | ||||
| token_type_ids=token_type_ids, | token_type_ids=token_type_ids, | ||||
| @@ -29,6 +36,10 @@ class SbertTextClassfier(SbertPreTrainedModel): | |||||
| pooled_output = outputs[1] | pooled_output = outputs[1] | ||||
| pooled_output = self.dropout(pooled_output) | pooled_output = self.dropout(pooled_output) | ||||
| logits = self.classifier(pooled_output) | logits = self.classifier(pooled_output) | ||||
| if labels is not None: | |||||
| loss_fct = nn.CrossEntropyLoss() | |||||
| loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) | |||||
| return {'logits': logits, 'loss': loss} | |||||
| return {'logits': logits} | return {'logits': logits} | ||||
| @@ -4,6 +4,7 @@ from modelscope.utils.constant import Tasks | |||||
| class OutputKeys(object): | class OutputKeys(object): | ||||
| LOGITS = 'logits' | |||||
| SCORES = 'scores' | SCORES = 'scores' | ||||
| LABEL = 'label' | LABEL = 'label' | ||||
| LABELS = 'labels' | LABELS = 'labels' | ||||
| @@ -7,10 +7,10 @@ import soundfile as sf | |||||
| import torch | import torch | ||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from ..base import Input, Pipeline | from ..base import Input, Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| def audio_norm(x): | def audio_norm(x): | ||||
| @@ -9,12 +9,12 @@ import yaml | |||||
| from modelscope.fileio import File | from modelscope.fileio import File | ||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.preprocessors.audio import LinearAECAndFbank | from modelscope.preprocessors.audio import LinearAECAndFbank | ||||
| from modelscope.utils.constant import ModelFile, Tasks | from modelscope.utils.constant import ModelFile, Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| logger = get_logger() | logger = get_logger() | ||||
| @@ -5,9 +5,9 @@ import numpy as np | |||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.models import Model | from modelscope.models import Model | ||||
| from modelscope.models.audio.tts import SambertHifigan | from modelscope.models.audio.tts import SambertHifigan | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input, InputModel, Pipeline | from modelscope.pipelines.base import Input, InputModel, Pipeline | ||||
| from modelscope.pipelines.builder import PIPELINES | from modelscope.pipelines.builder import PIPELINES | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Fields, Tasks | from modelscope.utils.constant import Fields, Tasks | ||||
| __all__ = ['TextToSpeechSambertHifiganPipeline'] | __all__ = ['TextToSpeechSambertHifiganPipeline'] | ||||
| @@ -7,10 +7,10 @@ from typing import Any, Dict, Generator, List, Union | |||||
| from modelscope.hub.snapshot_download import snapshot_download | from modelscope.hub.snapshot_download import snapshot_download | ||||
| from modelscope.models.base import Model | from modelscope.models.base import Model | ||||
| from modelscope.msdatasets import MsDataset | from modelscope.msdatasets import MsDataset | ||||
| from modelscope.outputs import TASK_OUTPUTS | |||||
| from modelscope.preprocessors import Preprocessor | from modelscope.preprocessors import Preprocessor | ||||
| from modelscope.utils.config import Config | from modelscope.utils.config import Config | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from .outputs import TASK_OUTPUTS | |||||
| from .util import is_model, is_official_hub_path | from .util import is_model, is_official_hub_path | ||||
| Tensor = Union['torch.Tensor', 'tf.Tensor'] | Tensor = Union['torch.Tensor', 'tf.Tensor'] | ||||
| @@ -6,6 +6,7 @@ import torch | |||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.models.cv.action_recognition.models import BaseVideoModel | from modelscope.models.cv.action_recognition.models import BaseVideoModel | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input | from modelscope.pipelines.base import Input | ||||
| from modelscope.preprocessors.video import ReadVideoData | from modelscope.preprocessors.video import ReadVideoData | ||||
| from modelscope.utils.config import Config | from modelscope.utils.config import Config | ||||
| @@ -13,7 +14,6 @@ from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| logger = get_logger() | logger = get_logger() | ||||
| @@ -10,13 +10,13 @@ from torchvision import transforms | |||||
| from modelscope.hub.snapshot_download import snapshot_download | from modelscope.hub.snapshot_download import snapshot_download | ||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.models.cv.animal_recognition import resnet | from modelscope.models.cv.animal_recognition import resnet | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input | from modelscope.pipelines.base import Input | ||||
| from modelscope.preprocessors import load_image | from modelscope.preprocessors import load_image | ||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| logger = get_logger() | logger = get_logger() | ||||
| @@ -11,8 +11,8 @@ from PIL import Image | |||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.models.cv.cmdssl_video_embedding.resnet2p1d import \ | from modelscope.models.cv.cmdssl_video_embedding.resnet2p1d import \ | ||||
| resnet26_2p1d | resnet26_2p1d | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input | from modelscope.pipelines.base import Input | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.config import Config | from modelscope.utils.config import Config | ||||
| from modelscope.utils.constant import ModelFile, Tasks | from modelscope.utils.constant import ModelFile, Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| @@ -11,13 +11,13 @@ from modelscope.models.cv.cartoon.facelib.facer import FaceAna | |||||
| from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import ( | from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import ( | ||||
| get_reference_facial_points, warp_and_crop_face) | get_reference_facial_points, warp_and_crop_face) | ||||
| from modelscope.models.cv.cartoon.utils import get_f5p, padTo16x, resize_size | from modelscope.models.cv.cartoon.utils import get_f5p, padTo16x, resize_size | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input | from modelscope.pipelines.base import Input | ||||
| from modelscope.preprocessors import load_image | from modelscope.preprocessors import load_image | ||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| if tf.__version__ >= '2.0': | if tf.__version__ >= '2.0': | ||||
| tf = tf.compat.v1 | tf = tf.compat.v1 | ||||
| @@ -6,13 +6,13 @@ import numpy as np | |||||
| import PIL | import PIL | ||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input | from modelscope.pipelines.base import Input | ||||
| from modelscope.preprocessors import load_image | from modelscope.preprocessors import load_image | ||||
| from modelscope.utils.constant import ModelFile, Tasks | from modelscope.utils.constant import ModelFile, Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| logger = get_logger() | logger = get_logger() | ||||
| @@ -7,13 +7,13 @@ import PIL | |||||
| import tensorflow as tf | import tensorflow as tf | ||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input | from modelscope.pipelines.base import Input | ||||
| from modelscope.preprocessors import load_image | from modelscope.preprocessors import load_image | ||||
| from modelscope.utils.constant import ModelFile, Tasks | from modelscope.utils.constant import ModelFile, Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | ||||
| if tf.__version__ >= '2.0': | if tf.__version__ >= '2.0': | ||||
| @@ -3,12 +3,12 @@ from typing import Any, Dict | |||||
| import torch | import torch | ||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input | from modelscope.pipelines.base import Input | ||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from ..base import Model, Pipeline | from ..base import Model, Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| logger = get_logger() | logger = get_logger() | ||||
| @@ -2,6 +2,7 @@ | |||||
| from typing import Any, Dict, Union | from typing import Any, Dict, Union | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import SpaceForDialogIntent | from ...models.nlp import SpaceForDialogIntent | ||||
| @@ -9,7 +10,6 @@ from ...preprocessors import DialogIntentPredictionPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['DialogIntentPredictionPipeline'] | __all__ = ['DialogIntentPredictionPipeline'] | ||||
| @@ -2,6 +2,7 @@ | |||||
| from typing import Dict, Union | from typing import Dict, Union | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import SpaceForDialogModeling | from ...models.nlp import SpaceForDialogModeling | ||||
| @@ -9,7 +10,6 @@ from ...preprocessors import DialogModelingPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline, Tensor | from ..base import Pipeline, Tensor | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['DialogModelingPipeline'] | __all__ = ['DialogModelingPipeline'] | ||||
| @@ -1,12 +1,12 @@ | |||||
| from typing import Any, Dict, Union | from typing import Any, Dict, Union | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model, SpaceForDialogStateTracking | from ...models import Model, SpaceForDialogStateTracking | ||||
| from ...preprocessors import DialogStateTrackingPreprocessor | from ...preprocessors import DialogStateTrackingPreprocessor | ||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['DialogStateTrackingPipeline'] | __all__ = ['DialogStateTrackingPipeline'] | ||||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Optional, Union | |||||
| import torch | import torch | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp.masked_language import MaskedLanguageModelBase | from ...models.nlp.masked_language import MaskedLanguageModelBase | ||||
| @@ -11,7 +12,6 @@ from ...utils.config import Config | |||||
| from ...utils.constant import ModelFile, Tasks | from ...utils.constant import ModelFile, Tasks | ||||
| from ..base import Pipeline, Tensor | from ..base import Pipeline, Tensor | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['FillMaskPipeline'] | __all__ = ['FillMaskPipeline'] | ||||
| _type_map = {'veco': 'roberta', 'sbert': 'bert'} | _type_map = {'veco': 'roberta', 'sbert': 'bert'} | ||||
| @@ -4,6 +4,7 @@ from typing import Any, Dict, Union | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import SbertForNLI | from ...models.nlp import SbertForNLI | ||||
| @@ -11,7 +12,6 @@ from ...preprocessors import NLIPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['NLIPipeline'] | __all__ = ['NLIPipeline'] | ||||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Union | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import SbertForSentenceSimilarity | from ...models.nlp import SbertForSentenceSimilarity | ||||
| @@ -10,7 +11,6 @@ from ...preprocessors import SentenceSimilarityPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Input, Pipeline | from ..base import Input, Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['SentenceSimilarityPipeline'] | __all__ = ['SentenceSimilarityPipeline'] | ||||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Union | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import SbertForSentimentClassification | from ...models.nlp import SbertForSentimentClassification | ||||
| @@ -10,7 +11,6 @@ from ...preprocessors import SentimentClassificationPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['SentimentClassificationPipeline'] | __all__ = ['SentimentClassificationPipeline'] | ||||
| @@ -4,12 +4,12 @@ import numpy as np | |||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.models.nlp import BertForSequenceClassification | from modelscope.models.nlp import BertForSequenceClassification | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.preprocessors import SequenceClassificationPreprocessor | from modelscope.preprocessors import SequenceClassificationPreprocessor | ||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from ...models import Model | from ...models import Model | ||||
| from ..base import Input, Pipeline | from ..base import Input, Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['SequenceClassificationPipeline'] | __all__ = ['SequenceClassificationPipeline'] | ||||
| @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union | |||||
| import torch | import torch | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import PalmForTextGeneration | from ...models.nlp import PalmForTextGeneration | ||||
| @@ -9,7 +10,6 @@ from ...preprocessors import TextGenerationPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline, Tensor | from ..base import Pipeline, Tensor | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['TextGenerationPipeline'] | __all__ = ['TextGenerationPipeline'] | ||||
| @@ -4,6 +4,7 @@ from typing import Any, Dict | |||||
| import numpy as np | import numpy as np | ||||
| import tensorflow as tf | import tensorflow as tf | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...hub.snapshot_download import snapshot_download | from ...hub.snapshot_download import snapshot_download | ||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models.nlp import CsanmtForTranslation | from ...models.nlp import CsanmtForTranslation | ||||
| @@ -11,7 +12,6 @@ from ...utils.constant import ModelFile, Tasks | |||||
| from ...utils.logger import get_logger | from ...utils.logger import get_logger | ||||
| from ..base import Pipeline, Tensor | from ..base import Pipeline, Tensor | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| if tf.__version__ >= '2.0': | if tf.__version__ >= '2.0': | ||||
| tf = tf.compat.v1 | tf = tf.compat.v1 | ||||
| @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union | |||||
| import torch | import torch | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import SbertForTokenClassification | from ...models.nlp import SbertForTokenClassification | ||||
| @@ -9,7 +10,6 @@ from ...preprocessors import TokenClassificationPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline, Tensor | from ..base import Pipeline, Tensor | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['WordSegmentationPipeline'] | __all__ = ['WordSegmentationPipeline'] | ||||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Union | |||||
| import torch | import torch | ||||
| from scipy.special import softmax | from scipy.special import softmax | ||||
| from modelscope.outputs import OutputKeys | |||||
| from ...metainfo import Pipelines | from ...metainfo import Pipelines | ||||
| from ...models import Model | from ...models import Model | ||||
| from ...models.nlp import SbertForZeroShotClassification | from ...models.nlp import SbertForZeroShotClassification | ||||
| @@ -10,7 +11,6 @@ from ...preprocessors import ZeroShotClassificationPreprocessor | |||||
| from ...utils.constant import Tasks | from ...utils.constant import Tasks | ||||
| from ..base import Pipeline | from ..base import Pipeline | ||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from ..outputs import OutputKeys | |||||
| __all__ = ['ZeroShotClassificationPipeline'] | __all__ = ['ZeroShotClassificationPipeline'] | ||||
| @@ -8,6 +8,7 @@ from transformers import AutoTokenizer | |||||
| from ..metainfo import Preprocessors | from ..metainfo import Preprocessors | ||||
| from ..models import Model | from ..models import Model | ||||
| from ..utils.constant import Fields, InputFields | from ..utils.constant import Fields, InputFields | ||||
| from ..utils.hub import parse_label_mapping | |||||
| from ..utils.type_assert import type_assert | from ..utils.type_assert import type_assert | ||||
| from .base import Preprocessor | from .base import Preprocessor | ||||
| from .builder import PREPROCESSORS | from .builder import PREPROCESSORS | ||||
| @@ -115,7 +116,8 @@ class SentenceSimilarityPreprocessor(NLPPreprocessorBase): | |||||
| def __init__(self, model_dir: str, *args, **kwargs): | def __init__(self, model_dir: str, *args, **kwargs): | ||||
| kwargs['truncation'] = True | kwargs['truncation'] = True | ||||
| kwargs['padding'] = False | |||||
| kwargs['padding'] = False if 'padding' not in kwargs else kwargs[ | |||||
| 'padding'] | |||||
| kwargs['return_tensors'] = 'pt' | kwargs['return_tensors'] = 'pt' | ||||
| kwargs['max_length'] = kwargs.pop('sequence_length', 128) | kwargs['max_length'] = kwargs.pop('sequence_length', 128) | ||||
| super().__init__(model_dir, *args, **kwargs) | super().__init__(model_dir, *args, **kwargs) | ||||
| @@ -143,6 +145,7 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||||
| self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) | self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) | ||||
| print(f'this is the tokenzier {self.tokenizer}') | print(f'this is the tokenzier {self.tokenizer}') | ||||
| self.label2id = parse_label_mapping(self.model_dir) | |||||
| @type_assert(object, (str, tuple, Dict)) | @type_assert(object, (str, tuple, Dict)) | ||||
| def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: | def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: | ||||
| @@ -164,7 +167,7 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||||
| 'id': [], | 'id': [], | ||||
| 'input_ids': [], | 'input_ids': [], | ||||
| 'attention_mask': [], | 'attention_mask': [], | ||||
| 'token_type_ids': [] | |||||
| 'token_type_ids': [], | |||||
| } | } | ||||
| max_seq_length = self.sequence_length | max_seq_length = self.sequence_length | ||||
| @@ -186,6 +189,29 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||||
| return rst | return rst | ||||
| @PREPROCESSORS.register_module( | |||||
| Fields.nlp, module_name='bert-seq-cls-tokenizer-finetune') | |||||
| class SentenceSimilarityFinetunePreprocessor(SentenceSimilarityPreprocessor): | |||||
| """Sentence similarity preprocessor in the finetune scenario | |||||
| Mainly added the label mapping procedure. | |||||
| """ | |||||
| def __init__(self, model_dir: str, *args, **kwargs): | |||||
| kwargs['padding'] = 'max_length' | |||||
| super().__init__(model_dir, *args, **kwargs) | |||||
| self.label2id = parse_label_mapping(self.model_dir) | |||||
| @type_assert(object, (str, tuple, Dict)) | |||||
| def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: | |||||
| rst = super().__call__(data) | |||||
| rst = {k: v.squeeze() for k, v in rst.items()} | |||||
| if self.label2id is not None and 'label' in data: | |||||
| rst['labels'] = [] | |||||
| rst['labels'].append(self.label2id[str(data['label'])]) | |||||
| return rst | |||||
| @PREPROCESSORS.register_module( | @PREPROCESSORS.register_module( | ||||
| Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer) | Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer) | ||||
| class TextGenerationPreprocessor(NLPPreprocessorBase): | class TextGenerationPreprocessor(NLPPreprocessorBase): | ||||
| @@ -0,0 +1,3 @@ | |||||
| from .base import TaskDataset | |||||
| from .builder import build_task_dataset | |||||
| from .torch_base_dataset import TorchTaskDataset | |||||
| @@ -0,0 +1,48 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from abc import ABC, abstractmethod | |||||
| from typing import Any, List, Tuple | |||||
| class TaskDataset(ABC): | |||||
| """The task dataset base class for all the task specific dataset processors. | |||||
| """ | |||||
| def __init__(self, | |||||
| datasets: Tuple[Any, List[Any]], | |||||
| mode, | |||||
| preprocessor=None, | |||||
| **kwargs): | |||||
| super().__init__() | |||||
| self.mode = mode | |||||
| self.preprocessor = preprocessor | |||||
| self._inner_dataset = self.compose_dataset(datasets) | |||||
| @abstractmethod | |||||
| def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: | |||||
| """Prepare a dataset. | |||||
| User can process the input datasets in a whole dataset perspective. | |||||
| This method also helps to merge several datasets to one. | |||||
| Args: | |||||
| datasets: The original dataset(s) | |||||
| Returns: A single dataset, which may be created after merging. | |||||
| """ | |||||
| pass | |||||
| @abstractmethod | |||||
| def preprocess_dataset(self, data): | |||||
| """Preprocess the data fetched from the inner_dataset. | |||||
| If the preprocessor is None, the original data will be returned, else the preprocessor will be called. | |||||
| User can override this method to implement custom logics. | |||||
| Args: | |||||
| data: The data fetched from the dataset. | |||||
| Returns: The processed data. | |||||
| """ | |||||
| pass | |||||
| @@ -0,0 +1,21 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from modelscope.utils.config import ConfigDict | |||||
| from modelscope.utils.registry import Registry, build_from_cfg | |||||
| TASK_DATASETS = Registry('task_datasets') | |||||
| def build_task_dataset(cfg: ConfigDict, | |||||
| task_name: str = None, | |||||
| default_args: dict = None): | |||||
| """ Build task specific dataset processor given model config dict and the task name. | |||||
| Args: | |||||
| cfg (:obj:`ConfigDict`): config dict for model object. | |||||
| task_name (str, optional): task name, refer to | |||||
| :obj:`Tasks` for more details | |||||
| default_args (dict, optional): Default initialization arguments. | |||||
| """ | |||||
| return build_from_cfg( | |||||
| cfg, TASK_DATASETS, group_key=task_name, default_args=default_args) | |||||
| @@ -0,0 +1,63 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import Any, List, Tuple | |||||
| from torch.utils.data import ConcatDataset, Dataset | |||||
| from .base import TaskDataset | |||||
| class TorchTaskDataset(TaskDataset, Dataset): | |||||
| """The task dataset base class for all the torch-based task processors. | |||||
| This base class is enough for most cases, except there are procedures which can not be executed in | |||||
| preprocessors and Datasets like dataset merging. | |||||
| """ | |||||
| def __init__(self, | |||||
| datasets: Tuple[Any, List[Any]], | |||||
| mode, | |||||
| preprocessor=None, | |||||
| **kwargs): | |||||
| TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs) | |||||
| def __getitem__(self, index) -> Any: | |||||
| return self.preprocess_dataset(self._inner_dataset[index]) | |||||
| def __len__(self): | |||||
| return len(self._inner_dataset) | |||||
| def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: | |||||
| """Prepare a dataset. | |||||
| User can process the input datasets in a whole dataset perspective. | |||||
| This method gives a default implementation of datasets merging, user can override this | |||||
| method to write custom logics. | |||||
| Args: | |||||
| datasets: The original dataset(s) | |||||
| Returns: A single dataset, which may be created after merging. | |||||
| """ | |||||
| if isinstance(datasets, List): | |||||
| if len(datasets) == 1: | |||||
| return datasets[0] | |||||
| elif len(datasets) > 1: | |||||
| return ConcatDataset(datasets) | |||||
| else: | |||||
| return datasets | |||||
| def preprocess_dataset(self, data): | |||||
| """Preprocess the data fetched from the inner_dataset. | |||||
| If the preprocessor is None, the original data will be returned, else the preprocessor will be called. | |||||
| User can override this method to implement custom logics. | |||||
| Args: | |||||
| data: The data fetched from the dataset. | |||||
| Returns: The processed data. | |||||
| """ | |||||
| return self.preprocessor( | |||||
| data) if self.preprocessor is not None else data | |||||
| @@ -1,3 +1,4 @@ | |||||
| from .base import DummyTrainer | from .base import DummyTrainer | ||||
| from .builder import build_trainer | from .builder import build_trainer | ||||
| from .nlp import SequenceClassificationTrainer | from .nlp import SequenceClassificationTrainer | ||||
| from .trainer import EpochBasedTrainer | |||||
| @@ -1,10 +1,12 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | # Copyright (c) Alibaba, Inc. and its affiliates. | ||||
| import time | |||||
| from abc import ABC, abstractmethod | from abc import ABC, abstractmethod | ||||
| from typing import Callable, Dict, List, Optional, Tuple, Union | from typing import Callable, Dict, List, Optional, Tuple, Union | ||||
| from modelscope.trainers.builder import TRAINERS | from modelscope.trainers.builder import TRAINERS | ||||
| from modelscope.utils.config import Config | from modelscope.utils.config import Config | ||||
| from .utils.log_buffer import LogBuffer | |||||
| class BaseTrainer(ABC): | class BaseTrainer(ABC): | ||||
| @@ -27,6 +29,8 @@ class BaseTrainer(ABC): | |||||
| self.args = self.cfg.to_args(arg_parse_fn) | self.args = self.cfg.to_args(arg_parse_fn) | ||||
| else: | else: | ||||
| self.args = None | self.args = None | ||||
| self.log_buffer = LogBuffer() | |||||
| self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) | |||||
| @abstractmethod | @abstractmethod | ||||
| def train(self, *args, **kwargs): | def train(self, *args, **kwargs): | ||||
| @@ -5,9 +5,10 @@ from modelscope.utils.constant import Tasks | |||||
| from modelscope.utils.registry import Registry, build_from_cfg | from modelscope.utils.registry import Registry, build_from_cfg | ||||
| TRAINERS = Registry('trainers') | TRAINERS = Registry('trainers') | ||||
| HOOKS = Registry('hooks') | |||||
| def build_trainer(name: str = None, default_args: dict = None): | |||||
| def build_trainer(name: str = 'EpochBasedTrainer', default_args: dict = None): | |||||
| """ build trainer given a trainer name | """ build trainer given a trainer name | ||||
| Args: | Args: | ||||
| @@ -15,7 +16,5 @@ def build_trainer(name: str = None, default_args: dict = None): | |||||
| will be used. | will be used. | ||||
| default_args (dict, optional): Default initialization arguments. | default_args (dict, optional): Default initialization arguments. | ||||
| """ | """ | ||||
| if name is None: | |||||
| name = 'Trainer' | |||||
| cfg = dict(type=name) | cfg = dict(type=name) | ||||
| return build_from_cfg(cfg, TRAINERS, default_args=default_args) | return build_from_cfg(cfg, TRAINERS, default_args=default_args) | ||||
| @@ -0,0 +1,16 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .builder import HOOKS, build_hook | |||||
| from .checkpoint_hook import CheckpointHook | |||||
| from .evaluation_hook import EvaluationHook | |||||
| from .hook import Hook | |||||
| from .iter_timer_hook import IterTimerHook | |||||
| from .logger.text_logger_hook import TextLoggerHook | |||||
| from .lr_scheduler_hook import LrSchedulerHook | |||||
| from .optimizer_hook import OptimizerHook | |||||
| from .priority import Priority | |||||
| __all__ = [ | |||||
| 'Hook', 'HOOKS', 'CheckpointHook', 'EvaluationHook', 'LrSchedulerHook', | |||||
| 'OptimizerHook', 'Priority', 'build_hook', 'TextLoggerHook', | |||||
| 'IterTimerHook' | |||||
| ] | |||||
| @@ -0,0 +1,9 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||||
| HOOKS = Registry('hooks') | |||||
| def build_hook(cfg, default_args=None): | |||||
| return build_from_cfg( | |||||
| cfg, HOOKS, group_key=default_group, default_args=default_args) | |||||
| @@ -0,0 +1,92 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| from modelscope import __version__ | |||||
| from modelscope.utils.checkpoint import save_checkpoint | |||||
| from modelscope.utils.logger import get_logger | |||||
| from modelscope.utils.torch_utils import get_dist_info | |||||
| from .builder import HOOKS | |||||
| from .hook import Hook | |||||
| from .priority import Priority | |||||
| @HOOKS.register_module() | |||||
| class CheckpointHook(Hook): | |||||
| """Save checkpoints periodically. | |||||
| Args: | |||||
| interval (int): The frequency to save model. If `by_epoch=True`, | |||||
| it means the number of epochs, else means the number of iterations | |||||
| by_epoch (bool): Saving checkpoints by epoch or by iteration. | |||||
| save_optimizer (bool): Whether to save optimizer state dict. Default: True. | |||||
| save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir` | |||||
| save_last (bool): Whether to save the last checkpoint. Default: True. | |||||
| """ | |||||
| PRIORITY = Priority.NORMAL | |||||
| def __init__(self, | |||||
| interval=0, | |||||
| by_epoch=True, | |||||
| save_optimizer=True, | |||||
| save_dir=None, | |||||
| save_last=True): | |||||
| self.interval = interval | |||||
| self.by_epoch = by_epoch | |||||
| self.save_optimizer = save_optimizer | |||||
| self.save_dir = save_dir | |||||
| self.save_last = save_last | |||||
| def before_run(self, trainer): | |||||
| if not self.save_dir: | |||||
| self.save_dir = trainer.work_dir | |||||
| if not hasattr(trainer, 'logger'): | |||||
| self.logger = get_logger(__name__) | |||||
| else: | |||||
| self.logger = trainer.logger | |||||
| self.logger.info(f'Checkpoints will be saved to {self.save_dir}') | |||||
| def after_train_epoch(self, trainer): | |||||
| if not self.by_epoch: | |||||
| return | |||||
| if self._should_save(trainer): | |||||
| self.logger.info(f'Saving checkpoint at {trainer.epoch + 1} epoch') | |||||
| self._save_checkpoint(trainer) | |||||
| def _save_checkpoint(self, trainer): | |||||
| if self.by_epoch: | |||||
| cur_save_name = os.path.join(self.save_dir, | |||||
| f'epoch_{trainer.epoch + 1}.pth') | |||||
| else: | |||||
| cur_save_name = os.path.join(self.save_dir, | |||||
| f'iter_{trainer.epoch + 1}.pth') | |||||
| rank, _ = get_dist_info() | |||||
| if rank == 0: | |||||
| save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) | |||||
| def after_train_iter(self, trainer): | |||||
| if self.by_epoch: | |||||
| return | |||||
| if self._should_save(trainer): | |||||
| self.logger.info( | |||||
| f'Saving checkpoint at {trainer.iter + 1} iterations') | |||||
| self._save_checkpoint(trainer) | |||||
| def _should_save(self, trainer): | |||||
| if self.by_epoch: | |||||
| check_last = self.is_last_epoch | |||||
| check_frequency = self.every_n_epochs | |||||
| else: | |||||
| check_last = self.is_last_iter | |||||
| check_frequency = self.every_n_iters | |||||
| if check_frequency(trainer, | |||||
| self.interval) or (self.save_last | |||||
| and check_last(trainer)): | |||||
| return True | |||||
| return False | |||||
| @@ -0,0 +1,74 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .builder import HOOKS | |||||
| from .hook import Hook | |||||
| from .priority import Priority | |||||
| @HOOKS.register_module() | |||||
| class EvaluationHook(Hook): | |||||
| """Evaluation hook. | |||||
| Args: | |||||
| interval (int): Evaluation interval. | |||||
| by_epoch (bool): Evaluate by epoch or by iteration. | |||||
| start_idx (int | None, optional): The epoch/iterations validation begins. | |||||
| Default: None, validate every interval epochs/iterations from scratch. | |||||
| """ | |||||
| PRIORITY = Priority.NORMAL | |||||
| def __init__(self, interval=1, by_epoch=True, start_idx=None): | |||||
| assert interval > 0, 'interval must be a positive number' | |||||
| self.interval = interval | |||||
| self.start_idx = start_idx | |||||
| self.by_epoch = by_epoch | |||||
| def after_train_iter(self, trainer): | |||||
| """Called after every training iter to evaluate the results.""" | |||||
| if not self.by_epoch and self._should_evaluate(trainer): | |||||
| self.do_evaluate(trainer) | |||||
| def after_train_epoch(self, trainer): | |||||
| """Called after every training epoch to evaluate the results.""" | |||||
| if self.by_epoch and self._should_evaluate(trainer): | |||||
| self.do_evaluate(trainer) | |||||
| def do_evaluate(self, trainer): | |||||
| """Evaluate the results.""" | |||||
| eval_res = trainer.evaluate() | |||||
| for name, val in eval_res.items(): | |||||
| trainer.log_buffer.output[name] = val | |||||
| trainer.log_buffer.ready = True | |||||
| def _should_evaluate(self, trainer): | |||||
| """Judge whether to perform evaluation. | |||||
| Here is the rule to judge whether to perform evaluation: | |||||
| 1. It will not perform evaluation during the epoch/iteration interval, | |||||
| which is determined by ``self.interval``. | |||||
| 2. It will not perform evaluation if the ``start_idx`` is larger than | |||||
| current epochs/iters. | |||||
| 3. It will not perform evaluation when current epochs/iters is larger than | |||||
| the ``start_idx`` but during epoch/iteration interval. | |||||
| Returns: | |||||
| bool: The flag indicating whether to perform evaluation. | |||||
| """ | |||||
| if self.by_epoch: | |||||
| current = trainer.epoch | |||||
| check_time = self.every_n_epochs | |||||
| else: | |||||
| current = trainer.iter | |||||
| check_time = self.every_n_iters | |||||
| if self.start_idx is None: | |||||
| if not check_time(trainer, self.interval): | |||||
| return False | |||||
| elif (current + 1) < self.start_idx: | |||||
| return False | |||||
| else: | |||||
| if (current + 1 - self.start_idx) % self.interval: | |||||
| return False | |||||
| return True | |||||
| @@ -0,0 +1,208 @@ | |||||
| # Copyright (c) OpenMMLab. All rights reserved. | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from modelscope.utils.import_utils import is_method_overridden | |||||
| from .priority import Priority | |||||
| class Hook: | |||||
| """ | |||||
| The Hook base class of any modelscope trainer. You can build your own hook inherited from this class. | |||||
| """ | |||||
| # TODO @jiangnana.jnn use constant variable for stages | |||||
| stages = ('before_run', 'before_train_epoch', 'before_train_iter', | |||||
| 'after_train_iter', 'after_train_epoch', 'before_val_epoch', | |||||
| 'before_val_iter', 'after_val_iter', 'after_val_epoch', | |||||
| 'after_run') | |||||
| PRIORITY = Priority.NORMAL | |||||
| def before_run(self, trainer): | |||||
| """ | |||||
| Will be called before any loop begins. | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| pass | |||||
| def after_run(self, trainer): | |||||
| """ | |||||
| Will be called after all loops end. | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| pass | |||||
| def before_epoch(self, trainer): | |||||
| """ | |||||
| Will be called before every epoch begins. | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| pass | |||||
| def after_epoch(self, trainer): | |||||
| """ | |||||
| Will be called after every epoch ends. | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| pass | |||||
| def before_iter(self, trainer): | |||||
| """ | |||||
| Will be called before every loop begins. | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| pass | |||||
| def after_iter(self, trainer): | |||||
| """ | |||||
| Will be called after every loop ends. | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| pass | |||||
| def before_train_epoch(self, trainer): | |||||
| """ | |||||
| Will be called before every train epoch begins. Default call ``self.before_epoch`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.before_epoch(trainer) | |||||
| def before_val_epoch(self, trainer): | |||||
| """ | |||||
| Will be called before every validation epoch begins. Default call ``self.before_epoch`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.before_epoch(trainer) | |||||
| def after_train_epoch(self, trainer): | |||||
| """ | |||||
| Will be called after every train epoch ends. Default call ``self.after_epoch`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.after_epoch(trainer) | |||||
| def after_val_epoch(self, trainer): | |||||
| """ | |||||
| Will be called after every validation epoch ends. Default call ``self.after_epoch`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.after_epoch(trainer) | |||||
| def before_train_iter(self, trainer): | |||||
| """ | |||||
| Will be called before every train loop begins. Default call ``self.before_iter`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.before_iter(trainer) | |||||
| def before_val_iter(self, trainer): | |||||
| """ | |||||
| Will be called before every validation loop begins. Default call ``self.before_iter`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.before_iter(trainer) | |||||
| def after_train_iter(self, trainer): | |||||
| """ | |||||
| Will be called after every train loop ends. Default call ``self.after_iter`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.after_iter(trainer) | |||||
| def after_val_iter(self, trainer): | |||||
| """ | |||||
| Will be called after every validation loop ends. Default call ``self.after_iter`` | |||||
| Args: | |||||
| trainer: The trainer instance. | |||||
| Returns: None | |||||
| """ | |||||
| self.after_iter(trainer) | |||||
| def every_n_epochs(self, trainer, n): | |||||
| """ | |||||
| Whether to reach every ``n`` epochs | |||||
| Returns: bool | |||||
| """ | |||||
| return (trainer.epoch + 1) % n == 0 if n > 0 else False | |||||
| def every_n_iters(self, trainer, n): | |||||
| """ | |||||
| Whether to reach every ``n`` iterations | |||||
| Returns: bool | |||||
| """ | |||||
| return (trainer.iter + 1) % n == 0 if n > 0 else False | |||||
| def end_of_epoch(self, trainer): | |||||
| """ | |||||
| Whether to reach the end of every epoch | |||||
| Returns: bool | |||||
| """ | |||||
| return trainer.inner_iter + 1 == len(trainer.data_loader) | |||||
| def is_last_epoch(self, trainer): | |||||
| """ | |||||
| Whether to reach the last epoch | |||||
| Returns: bool | |||||
| """ | |||||
| return trainer.epoch + 1 == trainer._max_epochs | |||||
| def is_last_iter(self, trainer): | |||||
| """ | |||||
| Whether to reach the last iteration in the entire training process | |||||
| Returns: bool | |||||
| """ | |||||
| return trainer.iter + 1 == trainer._max_iters | |||||
| def get_triggered_stages(self): | |||||
| trigger_stages = set() | |||||
| for stage in Hook.stages: | |||||
| if is_method_overridden(stage, Hook, self): | |||||
| trigger_stages.add(stage) | |||||
| return [stage for stage in Hook.stages if stage in trigger_stages] | |||||
| @@ -0,0 +1,22 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import time | |||||
| from .builder import HOOKS | |||||
| from .hook import Hook | |||||
| from .priority import Priority | |||||
| @HOOKS.register_module() | |||||
| class IterTimerHook(Hook): | |||||
| PRIORITY = Priority.LOW | |||||
| def before_epoch(self, trainer): | |||||
| self.start_time = time.time() | |||||
| def before_iter(self, trainer): | |||||
| trainer.log_buffer.update( | |||||
| {'data_load_time': time.time() - self.start_time}) | |||||
| def after_iter(self, trainer): | |||||
| trainer.log_buffer.update({'time': time.time() - self.start_time}) | |||||
| self.start_time = time.time() | |||||
| @@ -0,0 +1,6 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from modelscope.trainers.utils.log_buffer import LogBuffer | |||||
| from .base import LoggerHook | |||||
| from .text_logger_hook import TextLoggerHook | |||||
| __all__ = ['TextLoggerHook', 'LoggerHook', 'LogBuffer'] | |||||
| @@ -0,0 +1,115 @@ | |||||
| # Copyright (c) OpenMMLab. All rights reserved. | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import numbers | |||||
| from abc import ABCMeta, abstractmethod | |||||
| import numpy as np | |||||
| import torch | |||||
| from modelscope.trainers.hooks.hook import Hook | |||||
| from ..priority import Priority | |||||
| class LoggerHook(Hook): | |||||
| """Base class for logger hooks. | |||||
| Args: | |||||
| interval (int): Logging interval (every k iterations). | |||||
| ignore_last (bool): Ignore the log of last iterations in each epoch | |||||
| if less than `interval`. | |||||
| reset_flag (bool): Whether to clear the output buffer after logging. | |||||
| by_epoch (bool): Whether EpochBasedtrainer is used. | |||||
| """ | |||||
| __metaclass__ = ABCMeta | |||||
| PRIORITY = Priority.VERY_LOW | |||||
| def __init__(self, | |||||
| interval=10, | |||||
| ignore_last=True, | |||||
| reset_flag=False, | |||||
| by_epoch=True): | |||||
| self.interval = interval | |||||
| self.ignore_last = ignore_last | |||||
| self.reset_flag = reset_flag | |||||
| self.by_epoch = by_epoch | |||||
| @abstractmethod | |||||
| def log(self, trainer): | |||||
| pass | |||||
| @staticmethod | |||||
| def is_scalar(val, include_np=True, include_torch=True): | |||||
| """Tell the input variable is a scalar or not. | |||||
| Args: | |||||
| val: Input variable. | |||||
| include_np (bool): Whether to treat 0-d np.ndarray as a scalar. | |||||
| include_torch (bool): Whether to treat 0-d torch.Tensor as a scalar. | |||||
| Returns: | |||||
| bool: True or False. | |||||
| """ | |||||
| if isinstance(val, numbers.Number): | |||||
| return True | |||||
| elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: | |||||
| return True | |||||
| elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: | |||||
| return True | |||||
| else: | |||||
| return False | |||||
| def get_epoch(self, trainer): | |||||
| if trainer.mode == 'train': | |||||
| epoch = trainer.epoch + 1 | |||||
| elif trainer.mode == 'val': | |||||
| # normal val mode | |||||
| # trainer.epoch += 1 has been done before val workflow | |||||
| epoch = trainer.epoch | |||||
| else: | |||||
| raise ValueError(f"trainer mode should be 'train' or 'val', " | |||||
| f'but got {trainer.mode}') | |||||
| return epoch | |||||
| def get_iter(self, trainer, inner_iter=False): | |||||
| """Get the current training iteration step.""" | |||||
| if self.by_epoch and inner_iter: | |||||
| current_iter = trainer.inner_iter + 1 | |||||
| else: | |||||
| current_iter = trainer.iter + 1 | |||||
| return current_iter | |||||
| def before_run(self, trainer): | |||||
| for hook in trainer.hooks[::-1]: | |||||
| if isinstance(hook, LoggerHook): | |||||
| hook.reset_flag = True | |||||
| break | |||||
| def before_epoch(self, trainer): | |||||
| trainer.log_buffer.clear() # clear logs of last epoch | |||||
| def after_train_iter(self, trainer): | |||||
| if self.by_epoch and self.every_n_epochs(trainer, self.interval): | |||||
| trainer.log_buffer.average(self.interval) | |||||
| elif not self.by_epoch and self.every_n_iters(trainer, self.interval): | |||||
| trainer.log_buffer.average(self.interval) | |||||
| elif self.end_of_epoch(trainer) and not self.ignore_last: | |||||
| # not precise but more stable | |||||
| trainer.log_buffer.average(self.interval) | |||||
| if trainer.log_buffer.ready: | |||||
| self.log(trainer) | |||||
| if self.reset_flag: | |||||
| trainer.log_buffer.clear_output() | |||||
| def after_train_epoch(self, trainer): | |||||
| if trainer.log_buffer.ready: | |||||
| self.log(trainer) | |||||
| if self.reset_flag: | |||||
| trainer.log_buffer.clear_output() | |||||
| def after_val_epoch(self, trainer): | |||||
| trainer.log_buffer.average() | |||||
| self.log(trainer) | |||||
| if self.reset_flag: | |||||
| trainer.log_buffer.clear_output() | |||||
| @@ -0,0 +1,159 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import datetime | |||||
| import os | |||||
| import os.path as osp | |||||
| from collections import OrderedDict | |||||
| import json | |||||
| import torch | |||||
| from torch import distributed as dist | |||||
| from modelscope.utils.torch_utils import get_dist_info | |||||
| from ..builder import HOOKS | |||||
| from .base import LoggerHook | |||||
| @HOOKS.register_module() | |||||
| class TextLoggerHook(LoggerHook): | |||||
| """Logger hook in text, Output log to both console and local json file. | |||||
| Args: | |||||
| by_epoch (bool, optional): Whether EpochBasedtrainer is used. | |||||
| Default: True. | |||||
| interval (int, optional): Logging interval (every k iterations). | |||||
| Default: 10. | |||||
| ignore_last (bool, optional): Ignore the log of last iterations in each | |||||
| epoch if less than :attr:`interval`. Default: True. | |||||
| reset_flag (bool, optional): Whether to clear the output buffer after | |||||
| logging. Default: False. | |||||
| out_dir (str): The directory to save log. If is None, use `trainer.work_dir` | |||||
| """ | |||||
| def __init__(self, | |||||
| by_epoch=True, | |||||
| interval=10, | |||||
| ignore_last=True, | |||||
| reset_flag=False, | |||||
| out_dir=None): | |||||
| super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, | |||||
| by_epoch) | |||||
| self.by_epoch = by_epoch | |||||
| self.time_sec_tot = 0 | |||||
| self.out_dir = out_dir | |||||
| self._logged_keys = [] # store the key has been logged | |||||
| def before_run(self, trainer): | |||||
| super(TextLoggerHook, self).before_run(trainer) | |||||
| if self.out_dir is None: | |||||
| self.out_dir = trainer.work_dir | |||||
| if not osp.exists(self.out_dir): | |||||
| os.makedirs(self.out_dir) | |||||
| trainer.logger.info('Text logs will be saved to {}'.format( | |||||
| self.out_dir)) | |||||
| self.start_iter = trainer.iter | |||||
| self.json_log_path = osp.join(self.out_dir, | |||||
| '{}.log.json'.format(trainer.timestamp)) | |||||
| if hasattr(trainer, 'meta') and trainer.meta is not None: | |||||
| self._dump_log(trainer.meta, trainer) | |||||
| def _get_max_memory(self, trainer): | |||||
| device = getattr(trainer.model, 'output_device', None) | |||||
| mem = torch.cuda.max_memory_allocated(device=device) | |||||
| mem_mb = torch.tensor([mem / (1024 * 1024)], | |||||
| dtype=torch.int, | |||||
| device=device) | |||||
| _, world_size = get_dist_info() | |||||
| if world_size > 1: | |||||
| dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) | |||||
| return mem_mb.item() | |||||
| def _log_info(self, log_dict, trainer): | |||||
| if log_dict['mode'] == 'train': | |||||
| if isinstance(log_dict['lr'], dict): | |||||
| lr_str = [] | |||||
| for k, val in log_dict['lr'].items(): | |||||
| lr_str.append(f'lr_{k}: {val:.3e}') | |||||
| lr_str = ' '.join(lr_str) | |||||
| else: | |||||
| lr_str = f'lr: {log_dict["lr"]:.3e}' | |||||
| if self.by_epoch: | |||||
| log_str = f'Epoch [{log_dict["epoch"]}][{log_dict["iter"]}/{len(trainer.data_loader)}]\t' | |||||
| else: | |||||
| log_str = f'Iter [{log_dict["iter"]}/{trainer.max_iters}]\t' | |||||
| log_str += f'{lr_str}, ' | |||||
| self._logged_keys.extend(['lr', 'mode', 'iter', 'epoch']) | |||||
| if 'time' in log_dict.keys(): | |||||
| self.time_sec_tot += (log_dict['time'] * self.interval) | |||||
| time_sec_avg = self.time_sec_tot / ( | |||||
| trainer.iter - self.start_iter + 1) | |||||
| eta_sec = time_sec_avg * (trainer.max_iters - trainer.iter - 1) | |||||
| eta_str = str(datetime.timedelta(seconds=int(eta_sec))) | |||||
| log_str += f'eta: {eta_str}, ' | |||||
| log_str += f'time: {log_dict["time"]:.3f}, data_load_time: {log_dict["data_load_time"]:.3f}, ' | |||||
| self._logged_keys.extend([ | |||||
| 'time', | |||||
| 'data_load_time', | |||||
| ]) | |||||
| else: | |||||
| # val/test time | |||||
| # here 1000 is the length of the val dataloader | |||||
| # by epoch: Epoch[val] [4][1000] | |||||
| # by iter: Iter[val] [1000] | |||||
| if self.by_epoch: | |||||
| log_str = f'Epoch({log_dict["mode"]}) [{log_dict["epoch"]}][{log_dict["iter"]}]\t' | |||||
| else: | |||||
| log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' | |||||
| self._logged_keys.extend(['mode', 'iter', 'epoch']) | |||||
| log_items = [] | |||||
| for name, val in log_dict.items(): | |||||
| if name in self._logged_keys: | |||||
| continue | |||||
| if isinstance(val, float): | |||||
| val = f'{val:.4f}' | |||||
| log_items.append(f'{name}: {val}') | |||||
| log_str += ', '.join(log_items) | |||||
| trainer.logger.info(log_str) | |||||
| def _dump_log(self, log_dict): | |||||
| # dump log in json format | |||||
| json_log = OrderedDict() | |||||
| for k, v in log_dict.items(): | |||||
| json_log[k] = self._round_float(v) | |||||
| rank, _ = get_dist_info() | |||||
| if rank == 0: | |||||
| with open(self.json_log_path, 'a+') as f: | |||||
| json.dump(json_log, f) | |||||
| f.write('\n') | |||||
| def _round_float(self, items, ndigits=5): | |||||
| if isinstance(items, list): | |||||
| return [self._round_float(item) for item in items] | |||||
| elif isinstance(items, float): | |||||
| return round(items, ndigits) | |||||
| else: | |||||
| return items | |||||
| def log(self, trainer): | |||||
| cur_iter = self.get_iter(trainer, inner_iter=True) | |||||
| log_dict = OrderedDict( | |||||
| mode=trainer.mode, epoch=self.get_epoch(trainer), iter=cur_iter) | |||||
| # statistic memory | |||||
| if torch.cuda.is_available(): | |||||
| log_dict['memory'] = self._get_max_memory(trainer) | |||||
| log_dict = dict(log_dict, **trainer.log_buffer.output) | |||||
| self._log_info(log_dict, trainer) | |||||
| self._dump_log(log_dict) | |||||
| return log_dict | |||||
| @@ -0,0 +1,71 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||||
| from .builder import HOOKS | |||||
| from .hook import Hook | |||||
| from .priority import Priority | |||||
| @HOOKS.register_module() | |||||
| class LrSchedulerHook(Hook): | |||||
| """Lr scheduler. | |||||
| Args: | |||||
| by_epoch (bool): Whether lr changes by epoch | |||||
| warmup (dict): warm up config | |||||
| """ | |||||
| PRIORITY = Priority.VERY_HIGH | |||||
| def __init__(self, by_epoch=True, warmup=None) -> None: | |||||
| super().__init__() | |||||
| self.by_epoch = by_epoch | |||||
| if not self.by_epoch: | |||||
| raise ValueError('We only support ``by_epoch=True`` now!') | |||||
| self.warmup = warmup | |||||
| self.warmup_lr_scheduler = None | |||||
| def before_run(self, trainer): | |||||
| if self.warmup is not None: | |||||
| assert isinstance(self.warmup, dict) and 'type' in self.warmup | |||||
| self.warmup_lr_scheduler = build_lr_scheduler( | |||||
| cfg=self.warmup, | |||||
| default_args={'base_scheduler': trainer.lr_scheduler}) | |||||
| def get_current_lr(self, trainer): | |||||
| import torch | |||||
| if isinstance(trainer.optimizer, torch.optim.Optimizer): | |||||
| lr = [group['lr'] for group in trainer.optimizer.param_groups] | |||||
| elif isinstance(trainer.optimizer, dict): | |||||
| lr = dict() | |||||
| for name, optim in trainer.optimizer.items(): | |||||
| lr[name] = [group['lr'] for group in optim.param_groups] | |||||
| else: | |||||
| raise RuntimeError( | |||||
| 'lr is not applicable because optimizer does not exist.') | |||||
| return lr | |||||
| def before_train_iter(self, trainer): | |||||
| trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) | |||||
| def before_train_epoch(self, trainer): | |||||
| if self.by_epoch: | |||||
| if self.warmup_lr_scheduler is not None: | |||||
| self.warmup_lr_scheduler.step() | |||||
| else: | |||||
| trainer.lr_scheduler.step() | |||||
| trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) | |||||
| def _get_log_lr(self, trainer): | |||||
| cur_lr = self.get_current_lr(trainer) | |||||
| # only record lr of the first param group | |||||
| if isinstance(cur_lr, list): | |||||
| lr = cur_lr[0] | |||||
| else: | |||||
| assert isinstance(cur_lr, dict) | |||||
| lr = {} | |||||
| for k, lr_ in cur_lr.items(): | |||||
| assert isinstance(lr_, list) | |||||
| lr.update({k: lr_[0]}) | |||||
| return lr | |||||
| @@ -0,0 +1,37 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from torch.nn.utils import clip_grad | |||||
| from .builder import HOOKS | |||||
| from .hook import Hook | |||||
| from .priority import Priority | |||||
| @HOOKS.register_module() | |||||
| class OptimizerHook(Hook): | |||||
| PRIORITY = Priority.ABOVE_NORMAL | |||||
| def __init__(self, grad_clip=None, loss_keys='loss') -> None: | |||||
| if isinstance(loss_keys, str): | |||||
| loss_keys = [loss_keys] | |||||
| assert isinstance(loss_keys, (tuple, list)) | |||||
| self.loss_keys = loss_keys | |||||
| self.grad_clip = grad_clip | |||||
| def clip_grads(self, params, **clip_args): | |||||
| params = list( | |||||
| filter(lambda p: p.requires_grad and p.grad is not None, params)) | |||||
| if len(params) > 0: | |||||
| return clip_grad.clip_grad_norm_(params, **clip_args) | |||||
| def after_train_iter(self, trainer): | |||||
| trainer.optimizer.zero_grad() | |||||
| for k in self.loss_keys: | |||||
| trainer.train_outputs[k].backward() | |||||
| clip_args = self.grad_clip | |||||
| if clip_args is not None: | |||||
| self.clip_grads(trainer.model.parameters(), **clip_args) | |||||
| trainer.optimizer.step() | |||||
| @@ -0,0 +1,62 @@ | |||||
| # Copyright (c) OpenMMLab. All rights reserved. | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from enum import Enum | |||||
| from typing import Union | |||||
| class Priority(Enum): | |||||
| """Hook priority levels. | |||||
| +--------------+------------+ | |||||
| | Level | Value | | |||||
| +==============+============+ | |||||
| | HIGHEST | 0 | | |||||
| +--------------+------------+ | |||||
| | VERY_HIGH | 10 | | |||||
| +--------------+------------+ | |||||
| | HIGH | 30 | | |||||
| +--------------+------------+ | |||||
| | ABOVE_NORMAL | 40 | | |||||
| +--------------+------------+ | |||||
| | NORMAL | 50 | | |||||
| +--------------+------------+ | |||||
| | BELOW_NORMAL | 60 | | |||||
| +--------------+------------+ | |||||
| | LOW | 70 | | |||||
| +--------------+------------+ | |||||
| | VERY_LOW | 90 | | |||||
| +--------------+------------+ | |||||
| | LOWEST | 100 | | |||||
| +--------------+------------+ | |||||
| """ | |||||
| HIGHEST = 0 | |||||
| VERY_HIGH = 10 | |||||
| HIGH = 30 | |||||
| ABOVE_NORMAL = 40 | |||||
| NORMAL = 50 | |||||
| BELOW_NORMAL = 60 | |||||
| LOW = 70 | |||||
| VERY_LOW = 90 | |||||
| LOWEST = 100 | |||||
| def get_priority(priority: Union[int, str, Priority]) -> int: | |||||
| """Get priority value. | |||||
| Args: | |||||
| priority (int or str or :obj:`Priority`): Priority. | |||||
| Returns: | |||||
| int: The priority value. | |||||
| """ | |||||
| if isinstance(priority, int): | |||||
| if priority < 0 or priority > 100: | |||||
| raise ValueError('priority must be between 0 and 100') | |||||
| return priority | |||||
| elif isinstance(priority, Priority): | |||||
| return priority.value | |||||
| elif isinstance(priority, str): | |||||
| return Priority[priority.upper()].value | |||||
| else: | |||||
| raise TypeError('priority must be an integer or Priority enum value') | |||||
| @@ -0,0 +1,8 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .builder import LR_SCHEDULER, build_lr_scheduler | |||||
| from .warmup import BaseWarmup, ConstantWarmup, ExponentialWarmup, LinearWarmup | |||||
| __all__ = [ | |||||
| 'LR_SCHEDULER', 'build_lr_scheduler', 'BaseWarmup', 'ConstantWarmup', | |||||
| 'LinearWarmup', 'ExponentialWarmup' | |||||
| ] | |||||
| @@ -0,0 +1,47 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import inspect | |||||
| from modelscope.utils.config import ConfigDict | |||||
| from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||||
| LR_SCHEDULER = Registry('lr scheduler') | |||||
| def build_lr_scheduler(cfg: ConfigDict, default_args: dict = None): | |||||
| """ build lr scheduler from given lr scheduler config dict | |||||
| Args: | |||||
| cfg (:obj:`ConfigDict`): config dict for lr scheduler object. | |||||
| default_args (dict, optional): Default initialization arguments. | |||||
| """ | |||||
| if cfg['type'].lower().endswith('warmup'): | |||||
| # build warmup lr scheduler | |||||
| if not hasattr(cfg, 'base_scheduler'): | |||||
| if default_args is None or ('base_scheduler' not in default_args): | |||||
| raise ValueError( | |||||
| 'Must provide ``base_scheduler`` which is an instance of ``torch.optim.lr_scheduler._LRScheduler`` ' | |||||
| 'for build warmup lr scheduler.') | |||||
| else: | |||||
| # build lr scheduler without warmup | |||||
| if not hasattr(cfg, 'optimizer'): | |||||
| if default_args is None or ('optimizer' not in default_args): | |||||
| raise ValueError( | |||||
| 'Must provide ``optimizer`` which is an instance of ``torch.optim.Optimizer`` ' | |||||
| 'for build lr scheduler') | |||||
| return build_from_cfg( | |||||
| cfg, LR_SCHEDULER, group_key=default_group, default_args=default_args) | |||||
| def register_torch_lr_scheduler(): | |||||
| from torch.optim import lr_scheduler | |||||
| from torch.optim.lr_scheduler import _LRScheduler | |||||
| members = inspect.getmembers(lr_scheduler) | |||||
| for name, obj in members: | |||||
| if inspect.isclass(obj) and issubclass(obj, _LRScheduler): | |||||
| LR_SCHEDULER.register_module(module_name=name, module_cls=obj) | |||||
| register_torch_lr_scheduler() | |||||
| @@ -0,0 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .base import BaseWarmup | |||||
| from .warmup import ConstantWarmup, ExponentialWarmup, LinearWarmup | |||||
| __all__ = ['BaseWarmup', 'ConstantWarmup', 'LinearWarmup', 'ExponentialWarmup'] | |||||
| @@ -0,0 +1,75 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from torch.optim.lr_scheduler import _LRScheduler | |||||
| class BaseWarmup(_LRScheduler): | |||||
| """Base warmup scheduler | |||||
| Args: | |||||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||||
| warmup_iters (int | list): Warmup iterations | |||||
| last_epoch (int): The index of last epoch. | |||||
| """ | |||||
| def __init__(self, | |||||
| base_scheduler, | |||||
| warmup_iters, | |||||
| last_epoch=-1, | |||||
| verbose=False): | |||||
| self.base_scheduler = base_scheduler | |||||
| self.warmup_iters = warmup_iters | |||||
| optimizer = self.base_scheduler.optimizer | |||||
| self._is_init_step = True | |||||
| super(BaseWarmup, self).__init__( | |||||
| optimizer, last_epoch=last_epoch, verbose=verbose) | |||||
| def get_lr(self): | |||||
| return self.base_scheduler.get_lr() | |||||
| def state_dict(self): | |||||
| self.base_scheduler.state_dict() | |||||
| def load_state_dict(self, state_dict): | |||||
| self.base_scheduler.load_state_dict(state_dict) | |||||
| def scale(self): | |||||
| """Scale the learning rates. | |||||
| """ | |||||
| scale_value = self.get_warmup_scale(self.base_scheduler._step_count | |||||
| - 1) | |||||
| if isinstance(scale_value, (int, float)): | |||||
| scale_value = [ | |||||
| scale_value for _ in range(len(self.optimizer.param_groups)) | |||||
| ] | |||||
| else: | |||||
| assert isinstance( | |||||
| scale_value, (list, tuple)), 'Only support list or tuple type!' | |||||
| assert len(scale_value) == len( | |||||
| self.optimizer.param_groups), ('Size mismatch {} != {}'.format( | |||||
| len(scale_value), len(self.optimizer.param_groups))) | |||||
| for i, group in enumerate(self.optimizer.param_groups): | |||||
| group['lr'] *= scale_value[i] | |||||
| def step(self, epoch=None): | |||||
| """ | |||||
| When ``self.base_scheduler._step_count`` is less than ``self.warmup_iters``, multiply lr by scale | |||||
| """ | |||||
| if self.base_scheduler._step_count > self.warmup_iters: | |||||
| return self.base_scheduler.step(epoch=epoch) | |||||
| for group, lr in zip(self.optimizer.param_groups, self.base_lrs): | |||||
| group['lr'] = lr | |||||
| # `base_scheduler` has done step() at init when build | |||||
| if self._is_init_step: | |||||
| self._is_init_step = False | |||||
| else: | |||||
| self.base_scheduler.step(epoch=epoch) | |||||
| self.scale() | |||||
| @classmethod | |||||
| def get_warmup_scale(self, cur_iter): | |||||
| pass | |||||
| @@ -0,0 +1,79 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from modelscope.trainers.lrscheduler.builder import LR_SCHEDULER | |||||
| from .base import BaseWarmup | |||||
| @LR_SCHEDULER.register_module() | |||||
| class ConstantWarmup(BaseWarmup): | |||||
| """Linear warmup scheduler. | |||||
| Args: | |||||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||||
| warmup_ratio (float): Lr used at warmup stage equals to warmup_ratio * initial_lr | |||||
| warmup_iters (int | list): Warmup iterations | |||||
| last_epoch (int): The index of last epoch. | |||||
| """ | |||||
| def __init__(self, | |||||
| base_scheduler, | |||||
| warmup_iters, | |||||
| warmup_ratio=0.1, | |||||
| last_epoch=-1): | |||||
| self.warmup_ratio = warmup_ratio | |||||
| super(ConstantWarmup, self).__init__( | |||||
| base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||||
| def get_warmup_scale(self, cur_iter): | |||||
| if cur_iter >= self.warmup_iters: | |||||
| return 1.0 | |||||
| return self.warmup_ratio | |||||
| @LR_SCHEDULER.register_module() | |||||
| class LinearWarmup(BaseWarmup): | |||||
| """Linear warmup scheduler. | |||||
| Args: | |||||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||||
| warmup_iters (int | list): Warmup iterations | |||||
| warmup_ratio (float): Lr used at the beginning of warmup equals to warmup_ratio * initial_lr | |||||
| last_epoch (int): The index of last epoch. | |||||
| """ | |||||
| def __init__(self, | |||||
| base_scheduler, | |||||
| warmup_iters, | |||||
| warmup_ratio=0.1, | |||||
| last_epoch=-1): | |||||
| self.warmup_ratio = warmup_ratio | |||||
| super(LinearWarmup, self).__init__( | |||||
| base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||||
| def get_warmup_scale(self, cur_iter): | |||||
| k = (1 - cur_iter / self.warmup_iters) * (1 - self.warmup_ratio) | |||||
| return 1 - k | |||||
| @LR_SCHEDULER.register_module() | |||||
| class ExponentialWarmup(BaseWarmup): | |||||
| """Exponential warmup scheduler. | |||||
| Args: | |||||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||||
| warmup_iters (int | list): Warmup iterations | |||||
| warmup_ratio (float): Lr used at the beginning of warmup equals to warmup_ratio * initial_lr | |||||
| last_epoch (int): The index of last epoch. | |||||
| """ | |||||
| def __init__(self, | |||||
| base_scheduler, | |||||
| warmup_iters, | |||||
| warmup_ratio=0.1, | |||||
| last_epoch=-1): | |||||
| self.warmup_ratio = warmup_ratio | |||||
| super(ExponentialWarmup, self).__init__( | |||||
| base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||||
| def get_warmup_scale(self, cur_iter): | |||||
| k = self.warmup_ratio**(1 - cur_iter / self.warmup_iters) | |||||
| return k | |||||
| @@ -0,0 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .builder import OPTIMIZERS, build_optimizer | |||||
| __all__ = ['OPTIMIZERS', 'build_optimizer'] | |||||
| @@ -0,0 +1,39 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import inspect | |||||
| import torch | |||||
| from modelscope.utils.config import ConfigDict | |||||
| from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||||
| OPTIMIZERS = Registry('optimizer') | |||||
| def build_optimizer(model: torch.nn.Module, | |||||
| cfg: ConfigDict, | |||||
| default_args: dict = None): | |||||
| """ build optimizer from optimizer config dict | |||||
| Args: | |||||
| cfg (:obj:`ConfigDict`): config dict for optimizer object. | |||||
| default_args (dict, optional): Default initialization arguments. | |||||
| """ | |||||
| if hasattr(model, 'module'): | |||||
| model = model.module | |||||
| cfg.params = model.parameters() | |||||
| return build_from_cfg( | |||||
| cfg, OPTIMIZERS, group_key=default_group, default_args=default_args) | |||||
| def register_torch_optimizers(): | |||||
| for name, module in inspect.getmembers(torch.optim): | |||||
| if name.startswith('__'): | |||||
| continue | |||||
| if inspect.isclass(module) and issubclass(module, | |||||
| torch.optim.Optimizer): | |||||
| OPTIMIZERS.register_module( | |||||
| default_group, module_name=name, module_cls=module) | |||||
| register_torch_optimizers() | |||||
| @@ -0,0 +1,703 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os.path | |||||
| import random | |||||
| import time | |||||
| from distutils.version import LooseVersion | |||||
| from functools import partial | |||||
| from typing import Callable, List, Optional, Tuple, Union | |||||
| import numpy as np | |||||
| import torch | |||||
| from addict import Dict | |||||
| from torch import distributed as dist | |||||
| from torch import nn | |||||
| from torch.utils.data import DataLoader, Dataset | |||||
| from torch.utils.data.distributed import DistributedSampler | |||||
| from modelscope.hub.snapshot_download import snapshot_download | |||||
| from modelscope.metrics import build_metric, task_default_metrics | |||||
| from modelscope.models.base import Model | |||||
| from modelscope.models.base_torch import TorchModel | |||||
| from modelscope.msdatasets.ms_dataset import MsDataset | |||||
| from modelscope.preprocessors import build_preprocessor | |||||
| from modelscope.preprocessors.base import Preprocessor | |||||
| from modelscope.task_datasets import TorchTaskDataset, build_task_dataset | |||||
| from modelscope.trainers.hooks.builder import HOOKS | |||||
| from modelscope.trainers.hooks.priority import Priority, get_priority | |||||
| from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||||
| from modelscope.trainers.optimizer.builder import build_optimizer | |||||
| from modelscope.utils.config import ConfigDict | |||||
| from modelscope.utils.constant import Hubs, ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| from modelscope.utils.registry import build_from_cfg | |||||
| from modelscope.utils.tensor_utils import torch_default_data_collator | |||||
| from modelscope.utils.torch_utils import get_dist_info | |||||
| from .base import BaseTrainer | |||||
| from .builder import TRAINERS | |||||
| from .hooks.hook import Hook | |||||
| @TRAINERS.register_module() | |||||
| class EpochBasedTrainer(BaseTrainer): | |||||
| """Epoch based Trainer, a training helper for PyTorch. | |||||
| Args: | |||||
| cfg_file(str): The local config file. | |||||
| model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir | |||||
| or a model id. If model is None, build_model method will be called. | |||||
| data_collator (`Callable`, *optional*): | |||||
| The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. | |||||
| train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*): | |||||
| The dataset to use for training. | |||||
| Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a | |||||
| distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a | |||||
| `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will | |||||
| manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally | |||||
| sets the seed of the RNGs used. | |||||
| eval_dataset (`torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation. | |||||
| preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor. | |||||
| NOTE: If the preprocessor has been called before the dataset fed into this trainer by user's custom code, | |||||
| this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file. | |||||
| Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and | |||||
| this preprocessing action will be executed every time the dataset's __getitem__ is called. | |||||
| optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple | |||||
| containing the optimizer and the scheduler to use. | |||||
| max_epochs: (int, optional): Total training epochs. | |||||
| """ | |||||
| def __init__( | |||||
| self, | |||||
| model: Optional[Union[TorchModel, nn.Module, str]] = None, | |||||
| cfg_file: Optional[str] = None, | |||||
| arg_parse_fn: Optional[Callable] = None, | |||||
| data_collator: Optional[Callable] = None, | |||||
| train_dataset: Optional[Dataset] = None, | |||||
| eval_dataset: Optional[Dataset] = None, | |||||
| preprocessor: Optional[Preprocessor] = None, | |||||
| optimizers: Tuple[torch.optim.Optimizer, | |||||
| torch.optim.lr_scheduler._LRScheduler] = (None, | |||||
| None), | |||||
| **kwargs): | |||||
| if isinstance(model, str): | |||||
| if os.path.exists(model): | |||||
| self.model_dir = model if os.path.isdir( | |||||
| model) else os.path.dirname(model) | |||||
| else: | |||||
| self.model_dir = snapshot_download(model) | |||||
| cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION) | |||||
| self.model = self.build_model() | |||||
| else: | |||||
| assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' | |||||
| assert isinstance( | |||||
| model, | |||||
| (TorchModel, nn.Module | |||||
| )), 'model should be either str, TorchMode or nn.Module.' | |||||
| self.model_dir = os.path.dirname(cfg_file) | |||||
| self.model = model | |||||
| super().__init__(cfg_file, arg_parse_fn) | |||||
| if 'work_dir' in kwargs: | |||||
| self.work_dir = kwargs['work_dir'] | |||||
| else: | |||||
| self.work_dir = self.cfg.train.get('work_dir', './work_dir') | |||||
| self.preprocessor = None | |||||
| if isinstance(preprocessor, Preprocessor): | |||||
| self.preprocessor = preprocessor | |||||
| elif hasattr(self.cfg, 'preprocessor'): | |||||
| self.preprocessor = self.build_preprocessor() | |||||
| # TODO @wenmeng.zwm add data collator option | |||||
| # TODO how to fill device option? | |||||
| self.device = int( | |||||
| os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None | |||||
| self.train_dataset = self.to_task_dataset( | |||||
| train_dataset, mode='train', preprocessor=self.preprocessor) | |||||
| self.eval_dataset = self.to_task_dataset( | |||||
| eval_dataset, mode='eval', preprocessor=self.preprocessor) | |||||
| self.data_collator = data_collator if data_collator is not None else torch_default_data_collator | |||||
| self.metrics = self.get_metrics() | |||||
| self.optimizers = optimizers | |||||
| self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) | |||||
| self._mode = 'train' | |||||
| self._hooks: List[Hook] = [] | |||||
| self._epoch = 0 | |||||
| self._iter = 0 | |||||
| self._inner_iter = 0 | |||||
| if 'max_epochs' not in kwargs: | |||||
| assert hasattr( | |||||
| self.cfg.train, | |||||
| 'max_epochs'), 'max_epochs is missing in configuration file' | |||||
| self._max_epochs = self.cfg.train.max_epochs | |||||
| else: | |||||
| self._max_epochs = kwargs['max_epochs'] | |||||
| # TODO @wenmeng.zwm add seed init fn | |||||
| self._seed = 0 | |||||
| self._dist = get_dist_info()[1] > 1 | |||||
| @property | |||||
| def mode(self): | |||||
| return self._mode | |||||
| @property | |||||
| def hooks(self) -> List[Hook]: | |||||
| """list[:obj:`Hook`]: A list of registered hooks.""" | |||||
| return self._hooks | |||||
| @property | |||||
| def epoch(self) -> int: | |||||
| """int: Current epoch.""" | |||||
| return self._epoch | |||||
| @property | |||||
| def iter(self) -> int: | |||||
| """int: Current iteration.""" | |||||
| return self._iter | |||||
| @property | |||||
| def inner_iter(self) -> int: | |||||
| """int: Iteration in an epoch.""" | |||||
| return self._inner_iter | |||||
| @property | |||||
| def max_epochs(self): | |||||
| """int: Maximum training epochs.""" | |||||
| return self._max_epochs | |||||
| @property | |||||
| def max_iters(self): | |||||
| """int: Maximum training iterations.""" | |||||
| return self._max_epochs * len(self.data_loader) | |||||
| def to_task_dataset(self, | |||||
| datasets: Tuple[Dataset, List[Dataset]], | |||||
| mode: str, | |||||
| preprocessor: Optional[Preprocessor] = None): | |||||
| """Build the task specific dataset processor for this trainer. | |||||
| Returns: The task dataset processor for the task. If no result for the very model-type and task, | |||||
| the default TaskDataset will be returned. | |||||
| """ | |||||
| try: | |||||
| if not datasets: | |||||
| return datasets | |||||
| if isinstance(datasets, TorchTaskDataset): | |||||
| return datasets | |||||
| task_dataset = build_task_dataset( | |||||
| ConfigDict({ | |||||
| **self.cfg.model, | |||||
| 'mode': mode, | |||||
| 'preprocessor': preprocessor, | |||||
| 'datasets': datasets, | |||||
| }), getattr(self.cfg, 'task', None)) | |||||
| return task_dataset | |||||
| except Exception: | |||||
| if isinstance(datasets, (List, Tuple)) or preprocessor is not None: | |||||
| return TorchTaskDataset( | |||||
| datasets, | |||||
| mode=mode, | |||||
| preprocessor=preprocessor, | |||||
| **(self.cfg.model if hasattr(self.cfg, 'model') else {})) | |||||
| else: | |||||
| return datasets | |||||
| def build_preprocessor(self) -> Preprocessor: | |||||
| """Build the preprocessor. | |||||
| User can override this method to implement custom logits. | |||||
| Returns: The preprocessor instance. | |||||
| """ | |||||
| # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor | |||||
| # when they are different ones in training and evaluation | |||||
| cfg = ConfigDict({ | |||||
| **getattr(self.cfg, 'preprocessor'), 'model_dir': | |||||
| self.model_dir | |||||
| }) | |||||
| return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||||
| def get_metrics(self) -> List[str]: | |||||
| """Get the metric class types. | |||||
| The first choice will be the metrics configured in the config file, if not found, the default metrics will be | |||||
| used. | |||||
| If no metrics is found and the eval dataset exists, the method will raise an error. | |||||
| Returns: The metric types. | |||||
| """ | |||||
| metrics = self.cfg.evaluation.metrics if hasattr( | |||||
| self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, | |||||
| 'metrics') else None | |||||
| metrics = metrics if metrics is not None else task_default_metrics.get( | |||||
| self.cfg.task) | |||||
| if metrics is None and self.eval_dataset is not None: | |||||
| raise ValueError( | |||||
| f'Metrics are needed in evaluation, please try to either ' | |||||
| f'add metrics in configuration.json or add the default metric for {self.cfg.task}.' | |||||
| ) | |||||
| if isinstance(metrics, str): | |||||
| metrics = [metrics] | |||||
| return metrics | |||||
| def train(self, *args, **kwargs): | |||||
| self.model.train() | |||||
| self._mode = 'train' | |||||
| if self.train_dataset is None: | |||||
| self.train_dataloader = self.get_train_dataloader() | |||||
| else: | |||||
| self.train_dataloader = self._build_dataloader_with_dataset( | |||||
| self.train_dataset, **self.cfg.train.get('dataloader', {})) | |||||
| self.data_loader = self.train_dataloader | |||||
| self.register_optimizers_hook() | |||||
| self.register_hook_from_cfg(self.cfg.train.hooks) | |||||
| self.train_loop(self.train_dataloader) | |||||
| def evaluate(self, checkpoint_path=None): | |||||
| self.model.eval() | |||||
| self._mode = 'val' | |||||
| if self.eval_dataset is None: | |||||
| self.eval_dataloader = self.get_eval_data_loader() | |||||
| else: | |||||
| self.eval_dataloader = self._build_dataloader_with_dataset( | |||||
| self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) | |||||
| self.data_loader = self.eval_dataloader | |||||
| metric_classes = [build_metric(metric) for metric in self.metrics] | |||||
| self.evaluation_loop(self.eval_dataloader, checkpoint_path, | |||||
| metric_classes) | |||||
| metric_values = {} | |||||
| for metric_cls in metric_classes: | |||||
| metric_values.update(metric_cls.evaluate()) | |||||
| return metric_values | |||||
| def build_model(self) -> Union[nn.Module, TorchModel]: | |||||
| """ Instantiate a pytorch model and return. | |||||
| By default, we will create a model using config from configuration file. You can | |||||
| subclass and override this method in a subclass. | |||||
| """ | |||||
| # TODO temp implementation, waiting for @zhangzhicheng | |||||
| model = Model.from_pretrained(self.model_dir) | |||||
| if not isinstance(model, nn.Module) and hasattr(model, 'model'): | |||||
| return model.model | |||||
| def collate_fn(self, data): | |||||
| """Prepare the input just before the forward function. | |||||
| This method will move the tensors to the right device. | |||||
| Usually this method does not need to be overridden. | |||||
| Args: | |||||
| data: The data out of the dataloader. | |||||
| Returns: The processed data. | |||||
| """ | |||||
| if isinstance(data, dict): | |||||
| return type(data)({k: self.collate_fn(v) for k, v in data.items()}) | |||||
| elif isinstance(data, (tuple, np.ndarray, list)): | |||||
| return type(data)(self.collate_fn(v) for v in data) | |||||
| elif isinstance(data, torch.Tensor) and self.device is not None: | |||||
| kwargs = dict(device=self.device) | |||||
| return data.to(**kwargs) | |||||
| return data | |||||
| def train_step(self, model, inputs): | |||||
| """ Perform a training step on a batch of inputs. | |||||
| Subclass and override to inject custom behavior. | |||||
| Args: | |||||
| model (`TorchModel`): The model to train. | |||||
| inputs (`Dict[str, Union[torch.Tensor, Any]]`): | |||||
| The inputs and targets of the model. | |||||
| The dictionary will be unpacked before being fed to the model. Most models expect the targets under the | |||||
| argument `labels`. Check your model's documentation for all accepted arguments. | |||||
| Return: | |||||
| `torch.Tensor`: The tensor with training loss on this batch. | |||||
| """ | |||||
| # EvaluationHook will do evaluate and change mode to val, return to train mode | |||||
| # TODO: find more pretty way to change mode | |||||
| model.train() | |||||
| self._mode = 'train' | |||||
| inputs = self.collate_fn(inputs) | |||||
| if isinstance(inputs, dict): | |||||
| train_outputs = model.forward(**inputs) | |||||
| else: | |||||
| train_outputs = model.forward(inputs) | |||||
| if not isinstance(train_outputs, dict): | |||||
| raise TypeError( | |||||
| '"model.train_step()" and "model.val_step()" must return a dict' | |||||
| ) | |||||
| # add model output info to log | |||||
| if 'log_vars' not in train_outputs: | |||||
| default_keys_pattern = ['loss'] | |||||
| match_keys = set([]) | |||||
| for key_p in default_keys_pattern: | |||||
| match_keys.update( | |||||
| [key for key in train_outputs.keys() if key_p in key]) | |||||
| log_vars = {} | |||||
| for key in match_keys: | |||||
| value = train_outputs.get(key, None) | |||||
| if value is not None: | |||||
| if dist.is_available() and dist.is_initialized(): | |||||
| value = value.data.clone() | |||||
| dist.all_reduce(value.div_(dist.get_world_size())) | |||||
| log_vars.update({key: value.item()}) | |||||
| self.log_buffer.update(log_vars) | |||||
| else: | |||||
| self.log_buffer.update(train_outputs['log_vars']) | |||||
| self.train_outputs = train_outputs | |||||
| def prediction_step(self, model, inputs): | |||||
| """ Perform forward step by `model` using `inputs`. | |||||
| Args: | |||||
| model (`TorchModel`): The model to evaluate. | |||||
| inputs (`Dict[str, Union[torch.Tensor, Any]]`): | |||||
| The inputs and targets of the model. | |||||
| The dictionary will be unpacked before being fed to the model. Most models expect the targets under the | |||||
| argument `labels`. Check your model's documentation for all accepted arguments. | |||||
| prediction_loss_only (`bool`): | |||||
| Whether or not to return the loss only. | |||||
| ignore_keys (`Lst[str]`, *optional*): | |||||
| A list of keys in the output of your model (if it is a dictionary) that should be ignored when | |||||
| gathering predictions. | |||||
| Return: | |||||
| Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, | |||||
| logits and labels (each being optional). | |||||
| """ | |||||
| raise NotImplementedError | |||||
| def get_train_dataloader(self): | |||||
| """ Builder torch dataloader for training. | |||||
| We provide a reasonable default that works well. If you want to use something else, you can change | |||||
| the config for data.train in configuration file, or subclass and override this method | |||||
| (or `get_train_dataloader` in a subclass. | |||||
| """ | |||||
| train_data = self.cfg.dataset.train | |||||
| if self.train_dataset is None: | |||||
| self.train_dataset = self.build_dataset(train_data, mode='train') | |||||
| data_loader = self._build_dataloader_with_dataset( | |||||
| self.train_dataset, **self.cfg.train.get('dataloader', {})) | |||||
| return data_loader | |||||
| def get_eval_data_loader(self): | |||||
| """ Builder torch dataloader for evaluation. | |||||
| We provide a reasonable default that works well. If you want to use something else, you can change | |||||
| the config for dataset.eval in configuration file, or subclass and override this method in a subclass. | |||||
| pass | |||||
| """ | |||||
| val_data = self.cfg.dataset.val | |||||
| if self.eval_dataset is None: | |||||
| self.eval_dataset = self.build_dataset(val_data, mode='eval') | |||||
| batch_size = self.cfg.evaluation.batch_size | |||||
| workers = self.cfg.evaluation.workers | |||||
| shuffle = self.cfg.evaluation.get('shuffle', False) | |||||
| data_loader = self._build_dataloader_with_dataset( | |||||
| self.eval_dataset, | |||||
| batch_size_per_gpu=batch_size, | |||||
| workers_per_gpu=workers, | |||||
| shuffle=shuffle, | |||||
| dist=self._dist, | |||||
| seed=self._seed, | |||||
| persistent_workers=True, | |||||
| ) | |||||
| return data_loader | |||||
| def build_dataset(self, data_cfg, mode): | |||||
| """ Build torch dataset object using data config | |||||
| """ | |||||
| dataset = MsDataset.load( | |||||
| dataset_name=data_cfg.name, | |||||
| split=data_cfg.split, | |||||
| subset_name=data_cfg.subset_name if hasattr( | |||||
| data_cfg, 'subset_name') else None, | |||||
| hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | |||||
| ) | |||||
| torch_dataset = dataset.to_torch_dataset( | |||||
| preprocessors=self.preprocessor, ) | |||||
| dataset = self.to_task_dataset( | |||||
| torch_dataset, mode, preprocessor=self.preprocessor) | |||||
| return dataset | |||||
| def create_optimizer_and_scheduler(self): | |||||
| """ Create optimizer and lr scheduler | |||||
| We provide a default implementation, if you want to customize your own optimizer | |||||
| and lr scheduler, you can either pass a tuple through trainer init function or | |||||
| subclass this class and override this method. | |||||
| """ | |||||
| optimizer, lr_scheduler = self.optimizers | |||||
| if optimizer is None: | |||||
| optimizer_cfg = self.cfg.train.get('optimizer', None) | |||||
| else: | |||||
| optimizer_cfg = None | |||||
| optim_options = {} | |||||
| if optimizer_cfg is not None: | |||||
| optim_options = optimizer_cfg.pop('options', {}) | |||||
| optimizer = build_optimizer(self.model, cfg=optimizer_cfg) | |||||
| if lr_scheduler is None: | |||||
| lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) | |||||
| else: | |||||
| lr_scheduler_cfg = None | |||||
| lr_options = {} | |||||
| if lr_scheduler_cfg is not None: | |||||
| assert optimizer is not None | |||||
| lr_options = lr_scheduler_cfg.pop('options', {}) | |||||
| lr_scheduler = build_lr_scheduler( | |||||
| cfg=lr_scheduler_cfg, default_args={'optimizer': optimizer}) | |||||
| self.optimizer = optimizer | |||||
| self.lr_scheduler = lr_scheduler | |||||
| return self.optimizer, self.lr_scheduler, optim_options, lr_options | |||||
| def register_optimizers_hook(self): | |||||
| """ Register optimizer hook and lr scheduler hook. | |||||
| """ | |||||
| optimizer, lr_scheduler = self.optimizers | |||||
| opti_error_msg = 'optimizers should be a tuple of `torch.optim.Optimizer`'\ | |||||
| ' and `torch.optim.lr_scheduler._LRScheduler`' | |||||
| if optimizer is not None: | |||||
| assert isinstance(optimizer, torch.optim.Optimizer), opti_error_msg | |||||
| if lr_scheduler is not None: | |||||
| assert isinstance( | |||||
| lr_scheduler, | |||||
| torch.optim.lr_scheduler._LRScheduler), opti_error_msg | |||||
| _, _, optim_options, lr_options = self.create_optimizer_and_scheduler() | |||||
| lr_hook = dict(type='LrSchedulerHook', **lr_options) | |||||
| optim_hook = dict(type='OptimizerHook', **optim_options) | |||||
| self.register_hook_from_cfg([lr_hook, optim_hook]) | |||||
| def _build_dataloader_with_dataset(self, | |||||
| dataset: Dataset, | |||||
| batch_size_per_gpu: int, | |||||
| workers_per_gpu: int, | |||||
| dist: bool = False, | |||||
| shuffle: bool = True, | |||||
| seed: int = 0, | |||||
| persistent_workers=False, | |||||
| **kwargs) -> DataLoader: | |||||
| """Build dataloader using input dataset and cfg. Used by `EpochBasedTrainer.train()` | |||||
| and `EpochBasedTrainer.evaluate()`. | |||||
| In distributed training, each GPU/process has a dataloader. | |||||
| In non-distributed training, there is only one dataloader for all GPUs. | |||||
| Args: | |||||
| dataset (Dataset): A PyTorch dataset. | |||||
| batch_size_per_gpu (int): Number of training samples on each GPU, i.e., | |||||
| batch size of each GPU. | |||||
| workers_per_gpu (int): How many subprocesses to use for data loading | |||||
| for each GPU. | |||||
| dist (bool): Distributed training/test or not. Default: True. | |||||
| shuffle (bool): Whether to shuffle the data at every epoch. | |||||
| Default: True. | |||||
| seed (int, Optional): Seed to be used. Default: 0. | |||||
| runner_type (str): Type of runner. Default: `EpochBasedRunner` | |||||
| persistent_workers (bool): If True, the data loader will not shutdown | |||||
| the worker processes after a dataset has been consumed once. | |||||
| This allows to maintain the workers `Dataset` instances alive. | |||||
| This argument is only valid when PyTorch>=1.7.0. Default: False. | |||||
| kwargs: any keyword argument to be used to initialize DataLoader | |||||
| Returns: | |||||
| DataLoader: A PyTorch dataloader. | |||||
| """ | |||||
| rank, world_size = get_dist_info() | |||||
| if dist: | |||||
| # When model is :obj:`DistributedDataParallel`, | |||||
| # `batch_size` of :obj:`dataloader` is the | |||||
| # number of training samples on each GPU. | |||||
| batch_size = batch_size_per_gpu | |||||
| num_workers = workers_per_gpu | |||||
| else: | |||||
| batch_size = batch_size_per_gpu | |||||
| num_workers = workers_per_gpu | |||||
| if dist: | |||||
| sampler = DistributedSampler( | |||||
| dataset, world_size, rank, shuffle=shuffle, seed=seed) | |||||
| else: | |||||
| sampler = None | |||||
| batch_sampler = None | |||||
| init_fn = partial( | |||||
| worker_init_fn, num_workers=num_workers, rank=rank, | |||||
| seed=seed) if seed is not None else None | |||||
| if LooseVersion(torch.__version__) >= LooseVersion('1.7.0'): | |||||
| kwargs['persistent_workers'] = persistent_workers | |||||
| elif persistent_workers is True: | |||||
| self.logger.warning( | |||||
| 'persistent_workers is invalid because your pytorch ' | |||||
| 'version is lower than 1.7.0') | |||||
| data_loader = DataLoader( | |||||
| dataset, | |||||
| batch_size=batch_size, | |||||
| sampler=sampler, | |||||
| num_workers=num_workers, | |||||
| batch_sampler=batch_sampler, | |||||
| collate_fn=self.data_collator, | |||||
| pin_memory=kwargs.pop('pin_memory', False), | |||||
| worker_init_fn=init_fn, | |||||
| **kwargs) | |||||
| return data_loader | |||||
| def train_loop(self, data_loader): | |||||
| """ Training loop used by `EpochBasedTrainer.train()` | |||||
| """ | |||||
| self.invoke_hook('before_run') | |||||
| self._epoch = 0 | |||||
| kwargs = {} | |||||
| for _ in range(self._epoch, self._max_epochs): | |||||
| self.invoke_hook('before_train_epoch') | |||||
| time.sleep(2) # Prevent possible deadlock during epoch transition | |||||
| for i, data_batch in enumerate(data_loader): | |||||
| self.data_batch = data_batch | |||||
| self._inner_iter = i | |||||
| self.invoke_hook('before_train_iter') | |||||
| self.train_step(self.model, data_batch, **kwargs) | |||||
| self.invoke_hook('after_train_iter') | |||||
| del self.data_batch | |||||
| self._iter += 1 | |||||
| self.invoke_hook('after_train_epoch') | |||||
| self._epoch += 1 | |||||
| time.sleep(1) # wait for some hooks like loggers to finish | |||||
| self.invoke_hook('after_run') | |||||
| def evaluation_loop(self, data_loader, checkpoint_path, metric_classes): | |||||
| """ Evaluation loop used by `EpochBasedTrainer.evaluate()`. | |||||
| """ | |||||
| if self._dist: | |||||
| from modelscope.trainers.utils.inference import multi_gpu_test | |||||
| multi_gpu_test( | |||||
| self.model, | |||||
| data_loader, | |||||
| tmpdir=None, | |||||
| gpu_collect=False, | |||||
| data_collate_fn=self.collate_fn, | |||||
| metric_classes=metric_classes) | |||||
| else: | |||||
| from modelscope.trainers.utils.inference import single_gpu_test | |||||
| single_gpu_test( | |||||
| self.model, | |||||
| data_loader, | |||||
| data_collate_fn=self.collate_fn, | |||||
| metric_classes=metric_classes) | |||||
| def register_hook(self, hook: Hook) -> None: | |||||
| """Register a hook into the hook list. | |||||
| The hook will be inserted into a priority queue, with the specified | |||||
| priority (See :class:`Priority` for details of priorities). | |||||
| For hooks with the same priority, they will be triggered in the same | |||||
| order as they are registered. | |||||
| Args: | |||||
| hook (:obj:`Hook`): The hook to be registered. | |||||
| """ | |||||
| assert isinstance(hook, Hook) | |||||
| # insert the hook to a sorted list | |||||
| inserted = False | |||||
| for i in range(len(self._hooks) - 1, -1, -1): | |||||
| if get_priority(hook.PRIORITY) > get_priority( | |||||
| self._hooks[i].PRIORITY): | |||||
| self._hooks.insert(i + 1, hook) | |||||
| inserted = True | |||||
| break | |||||
| if not inserted: | |||||
| self._hooks.insert(0, hook) | |||||
| def register_hook_from_cfg(self, hook_cfg: Dict) -> None: | |||||
| """Register a hook from its cfg. | |||||
| Args: | |||||
| hook_cfg (dict): Hook config. It should have at least keys 'type' | |||||
| and 'priority' indicating its type and priority. | |||||
| Note: | |||||
| The specific hook class to register should not use 'type' and | |||||
| 'priority' arguments during initialization. | |||||
| """ | |||||
| hook_cfg = hook_cfg.copy() | |||||
| assert isinstance(hook_cfg, list) | |||||
| for cfg_i in hook_cfg: | |||||
| hook = build_from_cfg(cfg_i, HOOKS) | |||||
| self.register_hook(hook) | |||||
| def invoke_hook(self, fn_name: str) -> None: | |||||
| """Call all hooks. | |||||
| Args: | |||||
| fn_name (str): The function name in each hook to be called, such as | |||||
| "before_train_epoch". | |||||
| """ | |||||
| for hook in self._hooks: | |||||
| getattr(hook, fn_name)(self) | |||||
| def get_hook_info(self) -> str: | |||||
| # Get hooks info in each stage | |||||
| stage_hook_map: Dict[str, list] = {stage: [] for stage in Hook.stages} | |||||
| for hook in self.hooks: | |||||
| try: | |||||
| priority = Priority(hook.priority).name # type: ignore | |||||
| except ValueError: | |||||
| priority = hook.priority # type: ignore | |||||
| classname = hook.__class__.__name__ | |||||
| hook_info = f'({priority:<12}) {classname:<35}' | |||||
| for trigger_stage in hook.get_triggered_stages(): | |||||
| stage_hook_map[trigger_stage].append(hook_info) | |||||
| stage_hook_infos = [] | |||||
| for stage in Hook.stages: | |||||
| hook_infos = stage_hook_map[stage] | |||||
| if len(hook_infos) > 0: | |||||
| info = f'{stage}:\n' | |||||
| info += '\n'.join(hook_infos) | |||||
| info += '\n -------------------- ' | |||||
| stage_hook_infos.append(info) | |||||
| return '\n'.join(stage_hook_infos) | |||||
| def worker_init_fn(worker_id, num_workers, rank, seed): | |||||
| # The seed of each worker equals to | |||||
| # num_worker * rank + worker_id + user_seed | |||||
| worker_seed = num_workers * rank + worker_id + seed | |||||
| np.random.seed(worker_seed) | |||||
| random.seed(worker_seed) | |||||
| torch.manual_seed(worker_seed) | |||||
| @@ -0,0 +1,208 @@ | |||||
| # Copyright (c) OpenMMLab. All rights reserved. | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| import pickle | |||||
| import shutil | |||||
| import tempfile | |||||
| import time | |||||
| import torch | |||||
| from torch import distributed as dist | |||||
| from tqdm import tqdm | |||||
| from modelscope.utils.torch_utils import get_dist_info | |||||
| def single_gpu_test(model, | |||||
| data_loader, | |||||
| data_collate_fn=None, | |||||
| metric_classes=None): | |||||
| """Test model with a single gpu. | |||||
| Args: | |||||
| data_collate_fn: An optional data_collate_fn before fed into the model | |||||
| model (nn.Module): Model to be tested. | |||||
| data_loader (nn.Dataloader): Pytorch data loader. | |||||
| metric_classes(List): List of Metric class that uses to collect metrics | |||||
| Returns: | |||||
| list: The prediction results. | |||||
| """ | |||||
| model.eval() | |||||
| dataset = data_loader.dataset | |||||
| with tqdm(total=len(dataset), desc='test samples') as pbar: | |||||
| for data in data_loader: | |||||
| if data_collate_fn is not None: | |||||
| data = data_collate_fn(data) | |||||
| with torch.no_grad(): | |||||
| result = model(**data) | |||||
| if metric_classes is not None: | |||||
| for metric_cls in metric_classes: | |||||
| metric_cls.add(result, data) | |||||
| batch_size = len(result) | |||||
| for _ in range(batch_size): | |||||
| pbar.update() | |||||
| def multi_gpu_test(model, | |||||
| data_loader, | |||||
| tmpdir=None, | |||||
| gpu_collect=False, | |||||
| data_collate_fn=None, | |||||
| metric_classes=None): | |||||
| """Test model with multiple gpus. | |||||
| This method tests model with multiple gpus and collects the results | |||||
| under two different modes: gpu and cpu modes. By setting | |||||
| ``gpu_collect=True``, it encodes results to gpu tensors and use gpu | |||||
| communication for results collection. On cpu mode it saves the results on | |||||
| different gpus to ``tmpdir`` and collects them by the rank 0 worker. | |||||
| Args: | |||||
| model (nn.Module): Model to be tested. | |||||
| data_loader (nn.Dataloader): Pytorch data loader. | |||||
| data_collate_fn: An optional data_collate_fn before fed into the model | |||||
| tmpdir (str): Path of directory to save the temporary results from | |||||
| different gpus under cpu mode. | |||||
| gpu_collect (bool): Option to use either gpu or cpu to collect results. | |||||
| metric_classes(List): List of Metric class that uses to collect metrics | |||||
| Returns: | |||||
| list: The prediction results. | |||||
| """ | |||||
| model.eval() | |||||
| results = [] | |||||
| dataset = data_loader.dataset | |||||
| time.sleep(2) # This line can prevent deadlock problem in some cases. | |||||
| count = 0 | |||||
| with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: | |||||
| for _, data in enumerate(data_loader): | |||||
| if data_collate_fn is not None: | |||||
| data = data_collate_fn(data) | |||||
| with torch.no_grad(): | |||||
| result = model(**data) | |||||
| results.extend(result) | |||||
| rank, world_size = get_dist_info() | |||||
| if rank == 0: | |||||
| batch_size = len(result) | |||||
| batch_size_all = batch_size * world_size | |||||
| count += batch_size_all | |||||
| if count > len(dataset): | |||||
| batch_size_all = len(dataset) - (count - batch_size_all) | |||||
| for _ in range(batch_size_all): | |||||
| pbar.update() | |||||
| # collect results from all ranks | |||||
| if gpu_collect: | |||||
| results = collect_results_gpu(results, len(dataset)) | |||||
| else: | |||||
| results = collect_results_cpu(results, len(dataset), tmpdir) | |||||
| ground_truths = [dataset[i] for i in range(len(dataset))] | |||||
| if metric_classes is not None: | |||||
| for metric_cls in metric_classes: | |||||
| metric_cls.add(results, ground_truths) | |||||
| def collect_results_cpu(result_part, size, tmpdir=None): | |||||
| """Collect results under cpu mode. | |||||
| On cpu mode, this function will save the results on different gpus to | |||||
| ``tmpdir`` and collect them by the rank 0 worker. | |||||
| Args: | |||||
| result_part (list): Result list containing result parts | |||||
| to be collected. | |||||
| size (int): Size of the results, commonly equal to length of | |||||
| the results. | |||||
| tmpdir (str | None): temporal directory for collected results to | |||||
| store. If set to None, it will create a random temporal directory | |||||
| for it. | |||||
| Returns: | |||||
| list: The collected results. | |||||
| """ | |||||
| rank, world_size = get_dist_info() | |||||
| # TODO create a random tmp dir if it is not specified | |||||
| if tmpdir is None: | |||||
| tmpdir = tempfile.gettempdir() | |||||
| if not os.path.exists(tmpdir): | |||||
| os.makedirs(tmpdir) | |||||
| # dump the part result to the dir | |||||
| pickle.dump(result_part, os.path.join(tmpdir, f'part_{rank}.pkl')) | |||||
| dist.barrier() | |||||
| # collect all parts | |||||
| if rank != 0: | |||||
| return None | |||||
| else: | |||||
| # load results of all parts from tmp dir | |||||
| part_list = [] | |||||
| for i in range(world_size): | |||||
| part_file = os.path.join(tmpdir, f'part_{i}.pkl') | |||||
| part_result = pickle.load(part_file) | |||||
| # When data is severely insufficient, an empty part_result | |||||
| # on a certain gpu could makes the overall outputs empty. | |||||
| if part_result: | |||||
| part_list.append(part_result) | |||||
| # sort the results | |||||
| ordered_results = [] | |||||
| for res in zip(*part_list): | |||||
| ordered_results.extend(list(res)) | |||||
| # the dataloader may pad some samples | |||||
| ordered_results = ordered_results[:size] | |||||
| # remove tmp dir | |||||
| shutil.rmtree(tmpdir) | |||||
| return ordered_results | |||||
| def collect_results_gpu(result_part, size): | |||||
| """Collect results under gpu mode. | |||||
| On gpu mode, this function will encode results to gpu tensors and use gpu | |||||
| communication for results collection. | |||||
| Args: | |||||
| result_part (list): Result list containing result parts | |||||
| to be collected. | |||||
| size (int): Size of the results, commonly equal to length of | |||||
| the results. | |||||
| Returns: | |||||
| list: The collected results. | |||||
| """ | |||||
| rank, world_size = get_dist_info() | |||||
| # dump result part to tensor with pickle | |||||
| part_tensor = torch.tensor( | |||||
| bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') | |||||
| # gather all result part tensor shape | |||||
| shape_tensor = torch.tensor(part_tensor.shape, device='cuda') | |||||
| shape_list = [shape_tensor.clone() for _ in range(world_size)] | |||||
| dist.all_gather(shape_list, shape_tensor) | |||||
| # padding result part tensor to max length | |||||
| shape_max = torch.tensor(shape_list).max() | |||||
| part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') | |||||
| part_send[:shape_tensor[0]] = part_tensor | |||||
| part_recv_list = [ | |||||
| part_tensor.new_zeros(shape_max) for _ in range(world_size) | |||||
| ] | |||||
| # gather all result part | |||||
| dist.all_gather(part_recv_list, part_send) | |||||
| if rank == 0: | |||||
| part_list = [] | |||||
| for recv, shape in zip(part_recv_list, shape_list): | |||||
| part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) | |||||
| # When data is severely insufficient, an empty part_result | |||||
| # on a certain gpu could makes the overall outputs empty. | |||||
| if part_result: | |||||
| part_list.append(part_result) | |||||
| # sort the results | |||||
| ordered_results = [] | |||||
| for res in zip(*part_list): | |||||
| ordered_results.extend(list(res)) | |||||
| # the dataloader may pad some samples | |||||
| ordered_results = ordered_results[:size] | |||||
| return ordered_results | |||||
| @@ -0,0 +1,42 @@ | |||||
| # Copyright (c) OpenMMLab. All rights reserved. | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from collections import OrderedDict | |||||
| import numpy as np | |||||
| class LogBuffer: | |||||
| def __init__(self): | |||||
| self.val_history = OrderedDict() | |||||
| self.n_history = OrderedDict() | |||||
| self.output = OrderedDict() | |||||
| self.ready = False | |||||
| def clear(self) -> None: | |||||
| self.val_history.clear() | |||||
| self.n_history.clear() | |||||
| self.clear_output() | |||||
| def clear_output(self) -> None: | |||||
| self.output.clear() | |||||
| self.ready = False | |||||
| def update(self, vars: dict, count: int = 1) -> None: | |||||
| assert isinstance(vars, dict) | |||||
| for key, var in vars.items(): | |||||
| if key not in self.val_history: | |||||
| self.val_history[key] = [] | |||||
| self.n_history[key] = [] | |||||
| self.val_history[key].append(var) | |||||
| self.n_history[key].append(count) | |||||
| def average(self, n: int = 0) -> None: | |||||
| """Average latest n values or all values.""" | |||||
| assert n >= 0 | |||||
| for key in self.val_history: | |||||
| values = np.array(self.val_history[key][-n:]) | |||||
| nums = np.array(self.n_history[key][-n:]) | |||||
| avg = np.sum(values * nums) / np.sum(nums) | |||||
| self.output[key] = avg | |||||
| self.ready = True | |||||
| @@ -0,0 +1,74 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import io | |||||
| import time | |||||
| from collections import OrderedDict | |||||
| from typing import Optional | |||||
| import torch | |||||
| from torch.optim import Optimizer | |||||
| from modelscope import __version__ | |||||
| from modelscope.fileio import File | |||||
| def weights_to_cpu(state_dict): | |||||
| """Copy a model state_dict to cpu. | |||||
| Args: | |||||
| state_dict (OrderedDict): Model weights on GPU. | |||||
| Returns: | |||||
| OrderedDict: Model weights on GPU. | |||||
| """ | |||||
| state_dict_cpu = OrderedDict() | |||||
| for key, val in state_dict.items(): | |||||
| state_dict_cpu[key] = val.cpu() | |||||
| # Keep metadata in state_dict | |||||
| state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) | |||||
| return state_dict_cpu | |||||
| def save_checkpoint(model: torch.nn.Module, | |||||
| filename: str, | |||||
| optimizer: Optional[Optimizer] = None, | |||||
| meta: Optional[dict] = None) -> None: | |||||
| """Save checkpoint to file. | |||||
| The checkpoint will have 3 fields: ``meta``, ``state_dict`` and | |||||
| ``optimizer``. By default ``meta`` will contain version and time info. | |||||
| Args: | |||||
| model (Module): Module whose params are to be saved. | |||||
| filename (str): Checkpoint filename. | |||||
| optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. | |||||
| meta (dict, optional): Metadata to be saved in checkpoint. | |||||
| """ | |||||
| if meta is None: | |||||
| meta = {} | |||||
| elif not isinstance(meta, dict): | |||||
| raise TypeError(f'meta must be a dict or None, but got {type(meta)}') | |||||
| meta.update(modescope=__version__, time=time.asctime()) | |||||
| if isinstance(model, torch.nn.parallel.DistributedDataParallel): | |||||
| model = model.module | |||||
| if hasattr(model, 'CLASSES') and model.CLASSES is not None: | |||||
| # save class name to the meta | |||||
| meta.update(CLASSES=model.CLASSES) | |||||
| checkpoint = { | |||||
| 'meta': meta, | |||||
| 'state_dict': weights_to_cpu(model.state_dict()) | |||||
| } | |||||
| # save optimizer state dict in the checkpoint | |||||
| if isinstance(optimizer, Optimizer): | |||||
| checkpoint['optimizer'] = optimizer.state_dict() | |||||
| elif isinstance(optimizer, dict): | |||||
| checkpoint['optimizer'] = {} | |||||
| for name, optim in optimizer.items(): | |||||
| checkpoint['optimizer'][name] = optim.state_dict() | |||||
| with io.BytesIO() as f: | |||||
| torch.save(checkpoint, f) | |||||
| File.write(f.getvalue(), filename) | |||||
| @@ -13,12 +13,7 @@ class Fields(object): | |||||
| multi_modal = 'multi-modal' | multi_modal = 'multi-modal' | ||||
| class Tasks(object): | |||||
| """ Names for tasks supported by modelscope. | |||||
| Holds the standard task name to use for identifying different tasks. | |||||
| This should be used to register models, pipelines, trainers. | |||||
| """ | |||||
| class CVTasks(object): | |||||
| # vision tasks | # vision tasks | ||||
| image_to_text = 'image-to-text' | image_to_text = 'image-to-text' | ||||
| pose_estimation = 'pose-estimation' | pose_estimation = 'pose-estimation' | ||||
| @@ -33,6 +28,8 @@ class Tasks(object): | |||||
| action_recognition = 'action-recognition' | action_recognition = 'action-recognition' | ||||
| video_embedding = 'video-embedding' | video_embedding = 'video-embedding' | ||||
| class NLPTasks(object): | |||||
| # nlp tasks | # nlp tasks | ||||
| word_segmentation = 'word-segmentation' | word_segmentation = 'word-segmentation' | ||||
| nli = 'nli' | nli = 'nli' | ||||
| @@ -56,11 +53,15 @@ class Tasks(object): | |||||
| question_answering = 'question-answering' | question_answering = 'question-answering' | ||||
| zero_shot_classification = 'zero-shot-classification' | zero_shot_classification = 'zero-shot-classification' | ||||
| class AudioTasks(object): | |||||
| # audio tasks | # audio tasks | ||||
| auto_speech_recognition = 'auto-speech-recognition' | auto_speech_recognition = 'auto-speech-recognition' | ||||
| text_to_speech = 'text-to-speech' | text_to_speech = 'text-to-speech' | ||||
| speech_signal_process = 'speech-signal-process' | speech_signal_process = 'speech-signal-process' | ||||
| class MultiModalTasks(object): | |||||
| # multi-modal tasks | # multi-modal tasks | ||||
| image_captioning = 'image-captioning' | image_captioning = 'image-captioning' | ||||
| visual_grounding = 'visual-grounding' | visual_grounding = 'visual-grounding' | ||||
| @@ -69,6 +70,47 @@ class Tasks(object): | |||||
| visual_question_answering = 'visual-question-answering' | visual_question_answering = 'visual-question-answering' | ||||
| class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): | |||||
| """ Names for tasks supported by modelscope. | |||||
| Holds the standard task name to use for identifying different tasks. | |||||
| This should be used to register models, pipelines, trainers. | |||||
| """ | |||||
| reverse_field_index = {} | |||||
| @staticmethod | |||||
| def find_field_by_task(task_name): | |||||
| if len(Tasks.reverse_field_index) == 0: | |||||
| # Lazy init, not thread safe | |||||
| field_dict = { | |||||
| Fields.cv: [ | |||||
| getattr(Tasks, attr) for attr in dir(CVTasks) | |||||
| if not attr.startswith('__') | |||||
| ], | |||||
| Fields.nlp: [ | |||||
| getattr(Tasks, attr) for attr in dir(NLPTasks) | |||||
| if not attr.startswith('__') | |||||
| ], | |||||
| Fields.audio: [ | |||||
| getattr(Tasks, attr) for attr in dir(AudioTasks) | |||||
| if not attr.startswith('__') | |||||
| ], | |||||
| Fields.multi_modal: [ | |||||
| getattr(Tasks, attr) for attr in dir(MultiModalTasks) | |||||
| if not attr.startswith('__') | |||||
| ], | |||||
| } | |||||
| for field, tasks in field_dict.items(): | |||||
| for task in tasks: | |||||
| if task in Tasks.reverse_field_index: | |||||
| raise ValueError(f'Duplicate task: {task}') | |||||
| Tasks.reverse_field_index[task] = field | |||||
| return Tasks.reverse_field_index.get(task_name) | |||||
| class InputFields(object): | class InputFields(object): | ||||
| """ Names for input data fields in the input data for pipelines | """ Names for input data fields in the input data for pipelines | ||||
| """ | """ | ||||
| @@ -100,6 +142,7 @@ class ModelFile(object): | |||||
| TF_CKPT_PREFIX = 'ckpt-' | TF_CKPT_PREFIX = 'ckpt-' | ||||
| TORCH_MODEL_FILE = 'pytorch_model.pt' | TORCH_MODEL_FILE = 'pytorch_model.pt' | ||||
| TORCH_MODEL_BIN_FILE = 'pytorch_model.bin' | TORCH_MODEL_BIN_FILE = 'pytorch_model.bin' | ||||
| LABEL_MAPPING = 'label_mapping.json' | |||||
| class Requirements(object): | class Requirements(object): | ||||
| @@ -86,3 +86,16 @@ def get_model_type(model_dir): | |||||
| return cfg.model_type if hasattr(cfg, 'model_type') else None | return cfg.model_type if hasattr(cfg, 'model_type') else None | ||||
| except Exception as e: | except Exception as e: | ||||
| logger.error(f'parse config file failed with error: {e}') | logger.error(f'parse config file failed with error: {e}') | ||||
| def parse_label_mapping(model_dir): | |||||
| import os | |||||
| import json | |||||
| label2id = None | |||||
| label_path = os.path.join(model_dir, ModelFile.LABEL_MAPPING) | |||||
| if os.path.exists(label_path): | |||||
| with open(label_path) as f: | |||||
| label_mapping = json.load(f) | |||||
| label2id = {name: idx for name, idx in label_mapping.items()} | |||||
| return label2id | |||||
| @@ -54,6 +54,38 @@ def import_modules_from_file(py_file: str): | |||||
| return module_name, mod | return module_name, mod | ||||
| def is_method_overridden(method, base_class, derived_class): | |||||
| """Check if a method of base class is overridden in derived class. | |||||
| Args: | |||||
| method (str): the method name to check. | |||||
| base_class (type): the class of the base class. | |||||
| derived_class (type | Any): the class or instance of the derived class. | |||||
| """ | |||||
| assert isinstance(base_class, type), \ | |||||
| "base_class doesn't accept instance, Please pass class instead." | |||||
| if not isinstance(derived_class, type): | |||||
| derived_class = derived_class.__class__ | |||||
| base_method = getattr(base_class, method) | |||||
| derived_method = getattr(derived_class, method) | |||||
| return derived_method != base_method | |||||
| def has_method(obj: object, method: str) -> bool: | |||||
| """Check whether the object has a method. | |||||
| Args: | |||||
| method (str): The method name to check. | |||||
| obj (object): The object to check. | |||||
| Returns: | |||||
| bool: True if the object has the method else False. | |||||
| """ | |||||
| return hasattr(obj, method) and callable(getattr(obj, method)) | |||||
| def import_modules(imports, allow_failed_imports=False): | def import_modules(imports, allow_failed_imports=False): | ||||
| """Import modules from the given list of strings. | """Import modules from the given list of strings. | ||||
| @@ -0,0 +1,77 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # Part of the implementation is borrowed from huggingface/transformers. | |||||
| def torch_nested_numpify(tensors): | |||||
| import torch | |||||
| "Numpify `tensors` (even if it's a nested list/tuple of tensors)." | |||||
| if isinstance(tensors, (list, tuple)): | |||||
| return type(tensors)(torch_nested_numpify(t) for t in tensors) | |||||
| if isinstance(tensors, torch.Tensor): | |||||
| t = tensors.cpu() | |||||
| return t.numpy() | |||||
| return tensors | |||||
| def torch_nested_detach(tensors): | |||||
| import torch | |||||
| "Detach `tensors` (even if it's a nested list/tuple of tensors)." | |||||
| if isinstance(tensors, (list, tuple)): | |||||
| return type(tensors)(torch_nested_detach(t) for t in tensors) | |||||
| if isinstance(tensors, torch.Tensor): | |||||
| return tensors.detach() | |||||
| return tensors | |||||
| def torch_default_data_collator(features): | |||||
| # TODO @jiangnana.jnn refine this default data collator | |||||
| import torch | |||||
| # if not isinstance(features[0], (dict, BatchEncoding)): | |||||
| # features = [vars(f) for f in features] | |||||
| first = features[0] | |||||
| if isinstance(first, dict): | |||||
| batch = {} | |||||
| # Special handling for labels. | |||||
| # Ensure that tensor is created with the correct type | |||||
| # (it should be automatically the case, but let's make sure of it.) | |||||
| if 'label' in first and first['label'] is not None: | |||||
| label = first['label'].item() if isinstance( | |||||
| first['label'], torch.Tensor) else first['label'] | |||||
| dtype = torch.long if isinstance(label, int) else torch.float | |||||
| batch['labels'] = torch.tensor([f['label'] for f in features], | |||||
| dtype=dtype) | |||||
| elif 'label_ids' in first and first['label_ids'] is not None: | |||||
| if isinstance(first['label_ids'], torch.Tensor): | |||||
| batch['labels'] = torch.stack( | |||||
| [f['label_ids'] for f in features]) | |||||
| else: | |||||
| dtype = torch.long if type( | |||||
| first['label_ids'][0]) is int else torch.float | |||||
| batch['labels'] = torch.tensor( | |||||
| [f['label_ids'] for f in features], dtype=dtype) | |||||
| # Handling of all other possible keys. | |||||
| # Again, we will use the first element to figure out which key/values are not None for this model. | |||||
| for k, v in first.items(): | |||||
| if k not in ('label', 'label_ids' | |||||
| ) and v is not None and not isinstance(v, str): | |||||
| if isinstance(v, torch.Tensor): | |||||
| batch[k] = torch.stack([f[k] for f in features]) | |||||
| else: | |||||
| batch[k] = torch.tensor([f[k] for f in features]) | |||||
| elif isinstance(first, tuple): | |||||
| batch = [] | |||||
| for idx in range(len(first)): | |||||
| if isinstance(first[idx], torch.Tensor): | |||||
| batch.append(torch.stack([f[k] for f in features])) | |||||
| else: | |||||
| batch.append(torch.tensor([f[k] for f in features])) | |||||
| else: | |||||
| if isinstance(first, torch.Tensor): | |||||
| batch = torch.stack(features) | |||||
| else: | |||||
| batch = torch.tensor(features) | |||||
| return batch | |||||
| @@ -0,0 +1,127 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # Following code is partialy borrowed from openmmlab/mmcv | |||||
| import functools | |||||
| import os | |||||
| import socket | |||||
| import subprocess | |||||
| from collections import OrderedDict | |||||
| from typing import Callable, List, Optional, Tuple | |||||
| import torch | |||||
| import torch.multiprocessing as mp | |||||
| from torch import distributed as dist | |||||
| from torch._utils import (_flatten_dense_tensors, _take_tensors, | |||||
| _unflatten_dense_tensors) | |||||
| def _find_free_port() -> str: | |||||
| # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 | |||||
| sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | |||||
| # Binding to port 0 will cause the OS to find an available port for us | |||||
| sock.bind(('', 0)) | |||||
| port = sock.getsockname()[1] | |||||
| sock.close() | |||||
| # NOTE: there is still a chance the port could be taken by other processes. | |||||
| return port | |||||
| def _is_free_port(port: int) -> bool: | |||||
| ips = socket.gethostbyname_ex(socket.gethostname())[-1] | |||||
| ips.append('localhost') | |||||
| with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: | |||||
| return all(s.connect_ex((ip, port)) != 0 for ip in ips) | |||||
| def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None: | |||||
| if mp.get_start_method(allow_none=True) is None: | |||||
| mp.set_start_method('spawn') | |||||
| if launcher == 'pytorch': | |||||
| _init_dist_pytorch(backend, **kwargs) | |||||
| elif launcher == 'mpi': | |||||
| _init_dist_mpi(backend, **kwargs) | |||||
| elif launcher == 'slurm': | |||||
| _init_dist_slurm(backend, **kwargs) | |||||
| else: | |||||
| raise ValueError(f'Invalid launcher type: {launcher}') | |||||
| def _init_dist_pytorch(backend: str, **kwargs) -> None: | |||||
| # rank = int(os.environ['RANK']) | |||||
| local_rank = int(os.environ['LOCAL_RANK']) | |||||
| torch.cuda.set_device(local_rank) | |||||
| dist.init_process_group(backend=backend, **kwargs) | |||||
| def _init_dist_mpi(backend: str, **kwargs) -> None: | |||||
| local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) | |||||
| torch.cuda.set_device(local_rank) | |||||
| if 'MASTER_PORT' not in os.environ: | |||||
| # 29500 is torch.distributed default port | |||||
| os.environ['MASTER_PORT'] = '29500' | |||||
| if 'MASTER_ADDR' not in os.environ: | |||||
| raise KeyError('The environment variable MASTER_ADDR is not set') | |||||
| os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] | |||||
| os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] | |||||
| dist.init_process_group(backend=backend, **kwargs) | |||||
| def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None: | |||||
| """Initialize slurm distributed training environment. | |||||
| If argument ``port`` is not specified, then the master port will be system | |||||
| environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system | |||||
| environment variable, then a default port ``29500`` will be used. | |||||
| Args: | |||||
| backend (str): Backend of torch.distributed. | |||||
| port (int, optional): Master port. Defaults to None. | |||||
| """ | |||||
| proc_id = int(os.environ['SLURM_PROCID']) | |||||
| ntasks = int(os.environ['SLURM_NTASKS']) | |||||
| node_list = os.environ['SLURM_NODELIST'] | |||||
| num_gpus = torch.cuda.device_count() | |||||
| torch.cuda.set_device(proc_id % num_gpus) | |||||
| addr = subprocess.getoutput( | |||||
| f'scontrol show hostname {node_list} | head -n1') | |||||
| # specify master port | |||||
| if port is not None: | |||||
| os.environ['MASTER_PORT'] = str(port) | |||||
| elif 'MASTER_PORT' in os.environ: | |||||
| pass # use MASTER_PORT in the environment variable | |||||
| else: | |||||
| # if torch.distributed default port(29500) is available | |||||
| # then use it, else find a free port | |||||
| if _is_free_port(29500): | |||||
| os.environ['MASTER_PORT'] = '29500' | |||||
| else: | |||||
| os.environ['MASTER_PORT'] = str(_find_free_port()) | |||||
| # use MASTER_ADDR in the environment variable if it already exists | |||||
| if 'MASTER_ADDR' not in os.environ: | |||||
| os.environ['MASTER_ADDR'] = addr | |||||
| os.environ['WORLD_SIZE'] = str(ntasks) | |||||
| os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) | |||||
| os.environ['RANK'] = str(proc_id) | |||||
| dist.init_process_group(backend=backend) | |||||
| def get_dist_info() -> Tuple[int, int]: | |||||
| if dist.is_available() and dist.is_initialized(): | |||||
| rank = dist.get_rank() | |||||
| world_size = dist.get_world_size() | |||||
| else: | |||||
| rank = 0 | |||||
| world_size = 1 | |||||
| return rank, world_size | |||||
| def master_only(func: Callable) -> Callable: | |||||
| @functools.wraps(func) | |||||
| def wrapper(*args, **kwargs): | |||||
| rank, _ = get_dist_info() | |||||
| if rank == 0: | |||||
| return func(*args, **kwargs) | |||||
| return wrapper | |||||
| @@ -1,3 +0,0 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:b4153d9ffc0b72eeaf162b5c9f4426f95dcea2bb0da9e7b5e1b72fd2643b1915 | |||||
| size 50444 | |||||
| @@ -0,0 +1,60 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import unittest | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from modelscope.models.base_torch import TorchModel | |||||
| class TorchBaseTest(unittest.TestCase): | |||||
| def test_custom_model(self): | |||||
| class MyTorchModel(TorchModel): | |||||
| def __init__(self): | |||||
| super().__init__() | |||||
| self.conv1 = nn.Conv2d(1, 20, 5) | |||||
| self.conv2 = nn.Conv2d(20, 20, 5) | |||||
| def forward(self, x): | |||||
| x = F.relu(self.conv1(x)) | |||||
| return F.relu(self.conv2(x)) | |||||
| model = MyTorchModel() | |||||
| model.train() | |||||
| model.eval() | |||||
| out = model.forward(torch.rand(1, 1, 10, 10)) | |||||
| self.assertEqual((1, 20, 2, 2), out.shape) | |||||
| def test_custom_model_with_postprocess(self): | |||||
| add_bias = 200 | |||||
| class MyTorchModel(TorchModel): | |||||
| def __init__(self): | |||||
| super().__init__() | |||||
| self.conv1 = nn.Conv2d(1, 20, 5) | |||||
| self.conv2 = nn.Conv2d(20, 20, 5) | |||||
| def forward(self, x): | |||||
| x = F.relu(self.conv1(x)) | |||||
| return F.relu(self.conv2(x)) | |||||
| def postprocess(self, x): | |||||
| return x + add_bias | |||||
| model = MyTorchModel() | |||||
| model.train() | |||||
| model.eval() | |||||
| out = model(torch.rand(1, 1, 10, 10)) | |||||
| self.assertEqual((1, 20, 2, 2), out.shape) | |||||
| self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10))) | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -6,9 +6,9 @@ from typing import Any, Dict, List, Tuple, Union | |||||
| import numpy as np | import numpy as np | ||||
| import PIL | import PIL | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines import Pipeline, pipeline | from modelscope.pipelines import Pipeline, pipeline | ||||
| from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info | from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from modelscope.utils.registry import default_group | from modelscope.utils.registry import default_group | ||||
| @@ -2,8 +2,8 @@ | |||||
| import unittest | import unittest | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines import pipeline | from modelscope.pipelines import pipeline | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| @@ -7,8 +7,8 @@ import cv2 | |||||
| from modelscope.fileio import File | from modelscope.fileio import File | ||||
| from modelscope.msdatasets import MsDataset | from modelscope.msdatasets import MsDataset | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines import pipeline | from modelscope.pipelines import pipeline | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.constant import ModelFile, Tasks | from modelscope.utils.constant import ModelFile, Tasks | ||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| @@ -5,9 +5,9 @@ import unittest | |||||
| import cv2 | import cv2 | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines import pipeline | from modelscope.pipelines import pipeline | ||||
| from modelscope.pipelines.base import Pipeline | from modelscope.pipelines.base import Pipeline | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| @@ -5,8 +5,8 @@ import unittest | |||||
| import numpy as np | import numpy as np | ||||
| from modelscope.models import Model | from modelscope.models import Model | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines import pipeline | from modelscope.pipelines import pipeline | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| @@ -9,8 +9,8 @@ from scipy.io.wavfile import write | |||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| from modelscope.models import Model | from modelscope.models import Model | ||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines import pipeline | from modelscope.pipelines import pipeline | ||||
| from modelscope.pipelines.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Fields, Tasks | from modelscope.utils.constant import Fields, Tasks | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| @@ -0,0 +1,108 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| import shutil | |||||
| import tempfile | |||||
| import unittest | |||||
| from abc import ABCMeta | |||||
| import json | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.utils.data import Dataset | |||||
| from modelscope.trainers import build_trainer | |||||
| from modelscope.utils.constant import ModelFile | |||||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||||
| def __len__(self): | |||||
| return 20 | |||||
| def __getitem__(self, idx): | |||||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||||
| class DummyModel(nn.Module): | |||||
| def __init__(self): | |||||
| super().__init__() | |||||
| self.linear = nn.Linear(5, 4) | |||||
| self.bn = nn.BatchNorm1d(4) | |||||
| def forward(self, feat, labels): | |||||
| x = self.linear(feat) | |||||
| x = self.bn(x) | |||||
| loss = torch.sum(x) | |||||
| return dict(logits=x, loss=loss) | |||||
| class CheckpointHookTest(unittest.TestCase): | |||||
| def setUp(self): | |||||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||||
| if not os.path.exists(self.tmp_dir): | |||||
| os.makedirs(self.tmp_dir) | |||||
| def tearDown(self): | |||||
| super().tearDown() | |||||
| shutil.rmtree(self.tmp_dir) | |||||
| def test_checkpoint_hook(self): | |||||
| json_cfg = { | |||||
| 'task': 'image_classification', | |||||
| 'train': { | |||||
| 'work_dir': self.tmp_dir, | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1 | |||||
| }, | |||||
| 'optimizer': { | |||||
| 'type': 'SGD', | |||||
| 'lr': 0.01, | |||||
| 'options': { | |||||
| 'grad_clip': { | |||||
| 'max_norm': 2.0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| 'lr_scheduler': { | |||||
| 'type': 'StepLR', | |||||
| 'step_size': 2, | |||||
| 'options': { | |||||
| 'warmup': { | |||||
| 'type': 'LinearWarmup', | |||||
| 'warmup_iters': 2 | |||||
| } | |||||
| } | |||||
| }, | |||||
| 'hooks': [{ | |||||
| 'type': 'CheckpointHook', | |||||
| 'interval': 1 | |||||
| }] | |||||
| } | |||||
| } | |||||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||||
| with open(config_path, 'w') as f: | |||||
| json.dump(json_cfg, f) | |||||
| trainer_name = 'EpochBasedTrainer' | |||||
| kwargs = dict( | |||||
| cfg_file=config_path, | |||||
| model=DummyModel(), | |||||
| data_collator=None, | |||||
| train_dataset=DummyDataset(), | |||||
| max_epochs=2) | |||||
| trainer = build_trainer(trainer_name, kwargs) | |||||
| trainer.train() | |||||
| results_files = os.listdir(self.tmp_dir) | |||||
| self.assertIn('epoch_1.pth', results_files) | |||||
| self.assertIn('epoch_2.pth', results_files) | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -0,0 +1,188 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| import shutil | |||||
| import tempfile | |||||
| import unittest | |||||
| from abc import ABCMeta | |||||
| import json | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.optim import SGD | |||||
| from torch.optim.lr_scheduler import MultiStepLR | |||||
| from torch.utils.data import Dataset | |||||
| from modelscope.trainers import build_trainer | |||||
| from modelscope.utils.constant import ModelFile | |||||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||||
| """Base Dataset | |||||
| """ | |||||
| def __len__(self): | |||||
| return 10 | |||||
| def __getitem__(self, idx): | |||||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||||
| class DummyModel(nn.Module): | |||||
| def __init__(self): | |||||
| super().__init__() | |||||
| self.linear = nn.Linear(5, 4) | |||||
| self.bn = nn.BatchNorm1d(4) | |||||
| def forward(self, feat, labels): | |||||
| x = self.linear(feat) | |||||
| x = self.bn(x) | |||||
| loss = torch.sum(x) | |||||
| return dict(logits=x, loss=loss) | |||||
| class LrSchedulerHookTest(unittest.TestCase): | |||||
| def setUp(self): | |||||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||||
| if not os.path.exists(self.tmp_dir): | |||||
| os.makedirs(self.tmp_dir) | |||||
| def tearDown(self): | |||||
| super().tearDown() | |||||
| shutil.rmtree(self.tmp_dir) | |||||
| def test_lr_scheduler_hook(self): | |||||
| json_cfg = { | |||||
| 'task': 'image_classification', | |||||
| 'train': { | |||||
| 'work_dir': self.tmp_dir, | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1 | |||||
| } | |||||
| } | |||||
| } | |||||
| config_path = os.path.join(self.tmp_dir, 'config.json') | |||||
| with open(config_path, 'w') as f: | |||||
| json.dump(json_cfg, f) | |||||
| model = DummyModel() | |||||
| optimizer = SGD(model.parameters(), lr=0.01) | |||||
| lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) | |||||
| trainer_name = 'EpochBasedTrainer' | |||||
| kwargs = dict( | |||||
| cfg_file=config_path, | |||||
| model=model, | |||||
| train_dataset=DummyDataset(), | |||||
| optimizers=(optimizer, lr_scheduler), | |||||
| max_epochs=5) | |||||
| trainer = build_trainer(trainer_name, kwargs) | |||||
| train_dataloader = trainer._build_dataloader_with_dataset( | |||||
| trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||||
| trainer.register_optimizers_hook() | |||||
| trainer.invoke_hook('before_run') | |||||
| log_lrs = [] | |||||
| optim_lrs = [] | |||||
| for _ in range(trainer._epoch, trainer._max_epochs): | |||||
| trainer.invoke_hook('before_train_epoch') | |||||
| for _, data_batch in enumerate(train_dataloader): | |||||
| trainer.invoke_hook('before_train_iter') | |||||
| log_lrs.append(trainer.log_buffer.output['lr']) | |||||
| optim_lrs.append(optimizer.param_groups[0]['lr']) | |||||
| trainer.train_step(trainer.model, data_batch) | |||||
| trainer.invoke_hook('after_train_iter') | |||||
| trainer.invoke_hook('after_train_epoch') | |||||
| trainer._epoch += 1 | |||||
| trainer.invoke_hook('after_run') | |||||
| iters = 5 | |||||
| target_lrs = [0.01] * iters * 1 + [0.001] * iters * 2 + [0.0001 | |||||
| ] * iters * 2 | |||||
| self.assertListEqual(log_lrs, target_lrs) | |||||
| self.assertListEqual(optim_lrs, target_lrs) | |||||
| def test_warmup_lr_scheduler_hook(self): | |||||
| json_cfg = { | |||||
| 'task': 'image_classification', | |||||
| 'train': { | |||||
| 'work_dir': self.tmp_dir, | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1 | |||||
| }, | |||||
| 'optimizer': { | |||||
| 'type': 'SGD', | |||||
| 'lr': 0.01 | |||||
| }, | |||||
| 'lr_scheduler': { | |||||
| 'type': 'MultiStepLR', | |||||
| 'milestones': [4, 6], | |||||
| 'options': { | |||||
| 'warmup': { | |||||
| 'type': 'LinearWarmup', | |||||
| 'warmup_iters': 3 | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||||
| with open(config_path, 'w') as f: | |||||
| json.dump(json_cfg, f) | |||||
| model = DummyModel() | |||||
| # optimmizer = SGD(model.parameters(), lr=0.01) | |||||
| # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4]) | |||||
| trainer_name = 'EpochBasedTrainer' | |||||
| kwargs = dict( | |||||
| cfg_file=config_path, | |||||
| model=model, | |||||
| train_dataset=DummyDataset(), | |||||
| # optimizers=(optimmizer, lr_scheduler), | |||||
| max_epochs=7) | |||||
| trainer = build_trainer(trainer_name, kwargs) | |||||
| train_dataloader = trainer._build_dataloader_with_dataset( | |||||
| trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||||
| trainer.register_optimizers_hook() | |||||
| trainer.invoke_hook('before_run') | |||||
| log_lrs = [] | |||||
| optim_lrs = [] | |||||
| for _ in range(trainer._epoch, trainer._max_epochs): | |||||
| trainer.invoke_hook('before_train_epoch') | |||||
| for _, data_batch in enumerate(train_dataloader): | |||||
| trainer.invoke_hook('before_train_iter') | |||||
| log_lrs.append(round(trainer.log_buffer.output['lr'], 5)) | |||||
| optim_lrs.append( | |||||
| round(trainer.optimizer.param_groups[0]['lr'], 5)) | |||||
| trainer.train_step(trainer.model, data_batch) | |||||
| trainer.invoke_hook('after_train_iter') | |||||
| trainer.invoke_hook('after_train_epoch') | |||||
| trainer.invoke_hook('after_run') | |||||
| iters = 5 | |||||
| target_lrs = [0.004] * iters * 1 + [0.007] * iters * 1 + [ | |||||
| 0.01 | |||||
| ] * iters * 1 + [0.001] * iters * 2 + [0.0001] * iters * 2 | |||||
| self.assertListEqual(log_lrs, target_lrs) | |||||
| self.assertListEqual(optim_lrs, target_lrs) | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -0,0 +1,128 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| import shutil | |||||
| import tempfile | |||||
| import unittest | |||||
| from abc import ABCMeta | |||||
| import json | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.optim import SGD | |||||
| from torch.optim.lr_scheduler import MultiStepLR | |||||
| from torch.utils.data import Dataset | |||||
| from modelscope.trainers import build_trainer | |||||
| from modelscope.utils.constant import ModelFile | |||||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||||
| """Base Dataset | |||||
| """ | |||||
| def __len__(self): | |||||
| return 10 | |||||
| def __getitem__(self, idx): | |||||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||||
| class DummyModel(nn.Module): | |||||
| def __init__(self): | |||||
| super().__init__() | |||||
| self.linear = nn.Linear(5, 4) | |||||
| self.bn = nn.BatchNorm1d(4) | |||||
| def forward(self, feat, labels): | |||||
| x = self.linear(feat) | |||||
| x = self.bn(x) | |||||
| loss = torch.sum(x) | |||||
| return dict(logits=x, loss=loss) | |||||
| class IterTimerHookTest(unittest.TestCase): | |||||
| def setUp(self): | |||||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||||
| if not os.path.exists(self.tmp_dir): | |||||
| os.makedirs(self.tmp_dir) | |||||
| def tearDown(self): | |||||
| super().tearDown() | |||||
| shutil.rmtree(self.tmp_dir) | |||||
| def test_iter_time_hook(self): | |||||
| json_cfg = { | |||||
| 'task': 'image_classification', | |||||
| 'train': { | |||||
| 'work_dir': self.tmp_dir, | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1 | |||||
| }, | |||||
| 'hooks': [{ | |||||
| 'type': 'IterTimerHook', | |||||
| }] | |||||
| } | |||||
| } | |||||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||||
| with open(config_path, 'w') as f: | |||||
| json.dump(json_cfg, f) | |||||
| model = DummyModel() | |||||
| optimizer = SGD(model.parameters(), lr=0.01) | |||||
| lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) | |||||
| trainer_name = 'EpochBasedTrainer' | |||||
| kwargs = dict( | |||||
| cfg_file=config_path, | |||||
| model=model, | |||||
| train_dataset=DummyDataset(), | |||||
| optimizers=(optimizer, lr_scheduler), | |||||
| max_epochs=5) | |||||
| trainer = build_trainer(trainer_name, kwargs) | |||||
| train_dataloader = trainer._build_dataloader_with_dataset( | |||||
| trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||||
| trainer.register_optimizers_hook() | |||||
| trainer.register_hook_from_cfg(trainer.cfg.train.hooks) | |||||
| trainer.invoke_hook('before_run') | |||||
| for i in range(trainer._epoch, trainer._max_epochs): | |||||
| trainer.invoke_hook('before_train_epoch') | |||||
| for _, data_batch in enumerate(train_dataloader): | |||||
| trainer.invoke_hook('before_train_iter') | |||||
| trainer.train_step(trainer.model, data_batch) | |||||
| trainer.invoke_hook('after_train_iter') | |||||
| self.assertIn('data_load_time', trainer.log_buffer.val_history) | |||||
| self.assertIn('time', trainer.log_buffer.val_history) | |||||
| self.assertIn('loss', trainer.log_buffer.val_history) | |||||
| trainer.invoke_hook('after_train_epoch') | |||||
| target_len = 5 * (i + 1) | |||||
| self.assertEqual( | |||||
| len(trainer.log_buffer.val_history['data_load_time']), | |||||
| target_len) | |||||
| self.assertEqual( | |||||
| len(trainer.log_buffer.val_history['time']), target_len) | |||||
| self.assertEqual( | |||||
| len(trainer.log_buffer.val_history['loss']), target_len) | |||||
| self.assertEqual( | |||||
| len(trainer.log_buffer.n_history['data_load_time']), | |||||
| target_len) | |||||
| self.assertEqual( | |||||
| len(trainer.log_buffer.n_history['time']), target_len) | |||||
| self.assertEqual( | |||||
| len(trainer.log_buffer.n_history['loss']), target_len) | |||||
| trainer.invoke_hook('after_run') | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -0,0 +1,79 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import unittest | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.optim.lr_scheduler import MultiStepLR | |||||
| class WarmupTest(unittest.TestCase): | |||||
| def setUp(self): | |||||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||||
| def test_constant_warmup(self): | |||||
| from modelscope.trainers.lrscheduler.warmup import ConstantWarmup | |||||
| net = nn.Linear(2, 2) | |||||
| base_lr = 0.02 | |||||
| warmup_iters = 3 | |||||
| warmup_ratio = 0.2 | |||||
| optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||||
| lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||||
| lr_scheduler_with_warmup = ConstantWarmup( | |||||
| lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||||
| res = [] | |||||
| for _ in range(10): | |||||
| lr_scheduler_with_warmup.step() | |||||
| for _, group in enumerate(optimizer.param_groups): | |||||
| res.append(group['lr']) | |||||
| base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||||
| self.assertListEqual(res, [0.004, 0.004, 0.02] + base_lrs) | |||||
| def test_linear_warmup(self): | |||||
| from modelscope.trainers.lrscheduler.warmup import LinearWarmup | |||||
| net = nn.Linear(2, 2) | |||||
| base_lr = 0.02 | |||||
| warmup_iters = 3 | |||||
| warmup_ratio = 0.1 | |||||
| optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||||
| lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||||
| lr_scheduler_with_warmup = LinearWarmup( | |||||
| lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||||
| res = [] | |||||
| for _ in range(10): | |||||
| lr_scheduler_with_warmup.step() | |||||
| for _, group in enumerate(optimizer.param_groups): | |||||
| res.append(round(group['lr'], 5)) | |||||
| base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||||
| self.assertListEqual(res, [0.0080, 0.0140, 0.02] + base_lrs) | |||||
| def test_exp_warmup(self): | |||||
| from modelscope.trainers.lrscheduler.warmup import ExponentialWarmup | |||||
| net = nn.Linear(2, 2) | |||||
| base_lr = 0.02 | |||||
| warmup_iters = 3 | |||||
| warmup_ratio = 0.1 | |||||
| optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||||
| lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||||
| lr_scheduler_with_warmup = ExponentialWarmup( | |||||
| lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||||
| res = [] | |||||
| for _ in range(10): | |||||
| lr_scheduler_with_warmup.step() | |||||
| for _, group in enumerate(optimizer.param_groups): | |||||
| res.append(round(group['lr'], 5)) | |||||
| base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||||
| self.assertListEqual(res, [0.00431, 0.00928, 0.02] + base_lrs) | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -0,0 +1,209 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| import shutil | |||||
| import tempfile | |||||
| import unittest | |||||
| from abc import ABCMeta | |||||
| import json | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.optim import SGD | |||||
| from torch.optim.lr_scheduler import StepLR | |||||
| from torch.utils.data import Dataset | |||||
| from modelscope.trainers import build_trainer | |||||
| from modelscope.utils.constant import ModelFile | |||||
| from modelscope.utils.test_utils import test_level | |||||
| class DummyMetric: | |||||
| def __call__(self, ground_truth, predict_results): | |||||
| return {'accuracy': 0.5} | |||||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||||
| """Base Dataset | |||||
| """ | |||||
| def __len__(self): | |||||
| return 20 | |||||
| def __getitem__(self, idx): | |||||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||||
| class DummyModel(nn.Module): | |||||
| def __init__(self): | |||||
| super().__init__() | |||||
| self.linear = nn.Linear(5, 4) | |||||
| self.bn = nn.BatchNorm1d(4) | |||||
| def forward(self, feat, labels): | |||||
| x = self.linear(feat) | |||||
| x = self.bn(x) | |||||
| loss = torch.sum(x) | |||||
| return dict(logits=x, loss=loss) | |||||
| class TrainerTest(unittest.TestCase): | |||||
| def setUp(self): | |||||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||||
| if not os.path.exists(self.tmp_dir): | |||||
| os.makedirs(self.tmp_dir) | |||||
| def tearDown(self): | |||||
| super().tearDown() | |||||
| shutil.rmtree(self.tmp_dir) | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||||
| def test_train_0(self): | |||||
| json_cfg = { | |||||
| 'train': { | |||||
| 'work_dir': | |||||
| self.tmp_dir, | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1 | |||||
| }, | |||||
| 'optimizer': { | |||||
| 'type': 'SGD', | |||||
| 'lr': 0.01, | |||||
| 'options': { | |||||
| 'grad_clip': { | |||||
| 'max_norm': 2.0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| 'lr_scheduler': { | |||||
| 'type': 'StepLR', | |||||
| 'step_size': 2, | |||||
| 'options': { | |||||
| 'warmup': { | |||||
| 'type': 'LinearWarmup', | |||||
| 'warmup_iters': 2 | |||||
| } | |||||
| } | |||||
| }, | |||||
| 'hooks': [{ | |||||
| 'type': 'CheckpointHook', | |||||
| 'interval': 1 | |||||
| }, { | |||||
| 'type': 'TextLoggerHook', | |||||
| 'interval': 1 | |||||
| }, { | |||||
| 'type': 'IterTimerHook' | |||||
| }, { | |||||
| 'type': 'EvaluationHook', | |||||
| 'interval': 1 | |||||
| }] | |||||
| }, | |||||
| 'evaluation': { | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1, | |||||
| 'shuffle': False | |||||
| }, | |||||
| 'metrics': ['seq_cls_metric'] | |||||
| } | |||||
| } | |||||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||||
| with open(config_path, 'w') as f: | |||||
| json.dump(json_cfg, f) | |||||
| trainer_name = 'EpochBasedTrainer' | |||||
| kwargs = dict( | |||||
| cfg_file=config_path, | |||||
| model=DummyModel(), | |||||
| data_collator=None, | |||||
| train_dataset=DummyDataset(), | |||||
| eval_dataset=DummyDataset(), | |||||
| max_epochs=3) | |||||
| trainer = build_trainer(trainer_name, kwargs) | |||||
| trainer.train() | |||||
| results_files = os.listdir(self.tmp_dir) | |||||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||||
| self.assertIn('epoch_1.pth', results_files) | |||||
| self.assertIn('epoch_2.pth', results_files) | |||||
| self.assertIn('epoch_3.pth', results_files) | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||||
| def test_train_1(self): | |||||
| json_cfg = { | |||||
| 'train': { | |||||
| 'work_dir': | |||||
| self.tmp_dir, | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1 | |||||
| }, | |||||
| 'hooks': [{ | |||||
| 'type': 'CheckpointHook', | |||||
| 'interval': 1 | |||||
| }, { | |||||
| 'type': 'TextLoggerHook', | |||||
| 'interval': 1 | |||||
| }, { | |||||
| 'type': 'IterTimerHook' | |||||
| }, { | |||||
| 'type': 'EvaluationHook', | |||||
| 'interval': 1 | |||||
| }] | |||||
| }, | |||||
| 'evaluation': { | |||||
| 'dataloader': { | |||||
| 'batch_size_per_gpu': 2, | |||||
| 'workers_per_gpu': 1, | |||||
| 'shuffle': False | |||||
| }, | |||||
| 'metrics': ['seq_cls_metric'] | |||||
| } | |||||
| } | |||||
| config_path = os.path.join(self.tmp_dir, 'config.json') | |||||
| with open(config_path, 'w') as f: | |||||
| json.dump(json_cfg, f) | |||||
| model = DummyModel() | |||||
| optimmizer = SGD(model.parameters(), lr=0.01) | |||||
| lr_scheduler = StepLR(optimmizer, 2) | |||||
| trainer_name = 'EpochBasedTrainer' | |||||
| kwargs = dict( | |||||
| cfg_file=config_path, | |||||
| model=model, | |||||
| data_collator=None, | |||||
| train_dataset=DummyDataset(), | |||||
| eval_dataset=DummyDataset(), | |||||
| optimizers=(optimmizer, lr_scheduler), | |||||
| max_epochs=3) | |||||
| trainer = build_trainer(trainer_name, kwargs) | |||||
| trainer.train() | |||||
| results_files = os.listdir(self.tmp_dir) | |||||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||||
| self.assertIn('epoch_1.pth', results_files) | |||||
| self.assertIn('epoch_2.pth', results_files) | |||||
| self.assertIn('epoch_3.pth', results_files) | |||||
| class DummyTrainerTest(unittest.TestCase): | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||||
| def test_dummy(self): | |||||
| default_args = dict(cfg_file='configs/examples/train.json') | |||||
| trainer = build_trainer('dummy', default_args) | |||||
| trainer.train() | |||||
| trainer.evaluate() | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -1,19 +0,0 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import unittest | |||||
| from modelscope.trainers import build_trainer | |||||
| class DummyTrainerTest(unittest.TestCase): | |||||
| def test_dummy(self): | |||||
| default_args = dict(cfg_file='configs/examples/train.json') | |||||
| trainer = build_trainer('dummy', default_args) | |||||
| trainer.train() | |||||
| trainer.evaluate() | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -0,0 +1,91 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| import shutil | |||||
| import tempfile | |||||
| import unittest | |||||
| from modelscope.hub.snapshot_download import snapshot_download | |||||
| from modelscope.models.nlp.sbert_for_sequence_classification import \ | |||||
| SbertTextClassfier | |||||
| from modelscope.msdatasets import MsDataset | |||||
| from modelscope.trainers import build_trainer | |||||
| from modelscope.utils.constant import ModelFile | |||||
| from modelscope.utils.test_utils import test_level | |||||
| class TestTrainerWithNlp(unittest.TestCase): | |||||
| def setUp(self): | |||||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||||
| if not os.path.exists(self.tmp_dir): | |||||
| os.makedirs(self.tmp_dir) | |||||
| from datasets import Dataset | |||||
| dataset_dict = { | |||||
| 'sentence1': [ | |||||
| 'This is test sentence1-1', 'This is test sentence2-1', | |||||
| 'This is test sentence3-1' | |||||
| ], | |||||
| 'sentence2': [ | |||||
| 'This is test sentence1-2', 'This is test sentence2-2', | |||||
| 'This is test sentence3-2' | |||||
| ], | |||||
| 'label': [0, 1, 1] | |||||
| } | |||||
| dataset = Dataset.from_dict(dataset_dict) | |||||
| class MsDatasetDummy(MsDataset): | |||||
| def __len__(self): | |||||
| return len(self._hf_ds) | |||||
| self.dataset = MsDatasetDummy(dataset) | |||||
| def tearDown(self): | |||||
| shutil.rmtree(self.tmp_dir) | |||||
| super().tearDown() | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||||
| def test_trainer(self): | |||||
| model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | |||||
| kwargs = dict( | |||||
| model=model_id, | |||||
| train_dataset=self.dataset, | |||||
| eval_dataset=self.dataset, | |||||
| work_dir=self.tmp_dir) | |||||
| trainer = build_trainer(default_args=kwargs) | |||||
| trainer.train() | |||||
| results_files = os.listdir(self.tmp_dir) | |||||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||||
| for i in range(10): | |||||
| self.assertIn(f'epoch_{i+1}.pth', results_files) | |||||
| @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||||
| def test_trainer_with_model_and_args(self): | |||||
| tmp_dir = tempfile.TemporaryDirectory().name | |||||
| if not os.path.exists(tmp_dir): | |||||
| os.makedirs(tmp_dir) | |||||
| model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | |||||
| cache_path = snapshot_download(model_id) | |||||
| model = SbertTextClassfier.from_pretrained(cache_path) | |||||
| kwargs = dict( | |||||
| cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | |||||
| model=model, | |||||
| train_dataset=self.dataset, | |||||
| eval_dataset=self.dataset, | |||||
| max_epochs=2, | |||||
| work_dir=self.tmp_dir) | |||||
| trainer = build_trainer(default_args=kwargs) | |||||
| trainer.train() | |||||
| results_files = os.listdir(self.tmp_dir) | |||||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||||
| for i in range(2): | |||||
| self.assertIn(f'epoch_{i+1}.pth', results_files) | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||