co-contributed with 夕陌&雨泓

* add torch epoch-based trainer and dist utils
* add hooks including optimizer, lr scheduler, logging, checkpoint, evaluation, and time profiling
* add torch model base and test
* add optimizer and lr scheduler modules
* add sbert for text classification example
* add task_dataset for dataset-level processor

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9338412
| @@ -0,0 +1,175 @@ | |||
| { | |||
| "framework": "pytorch", | |||
| "task": "image_classification", | |||
| "work_dir": "./work_dir", | |||
| "model": { | |||
| "type": "classification", | |||
| "pretrained": null, | |||
| "backbone": { | |||
| "type": "ResNet", | |||
| "depth": 50, | |||
| "out_indices": [ | |||
| 4 | |||
| ], | |||
| "norm_cfg": { | |||
| "type": "BN" | |||
| } | |||
| }, | |||
| "head": { | |||
| "type": "ClsHead", | |||
| "with_avg_pool": true, | |||
| "in_channels": 2048, | |||
| "loss_config": { | |||
| "type": "CrossEntropyLossWithLabelSmooth", | |||
| "label_smooth": 0 | |||
| }, | |||
| "num_classes": 1000 | |||
| } | |||
| }, | |||
| "dataset": { | |||
| "train": { | |||
| "type": "ClsDataset", | |||
| "data_source": { | |||
| "list_file": "data/imagenet_raw/meta/train_labeled.txt", | |||
| "root": "data/imagenet_raw/train/", | |||
| "type": "ClsSourceImageList" | |||
| } | |||
| }, | |||
| "val": { | |||
| "type": "ClsDataset", | |||
| "data_source": { | |||
| "list_file": "data/imagenet_raw/meta/val_labeled.txt", | |||
| "root": "data/imagenet_raw/validation/", | |||
| "type": "ClsSourceImageList" | |||
| } | |||
| }, | |||
| "test": {} | |||
| }, | |||
| "preprocessor":{ | |||
| "train": [ | |||
| { | |||
| "type": "RandomResizedCrop", | |||
| "size": 224 | |||
| }, | |||
| { | |||
| "type": "RandomHorizontalFlip" | |||
| }, | |||
| { | |||
| "type": "ToTensor" | |||
| }, | |||
| { | |||
| "type": "Normalize", | |||
| "mean": [ | |||
| 0.485, | |||
| 0.456, | |||
| 0.406 | |||
| ], | |||
| "std": [ | |||
| 0.229, | |||
| 0.224, | |||
| 0.225 | |||
| ] | |||
| }, | |||
| { | |||
| "type": "Collect", | |||
| "keys": [ | |||
| "img", | |||
| "gt_labels" | |||
| ] | |||
| } | |||
| ], | |||
| "val": [ | |||
| { | |||
| "type": "Resize", | |||
| "size": 256 | |||
| }, | |||
| { | |||
| "type": "CenterCrop", | |||
| "size": 224 | |||
| }, | |||
| { | |||
| "type": "ToTensor" | |||
| }, | |||
| { | |||
| "type": "Normalize", | |||
| "mean": [ | |||
| 0.485, | |||
| 0.456, | |||
| 0.406 | |||
| ], | |||
| "std": [ | |||
| 0.229, | |||
| 0.224, | |||
| 0.225 | |||
| ] | |||
| }, | |||
| { | |||
| "type": "Collect", | |||
| "keys": [ | |||
| "img", | |||
| "gt_labels" | |||
| ] | |||
| } | |||
| ] | |||
| }, | |||
| "train": { | |||
| "dataloader": { | |||
| "batch_size_per_gpu": 2, | |||
| "workers_per_gpu": 1 | |||
| }, | |||
| "optimizer": { | |||
| "type": "SGD", | |||
| "lr": 0.01, | |||
| "options": { | |||
| "grad_clip": { | |||
| "max_norm": 2.0 | |||
| } | |||
| } | |||
| }, | |||
| "lr_scheduler": { | |||
| "type": "StepLR", | |||
| "step_size": 2, | |||
| "options": { | |||
| "warmup": { | |||
| "type": "LinearWarmup", | |||
| "warmup_iters": 2 | |||
| } | |||
| } | |||
| }, | |||
| "hooks": | |||
| [ | |||
| { | |||
| "type": "CheckpointHook", | |||
| "interval": 2 | |||
| }, | |||
| { | |||
| "type": "TextLoggerHook", | |||
| "interval": 1 | |||
| }, | |||
| { | |||
| "type": "IterTimerHook" | |||
| }, | |||
| { | |||
| "type": "EvaluationHook", | |||
| "interval": 1 | |||
| } | |||
| ] | |||
| }, | |||
| "evaluation": { | |||
| "dataloader": { | |||
| "batch_size_per_gpu": 2, | |||
| "workers_per_gpu": 1, | |||
| "shuffle": false | |||
| }, | |||
| "metrics": ["accuracy", "precision", "recall"] | |||
| } | |||
| } | |||
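The trainer consumes a file like this from disk. A minimal loading sketch (the `Config.from_file` loader name is an assumption; only the `modelscope.utils.config.Config` import is confirmed by `BaseTrainer` later in this diff):

    from modelscope.utils.config import Config

    # Hypothetical: parse the JSON above and address its sections as attributes.
    cfg = Config.from_file('configuration.json')  # loader name assumed
    assert cfg.task == 'image_classification'
    print(cfg.train.optimizer.type)      # -> 'SGD'
    print(cfg.train.lr_scheduler.type)   # -> 'StepLR'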
| @@ -0,0 +1,77 @@ | |||
| { | |||
| "task": "sentence-similarity", | |||
| "preprocessor": { | |||
| "type": "bert-seq-cls-tokenizer-finetune", | |||
| "first_sequence": "sentence1", | |||
| "second_sequence": "sentence2" | |||
| }, | |||
| "model": { | |||
| "type": "structbert", | |||
| "attention_probs_dropout_prob": 0.1, | |||
| "easynlp_version": "0.0.3", | |||
| "gradient_checkpointing": false, | |||
| "hidden_act": "gelu", | |||
| "hidden_dropout_prob": 0.1, | |||
| "hidden_size": 768, | |||
| "initializer_range": 0.02, | |||
| "intermediate_size": 3072, | |||
| "layer_norm_eps": 1e-12, | |||
| "max_position_embeddings": 512, | |||
| "num_attention_heads": 12, | |||
| "num_hidden_layers": 12, | |||
| "pad_token_id": 0, | |||
| "position_embedding_type": "absolute", | |||
| "transformers_version": "4.6.0.dev0", | |||
| "type_vocab_size": 2, | |||
| "use_cache": true, | |||
| "vocab_size": 30522 | |||
| }, | |||
| "pipeline": { | |||
| "type": "sentence-similarity" | |||
| }, | |||
| "work_dir": "/tmp", | |||
| "train": { | |||
| "dataloader": { | |||
| "batch_size_per_gpu": 2, | |||
| "workers_per_gpu": 1 | |||
| }, | |||
| "optimizer": { | |||
| "type": "SGD", | |||
| "lr": 0.01, | |||
| "options": { | |||
| "grad_clip": { | |||
| "max_norm": 2.0 | |||
| } | |||
| } | |||
| }, | |||
| "lr_scheduler": { | |||
| "type": "StepLR", | |||
| "step_size": 2, | |||
| "options": { | |||
| "warmup": { | |||
| "type": "LinearWarmup", | |||
| "warmup_iters": 2 | |||
| } | |||
| } | |||
| }, | |||
| "hooks": [{ | |||
| "type": "CheckpointHook", | |||
| "interval": 1 | |||
| }, { | |||
| "type": "TextLoggerHook", | |||
| "interval": 1 | |||
| }, { | |||
| "type": "IterTimerHook" | |||
| }, { | |||
| "type": "EvaluationHook", | |||
| "interval": 1 | |||
| }] | |||
| }, | |||
| "evaluation": { | |||
| "dataloader": { | |||
| "batch_size_per_gpu": 2, | |||
| "workers_per_gpu": 1, | |||
| "shuffle": false | |||
| } | |||
| } | |||
| } | |||
| @@ -36,10 +36,10 @@ modelscope.pipelines.base module | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| modelscope.pipelines.outputs module | |||
| modelscope.outputs module | |||
| ----------------------------------- | |||
| .. automodule:: modelscope.pipelines.outputs | |||
| .. automodule:: modelscope.outputs | |||
| :members: | |||
| :undoc-members: | |||
| :show-inheritance: | |||
| @@ -127,3 +127,16 @@ class Preprocessors(object): | |||
| # multi-modal | |||
| ofa_image_caption = 'ofa-image-caption' | |||
| mplug_visual_question_answering = 'mplug-visual-question-answering' | |||
| class Metrics(object): | |||
| """ Names for different metrics. | |||
| """ | |||
| # accuracy | |||
| accuracy = 'accuracy' | |||
| # metrics for sequence classification task | |||
| seq_cls_metric = 'seq_cls_metric' | |||
| # metrics for token-classification task | |||
| token_cls_metric = 'token-cls-metric' | |||
| @@ -0,0 +1,3 @@ | |||
| from .base import Metric | |||
| from .builder import METRICS, build_metric, task_default_metrics | |||
| from .sequence_classification_metric import SequenceClassificationMetric | |||
| @@ -0,0 +1,37 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from abc import ABC, abstractmethod | |||
| from typing import Dict | |||
| class Metric(ABC): | |||
| """The metric base class for computing metrics. | |||
| Subclasses can either compute a single metric like 'accuracy', or compute | |||
| complex metrics for a specific task, with or without other Metric subclasses. | |||
| """ | |||
| @abstractmethod | |||
| def add(self, outputs: Dict, inputs: Dict): | |||
| """ Append logits and labels within an eval loop. | |||
| Will be called after every batch finishes to gather the model predictions and the labels. | |||
| Args: | |||
| outputs: The model prediction outputs. | |||
| inputs: The mini batch inputs from the dataloader. | |||
| Returns: None | |||
| """ | |||
| pass | |||
| @abstractmethod | |||
| def evaluate(self): | |||
| """Evaluate the metrics after the eval finished. | |||
| Will be called after the whole validation finished. | |||
| Returns: The actual metric dict with standard names. | |||
| """ | |||
| pass | |||
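As a concrete illustration, a subclass only has to implement the two abstract methods. A hypothetical sketch (the `MeanLossMetric` name and the `modelscope.metrics.base` import path are assumptions based on the package layout in this diff):

    from typing import Dict

    from modelscope.metrics.base import Metric  # import path assumed

    class MeanLossMetric(Metric):
        """Hypothetical metric: average a scalar 'loss' output over the eval loop."""

        def __init__(self):
            self.losses = []

        def add(self, outputs: Dict, inputs: Dict):
            # Gather the per-batch loss produced by the model.
            self.losses.append(float(outputs['loss']))

        def evaluate(self):
            return {'mean_loss': sum(self.losses) / max(len(self.losses), 1)}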
| @@ -0,0 +1,35 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from ..metainfo import Metrics | |||
| from ..utils.config import ConfigDict | |||
| from ..utils.constant import Tasks | |||
| from ..utils.registry import Registry, build_from_cfg, default_group | |||
| METRICS = Registry('metrics') | |||
| class MetricKeys(object): | |||
| ACCURACY = 'accuracy' | |||
| F1 = 'f1' | |||
| PRECISION = 'precision' | |||
| RECALL = 'recall' | |||
| task_default_metrics = { | |||
| Tasks.sentence_similarity: [Metrics.seq_cls_metric], | |||
| } | |||
| def build_metric(metric_name: str, | |||
| field: str = default_group, | |||
| default_args: dict = None): | |||
| """ Build metric given metric_name and field. | |||
| Args: | |||
| metric_name (:obj:`str`): The metric name. | |||
| field (str, optional): The field of this metric, default value: 'default' for all fields. | |||
| default_args (dict, optional): Default initialization arguments. | |||
| """ | |||
| cfg = ConfigDict({'type': metric_name}) | |||
| return build_from_cfg( | |||
| cfg, METRICS, group_key=field, default_args=default_args) | |||
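Typical usage resolves a metric by its registered name in the default group (a sketch; the top-level `modelscope.metrics` re-export comes from the package `__init__` added above):

    from modelscope.metainfo import Metrics
    from modelscope.metrics import build_metric  # import path assumed

    metric = build_metric(Metrics.seq_cls_metric)  # -> SequenceClassificationMetric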
| @@ -0,0 +1,40 @@ | |||
| from typing import Dict, List, Union | |||
| import numpy as np | |||
| from modelscope.outputs import OutputKeys | |||
| from ..metainfo import Metrics | |||
| from ..utils.registry import default_group | |||
| from ..utils.tensor_utils import torch_nested_detach, torch_nested_numpify | |||
| from .base import Metric | |||
| from .builder import METRICS, MetricKeys | |||
| @METRICS.register_module( | |||
| group_key=default_group, module_name=Metrics.seq_cls_metric) | |||
| class SequenceClassificationMetric(Metric): | |||
| """The metric computation class for sequence classification classes. | |||
| """ | |||
| label_name = 'labels' | |||
| def __init__(self): | |||
| self.preds = [] | |||
| self.labels = [] | |||
| def add(self, outputs: Dict, inputs: Dict): | |||
| ground_truths = inputs[SequenceClassificationMetric.label_name] | |||
| eval_results = outputs[OutputKeys.LOGITS] | |||
| self.preds.append( | |||
| torch_nested_numpify(torch_nested_detach(eval_results))) | |||
| self.labels.append( | |||
| torch_nested_numpify(torch_nested_detach(ground_truths))) | |||
| def evaluate(self): | |||
| preds = np.concatenate(self.preds, axis=0) | |||
| labels = np.concatenate(self.labels, axis=0) | |||
| preds = np.argmax(preds, axis=1) | |||
| return { | |||
| MetricKeys.ACCURACY: | |||
| (preds == labels).astype(np.float32).mean().item() | |||
| } | |||
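A quick end-to-end check of the accumulate-then-evaluate path (a sketch; it assumes `torch_nested_detach`/`torch_nested_numpify` pass plain tensors through as NumPy arrays, as their names suggest):

    import torch

    from modelscope.metrics import SequenceClassificationMetric  # import path assumed

    m = SequenceClassificationMetric()
    m.add(outputs={'logits': torch.tensor([[0.2, 0.8], [0.9, 0.1]])},
          inputs={'labels': torch.tensor([1, 1])})
    print(m.evaluate())  # {'accuracy': 0.5}: argmax gives [1, 0] vs labels [1, 1]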
| @@ -0,0 +1,23 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Dict | |||
| import torch | |||
| from .base import Model | |||
| class TorchModel(Model, torch.nn.Module): | |||
| """ Base model interface for pytorch | |||
| """ | |||
| def __init__(self, model_dir=None, *args, **kwargs): | |||
| # init reference: https://stackoverflow.com/questions\ | |||
| # /9575409/calling-parent-class-init-with-multiple-inheritance-whats-the-right-way | |||
| super().__init__(model_dir) | |||
| super(Model, self).__init__() | |||
| def forward(self, inputs: Dict[str, | |||
| torch.Tensor]) -> Dict[str, torch.Tensor]: | |||
| raise NotImplementedError | |||
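A concrete model then only needs a `forward` over tensor dicts. A hypothetical subclass (the `modelscope.models.base` import path is an assumption inferred from the relative imports in this diff):

    from typing import Dict

    import torch

    from modelscope.models.base import TorchModel  # import path assumed

    class TinyRegressor(TorchModel):
        """Hypothetical model: one linear layer over a 'features' tensor."""

        def __init__(self, model_dir=None, in_dim=4):
            super().__init__(model_dir)
            self.linear = torch.nn.Linear(in_dim, 1)

        def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
            return {'predictions': self.linear(inputs['features'])}

    out = TinyRegressor()(dict(features=torch.randn(2, 4)))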
| @@ -108,7 +108,7 @@ class CLIPForMultiModalEmbedding(Model): | |||
| return text_ids_tensor, text_mask_tensor | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.outputs import OutputKeys | |||
| output = { | |||
| OutputKeys.IMG_EMBEDDING: None, | |||
| OutputKeys.TEXT_EMBEDDING: None | |||
| @@ -134,7 +134,7 @@ class CLIPForMultiModalEmbedding(Model): | |||
| img_embedding = self.clip_model( | |||
| input_data=img_tensor, input_type='img') | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.outputs import OutputKeys | |||
| output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy() | |||
| if 'text' in input and input['text'] is not None: | |||
| @@ -76,7 +76,7 @@ class OfaForImageCaptioning(Model): | |||
| input = fairseq.utils.move_to_cuda(input, device=self._device) | |||
| results, _ = self.eval_caption(self.task, self.generator, self.models, | |||
| input) | |||
| from ...pipelines.outputs import OutputKeys | |||
| from modelscope.outputs import OutputKeys | |||
| return { | |||
| 'image_id': results[0]['image_id'], | |||
| OutputKeys.CAPTION: results[0][OutputKeys.CAPTION] | |||
| @@ -7,7 +7,10 @@ import torch | |||
| from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel | |||
| from torch import nn | |||
| from modelscope.metainfo import Models | |||
| from modelscope.utils.constant import Tasks | |||
| from ..base import Model | |||
| from ..builder import MODELS | |||
| class SbertTextClassfier(SbertPreTrainedModel): | |||
| @@ -20,7 +23,11 @@ class SbertTextClassfier(SbertPreTrainedModel): | |||
| self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
| self.classifier = nn.Linear(config.hidden_size, config.num_labels) | |||
| def forward(self, input_ids=None, token_type_ids=None): | |||
| def forward(self, | |||
| input_ids=None, | |||
| token_type_ids=None, | |||
| labels=None, | |||
| **kwargs): | |||
| outputs = self.encoder( | |||
| input_ids, | |||
| token_type_ids=token_type_ids, | |||
| @@ -29,6 +36,10 @@ class SbertTextClassfier(SbertPreTrainedModel): | |||
| pooled_output = outputs[1] | |||
| pooled_output = self.dropout(pooled_output) | |||
| logits = self.classifier(pooled_output) | |||
| if labels is not None: | |||
| loss_fct = nn.CrossEntropyLoss() | |||
| loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) | |||
| return {'logits': logits, 'loss': loss} | |||
| return {'logits': logits} | |||
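The pattern above, returning a loss only when labels are provided, keeps a single `forward` usable for both training and inference. A standalone sketch of the same contract, runnable without an SBERT checkpoint (all names here are illustrative):

    import torch
    from torch import nn

    class TinyClassifier(nn.Module):
        """Illustrates the 'loss only when labels are given' convention."""

        def __init__(self, hidden=8, num_labels=2):
            super().__init__()
            self.num_labels = num_labels
            self.classifier = nn.Linear(hidden, num_labels)

        def forward(self, features, labels=None):
            logits = self.classifier(features)
            if labels is not None:
                loss = nn.CrossEntropyLoss()(
                    logits.view(-1, self.num_labels), labels.view(-1))
                return {'logits': logits, 'loss': loss}
            return {'logits': logits}

    out = TinyClassifier()(torch.randn(4, 8), labels=torch.tensor([0, 1, 0, 1]))
    out['loss'].backward()  # training path; omit labels for inference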
| @@ -4,6 +4,7 @@ from modelscope.utils.constant import Tasks | |||
| class OutputKeys(object): | |||
| LOGITS = 'logits' | |||
| SCORES = 'scores' | |||
| LABEL = 'label' | |||
| LABELS = 'labels' | |||
| @@ -7,10 +7,10 @@ import soundfile as sf | |||
| import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.utils.constant import Tasks | |||
| from ..base import Input, Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| def audio_norm(x): | |||
| @@ -9,12 +9,12 @@ import yaml | |||
| from modelscope.fileio import File | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.preprocessors.audio import LinearAECAndFbank | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| logger = get_logger() | |||
| @@ -5,9 +5,9 @@ import numpy as np | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models import Model | |||
| from modelscope.models.audio.tts import SambertHifigan | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input, InputModel, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.constant import Fields, Tasks | |||
| __all__ = ['TextToSpeechSambertHifiganPipeline'] | |||
| @@ -7,10 +7,10 @@ from typing import Any, Dict, Generator, List, Union | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.models.base import Model | |||
| from modelscope.msdatasets import MsDataset | |||
| from modelscope.outputs import TASK_OUTPUTS | |||
| from modelscope.preprocessors import Preprocessor | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.logger import get_logger | |||
| from .outputs import TASK_OUTPUTS | |||
| from .util import is_model, is_official_hub_path | |||
| Tensor = Union['torch.Tensor', 'tf.Tensor'] | |||
| @@ -6,6 +6,7 @@ import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.action_recognition.models import BaseVideoModel | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.preprocessors.video import ReadVideoData | |||
| from modelscope.utils.config import Config | |||
| @@ -13,7 +14,6 @@ from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| logger = get_logger() | |||
| @@ -10,13 +10,13 @@ from torchvision import transforms | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.animal_recognition import resnet | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.preprocessors import load_image | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| logger = get_logger() | |||
| @@ -11,8 +11,8 @@ from PIL import Image | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.cmdssl_video_embedding.resnet2p1d import \ | |||
| resnet26_2p1d | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| @@ -11,13 +11,13 @@ from modelscope.models.cv.cartoon.facelib.facer import FaceAna | |||
| from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import ( | |||
| get_reference_facial_points, warp_and_crop_face) | |||
| from modelscope.models.cv.cartoon.utils import get_f5p, padTo16x, resize_size | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.preprocessors import load_image | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| if tf.__version__ >= '2.0': | |||
| tf = tf.compat.v1 | |||
| @@ -6,13 +6,13 @@ import numpy as np | |||
| import PIL | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.preprocessors import load_image | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| logger = get_logger() | |||
| @@ -7,13 +7,13 @@ import PIL | |||
| import tensorflow as tf | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.preprocessors import load_image | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | |||
| if tf.__version__ >= '2.0': | |||
| @@ -3,12 +3,12 @@ from typing import Any, Dict | |||
| import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Model, Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| logger = get_logger() | |||
| @@ -2,6 +2,7 @@ | |||
| from typing import Any, Dict, Union | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import SpaceForDialogIntent | |||
| @@ -9,7 +10,6 @@ from ...preprocessors import DialogIntentPredictionPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['DialogIntentPredictionPipeline'] | |||
| @@ -2,6 +2,7 @@ | |||
| from typing import Dict, Union | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import SpaceForDialogModeling | |||
| @@ -9,7 +10,6 @@ from ...preprocessors import DialogModelingPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline, Tensor | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['DialogModelingPipeline'] | |||
| @@ -1,12 +1,12 @@ | |||
| from typing import Any, Dict, Union | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model, SpaceForDialogStateTracking | |||
| from ...preprocessors import DialogStateTrackingPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['DialogStateTrackingPipeline'] | |||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Optional, Union | |||
| import torch | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp.masked_language import MaskedLanguageModelBase | |||
| @@ -11,7 +12,6 @@ from ...utils.config import Config | |||
| from ...utils.constant import ModelFile, Tasks | |||
| from ..base import Pipeline, Tensor | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['FillMaskPipeline'] | |||
| _type_map = {'veco': 'roberta', 'sbert': 'bert'} | |||
| @@ -4,6 +4,7 @@ from typing import Any, Dict, Union | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import SbertForNLI | |||
| @@ -11,7 +12,6 @@ from ...preprocessors import NLIPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['NLIPipeline'] | |||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Union | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import SbertForSentenceSimilarity | |||
| @@ -10,7 +11,6 @@ from ...preprocessors import SentenceSimilarityPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Input, Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['SentenceSimilarityPipeline'] | |||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Union | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import SbertForSentimentClassification | |||
| @@ -10,7 +11,6 @@ from ...preprocessors import SentimentClassificationPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['SentimentClassificationPipeline'] | |||
| @@ -4,12 +4,12 @@ import numpy as np | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.nlp import BertForSequenceClassification | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.preprocessors import SequenceClassificationPreprocessor | |||
| from modelscope.utils.constant import Tasks | |||
| from ...models import Model | |||
| from ..base import Input, Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['SequenceClassificationPipeline'] | |||
| @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union | |||
| import torch | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import PalmForTextGeneration | |||
| @@ -9,7 +10,6 @@ from ...preprocessors import TextGenerationPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline, Tensor | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['TextGenerationPipeline'] | |||
| @@ -4,6 +4,7 @@ from typing import Any, Dict | |||
| import numpy as np | |||
| import tensorflow as tf | |||
| from modelscope.outputs import OutputKeys | |||
| from ...hub.snapshot_download import snapshot_download | |||
| from ...metainfo import Pipelines | |||
| from ...models.nlp import CsanmtForTranslation | |||
| @@ -11,7 +12,6 @@ from ...utils.constant import ModelFile, Tasks | |||
| from ...utils.logger import get_logger | |||
| from ..base import Pipeline, Tensor | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| if tf.__version__ >= '2.0': | |||
| tf = tf.compat.v1 | |||
| @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union | |||
| import torch | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import SbertForTokenClassification | |||
| @@ -9,7 +10,6 @@ from ...preprocessors import TokenClassificationPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline, Tensor | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['WordSegmentationPipeline'] | |||
| @@ -3,6 +3,7 @@ from typing import Any, Dict, Union | |||
| import torch | |||
| from scipy.special import softmax | |||
| from modelscope.outputs import OutputKeys | |||
| from ...metainfo import Pipelines | |||
| from ...models import Model | |||
| from ...models.nlp import SbertForZeroShotClassification | |||
| @@ -10,7 +11,6 @@ from ...preprocessors import ZeroShotClassificationPreprocessor | |||
| from ...utils.constant import Tasks | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| from ..outputs import OutputKeys | |||
| __all__ = ['ZeroShotClassificationPipeline'] | |||
| @@ -8,6 +8,7 @@ from transformers import AutoTokenizer | |||
| from ..metainfo import Preprocessors | |||
| from ..models import Model | |||
| from ..utils.constant import Fields, InputFields | |||
| from ..utils.hub import parse_label_mapping | |||
| from ..utils.type_assert import type_assert | |||
| from .base import Preprocessor | |||
| from .builder import PREPROCESSORS | |||
| @@ -115,7 +116,8 @@ class SentenceSimilarityPreprocessor(NLPPreprocessorBase): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| kwargs['truncation'] = True | |||
| kwargs['padding'] = False | |||
| kwargs['padding'] = False if 'padding' not in kwargs else kwargs[ | |||
| 'padding'] | |||
| kwargs['return_tensors'] = 'pt' | |||
| kwargs['max_length'] = kwargs.pop('sequence_length', 128) | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| @@ -143,6 +145,7 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||
| self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) | |||
| print(f'this is the tokenizer {self.tokenizer}') | |||
| self.label2id = parse_label_mapping(self.model_dir) | |||
| @type_assert(object, (str, tuple, Dict)) | |||
| def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: | |||
| @@ -164,7 +167,7 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||
| 'id': [], | |||
| 'input_ids': [], | |||
| 'attention_mask': [], | |||
| 'token_type_ids': [] | |||
| 'token_type_ids': [], | |||
| } | |||
| max_seq_length = self.sequence_length | |||
| @@ -186,6 +189,29 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||
| return rst | |||
| @PREPROCESSORS.register_module( | |||
| Fields.nlp, module_name='bert-seq-cls-tokenizer-finetune') | |||
| class SentenceSimilarityFinetunePreprocessor(SentenceSimilarityPreprocessor): | |||
| """Sentence similarity preprocessor in the finetune scenario | |||
| Mainly added the label mapping procedure. | |||
| """ | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| kwargs['padding'] = 'max_length' | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| self.label2id = parse_label_mapping(self.model_dir) | |||
| @type_assert(object, (str, tuple, Dict)) | |||
| def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: | |||
| rst = super().__call__(data) | |||
| rst = {k: v.squeeze() for k, v in rst.items()} | |||
| if self.label2id is not None and 'label' in data: | |||
| rst['labels'] = [] | |||
| rst['labels'].append(self.label2id[str(data['label'])]) | |||
| return rst | |||
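A hypothetical call, assuming `model_dir` contains tokenizer files plus a label mapping readable by `parse_label_mapping` (the path and label value below are placeholders):

    preprocessor = SentenceSimilarityFinetunePreprocessor(model_dir='/path/to/model')
    features = preprocessor({
        'sentence1': 'A man is playing a guitar.',
        'sentence2': 'Someone plays an instrument.',
        'label': '1',
    })
    # features: squeezed, max_length-padded 'input_ids', 'attention_mask', ...
    # plus 'labels' mapped through label2id.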
| @PREPROCESSORS.register_module( | |||
| Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer) | |||
| class TextGenerationPreprocessor(NLPPreprocessorBase): | |||
| @@ -0,0 +1,3 @@ | |||
| from .base import TaskDataset | |||
| from .builder import build_task_dataset | |||
| from .torch_base_dataset import TorchTaskDataset | |||
| @@ -0,0 +1,48 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from abc import ABC, abstractmethod | |||
| from typing import Any, List, Tuple | |||
| class TaskDataset(ABC): | |||
| """The task dataset base class for all the task specific dataset processors. | |||
| """ | |||
| def __init__(self, | |||
| datasets: Tuple[Any, List[Any]], | |||
| mode, | |||
| preprocessor=None, | |||
| **kwargs): | |||
| super().__init__() | |||
| self.mode = mode | |||
| self.preprocessor = preprocessor | |||
| self._inner_dataset = self.compose_dataset(datasets) | |||
| @abstractmethod | |||
| def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: | |||
| """Prepare a dataset. | |||
| Users can process the input datasets from a whole-dataset perspective. | |||
| This method also helps to merge several datasets into one. | |||
| Args: | |||
| datasets: The original dataset(s) | |||
| Returns: A single dataset, which may be created after merging. | |||
| """ | |||
| pass | |||
| @abstractmethod | |||
| def preprocess_dataset(self, data): | |||
| """Preprocess the data fetched from the inner_dataset. | |||
| If the preprocessor is None, the original data will be returned, else the preprocessor will be called. | |||
| Users can override this method to implement custom logic. | |||
| Args: | |||
| data: The data fetched from the dataset. | |||
| Returns: The processed data. | |||
| """ | |||
| pass | |||
| @@ -0,0 +1,21 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from modelscope.utils.config import ConfigDict | |||
| from modelscope.utils.registry import Registry, build_from_cfg | |||
| TASK_DATASETS = Registry('task_datasets') | |||
| def build_task_dataset(cfg: ConfigDict, | |||
| task_name: str = None, | |||
| default_args: dict = None): | |||
| """ Build task specific dataset processor given model config dict and the task name. | |||
| Args: | |||
| cfg (:obj:`ConfigDict`): config dict for model object. | |||
| task_name (str, optional): task name, refer to | |||
| :obj:`Tasks` for more details | |||
| default_args (dict, optional): Default initialization arguments. | |||
| """ | |||
| return build_from_cfg( | |||
| cfg, TASK_DATASETS, group_key=task_name, default_args=default_args) | |||
| @@ -0,0 +1,63 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, List, Tuple | |||
| from torch.utils.data import ConcatDataset, Dataset | |||
| from .base import TaskDataset | |||
| class TorchTaskDataset(TaskDataset, Dataset): | |||
| """The task dataset base class for all the torch-based task processors. | |||
| This base class is enough for most cases, except when there are procedures, such as | |||
| dataset merging, that cannot be executed in preprocessors or Datasets. | |||
| """ | |||
| def __init__(self, | |||
| datasets: Tuple[Any, List[Any]], | |||
| mode, | |||
| preprocessor=None, | |||
| **kwargs): | |||
| TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs) | |||
| def __getitem__(self, index) -> Any: | |||
| return self.preprocess_dataset(self._inner_dataset[index]) | |||
| def __len__(self): | |||
| return len(self._inner_dataset) | |||
| def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: | |||
| """Prepare a dataset. | |||
| Users can process the input datasets from a whole-dataset perspective. | |||
| This method gives a default implementation of dataset merging; users can override | |||
| this method to write custom logic. | |||
| Args: | |||
| datasets: The original dataset(s) | |||
| Returns: A single dataset, which may be created after merging. | |||
| """ | |||
| if isinstance(datasets, list): | |||
| if len(datasets) == 1: | |||
| return datasets[0] | |||
| elif len(datasets) > 1: | |||
| return ConcatDataset(datasets) | |||
| else: | |||
| return datasets | |||
| def preprocess_dataset(self, data): | |||
| """Preprocess the data fetched from the inner_dataset. | |||
| If the preprocessor is None, the original data will be returned, else the preprocessor will be called. | |||
| Users can override this method to implement custom logic. | |||
| Args: | |||
| data: The data fetched from the dataset. | |||
| Returns: The processed data. | |||
| """ | |||
| return self.preprocessor( | |||
| data) if self.preprocessor is not None else data | |||
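A minimal usage sketch of the default merging behavior, using plain `torch.utils.data` datasets and no preprocessor (items pass through unchanged):

    import torch
    from torch.utils.data import TensorDataset

    d1 = TensorDataset(torch.arange(4))
    d2 = TensorDataset(torch.arange(4, 10))

    ds = TorchTaskDataset([d1, d2], mode='train', preprocessor=None)
    assert len(ds) == 10  # the two datasets are concatenated
    print(ds[5])          # item 1 of d2, returned as-is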
| @@ -1,3 +1,4 @@ | |||
| from .base import DummyTrainer | |||
| from .builder import build_trainer | |||
| from .nlp import SequenceClassificationTrainer | |||
| from .trainer import EpochBasedTrainer | |||
| @@ -1,10 +1,12 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import time | |||
| from abc import ABC, abstractmethod | |||
| from typing import Callable, Dict, List, Optional, Tuple, Union | |||
| from modelscope.trainers.builder import TRAINERS | |||
| from modelscope.utils.config import Config | |||
| from .utils.log_buffer import LogBuffer | |||
| class BaseTrainer(ABC): | |||
| @@ -27,6 +29,8 @@ class BaseTrainer(ABC): | |||
| self.args = self.cfg.to_args(arg_parse_fn) | |||
| else: | |||
| self.args = None | |||
| self.log_buffer = LogBuffer() | |||
| self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) | |||
| @abstractmethod | |||
| def train(self, *args, **kwargs): | |||
| @@ -5,9 +5,10 @@ from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.registry import Registry, build_from_cfg | |||
| TRAINERS = Registry('trainers') | |||
| HOOKS = Registry('hooks') | |||
| def build_trainer(name: str = None, default_args: dict = None): | |||
| def build_trainer(name: str = 'EpochBasedTrainer', default_args: dict = None): | |||
| """ build trainer given a trainer name | |||
| Args: | |||
| @@ -15,7 +16,5 @@ def build_trainer(name: str = None, default_args: dict = None): | |||
| will be used. | |||
| default_args (dict, optional): Default initialization arguments. | |||
| """ | |||
| if name is None: | |||
| name = 'Trainer' | |||
| cfg = dict(type=name) | |||
| return build_from_cfg(cfg, TRAINERS, default_args=default_args) | |||
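With the new default, the common path needs no trainer name at all. A usage sketch (the `default_args` keys such as `cfg_file` are assumptions about `EpochBasedTrainer`'s constructor, which this hunk does not show):

    from modelscope.trainers import build_trainer

    trainer = build_trainer(  # same as build_trainer('EpochBasedTrainer', ...)
        default_args=dict(cfg_file='configuration.json'))  # keys assumed
    trainer.train()
    trainer.evaluate()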
| @@ -0,0 +1,16 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .builder import HOOKS, build_hook | |||
| from .checkpoint_hook import CheckpointHook | |||
| from .evaluation_hook import EvaluationHook | |||
| from .hook import Hook | |||
| from .iter_timer_hook import IterTimerHook | |||
| from .logger.text_logger_hook import TextLoggerHook | |||
| from .lr_scheduler_hook import LrSchedulerHook | |||
| from .optimizer_hook import OptimizerHook | |||
| from .priority import Priority | |||
| __all__ = [ | |||
| 'Hook', 'HOOKS', 'CheckpointHook', 'EvaluationHook', 'LrSchedulerHook', | |||
| 'OptimizerHook', 'Priority', 'build_hook', 'TextLoggerHook', | |||
| 'IterTimerHook' | |||
| ] | |||
| @@ -0,0 +1,9 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||
| HOOKS = Registry('hooks') | |||
| def build_hook(cfg, default_args=None): | |||
| return build_from_cfg( | |||
| cfg, HOOKS, group_key=default_group, default_args=default_args) | |||
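Hooks are then built from the same kind of dicts that appear in the "hooks" sections of the JSON configs above (a sketch; it assumes the bare `@HOOKS.register_module()` decorator registers each hook under its class name):

    from modelscope.trainers.hooks import build_hook  # import path assumed

    hook = build_hook(dict(type='CheckpointHook', interval=2))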
| @@ -0,0 +1,92 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from modelscope import __version__ | |||
| from modelscope.utils.checkpoint import save_checkpoint | |||
| from modelscope.utils.logger import get_logger | |||
| from modelscope.utils.torch_utils import get_dist_info | |||
| from .builder import HOOKS | |||
| from .hook import Hook | |||
| from .priority import Priority | |||
| @HOOKS.register_module() | |||
| class CheckpointHook(Hook): | |||
| """Save checkpoints periodically. | |||
| Args: | |||
| interval (int): The frequency of saving the model. If `by_epoch=True`, | |||
| it means the number of epochs; otherwise, the number of iterations. | |||
| by_epoch (bool): Saving checkpoints by epoch or by iteration. | |||
| save_optimizer (bool): Whether to save optimizer state dict. Default: True. | |||
| save_dir (str): The directory to save checkpoints. If None, use `trainer.work_dir`. | |||
| save_last (bool): Whether to save the last checkpoint. Default: True. | |||
| """ | |||
| PRIORITY = Priority.NORMAL | |||
| def __init__(self, | |||
| interval=0, | |||
| by_epoch=True, | |||
| save_optimizer=True, | |||
| save_dir=None, | |||
| save_last=True): | |||
| self.interval = interval | |||
| self.by_epoch = by_epoch | |||
| self.save_optimizer = save_optimizer | |||
| self.save_dir = save_dir | |||
| self.save_last = save_last | |||
| def before_run(self, trainer): | |||
| if not self.save_dir: | |||
| self.save_dir = trainer.work_dir | |||
| if not hasattr(trainer, 'logger'): | |||
| self.logger = get_logger(__name__) | |||
| else: | |||
| self.logger = trainer.logger | |||
| self.logger.info(f'Checkpoints will be saved to {self.save_dir}') | |||
| def after_train_epoch(self, trainer): | |||
| if not self.by_epoch: | |||
| return | |||
| if self._should_save(trainer): | |||
| self.logger.info(f'Saving checkpoint at epoch {trainer.epoch + 1}') | |||
| self._save_checkpoint(trainer) | |||
| def _save_checkpoint(self, trainer): | |||
| if self.by_epoch: | |||
| cur_save_name = os.path.join(self.save_dir, | |||
| f'epoch_{trainer.epoch + 1}.pth') | |||
| else: | |||
| cur_save_name = os.path.join(self.save_dir, | |||
| f'iter_{trainer.iter + 1}.pth') | |||
| rank, _ = get_dist_info() | |||
| if rank == 0: | |||
| save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) | |||
| def after_train_iter(self, trainer): | |||
| if self.by_epoch: | |||
| return | |||
| if self._should_save(trainer): | |||
| self.logger.info( | |||
| f'Saving checkpoint at iteration {trainer.iter + 1}') | |||
| self._save_checkpoint(trainer) | |||
| def _should_save(self, trainer): | |||
| if self.by_epoch: | |||
| check_last = self.is_last_epoch | |||
| check_frequency = self.every_n_epochs | |||
| else: | |||
| check_last = self.is_last_iter | |||
| check_frequency = self.every_n_iters | |||
| if check_frequency(trainer, | |||
| self.interval) or (self.save_last | |||
| and check_last(trainer)): | |||
| return True | |||
| return False | |||
| @@ -0,0 +1,74 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .builder import HOOKS | |||
| from .hook import Hook | |||
| from .priority import Priority | |||
| @HOOKS.register_module() | |||
| class EvaluationHook(Hook): | |||
| """Evaluation hook. | |||
| Args: | |||
| interval (int): Evaluation interval. | |||
| by_epoch (bool): Evaluate by epoch or by iteration. | |||
| start_idx (int | None, optional): The epoch/iteration at which validation begins. | |||
| Default: None, i.e. validate every `interval` epochs/iterations from the start. | |||
| """ | |||
| PRIORITY = Priority.NORMAL | |||
| def __init__(self, interval=1, by_epoch=True, start_idx=None): | |||
| assert interval > 0, 'interval must be a positive number' | |||
| self.interval = interval | |||
| self.start_idx = start_idx | |||
| self.by_epoch = by_epoch | |||
| def after_train_iter(self, trainer): | |||
| """Called after every training iter to evaluate the results.""" | |||
| if not self.by_epoch and self._should_evaluate(trainer): | |||
| self.do_evaluate(trainer) | |||
| def after_train_epoch(self, trainer): | |||
| """Called after every training epoch to evaluate the results.""" | |||
| if self.by_epoch and self._should_evaluate(trainer): | |||
| self.do_evaluate(trainer) | |||
| def do_evaluate(self, trainer): | |||
| """Evaluate the results.""" | |||
| eval_res = trainer.evaluate() | |||
| for name, val in eval_res.items(): | |||
| trainer.log_buffer.output[name] = val | |||
| trainer.log_buffer.ready = True | |||
| def _should_evaluate(self, trainer): | |||
| """Judge whether to perform evaluation. | |||
| Here is the rule to judge whether to perform evaluation: | |||
| 1. It will not perform evaluation during the epoch/iteration interval, | |||
| which is determined by ``self.interval``. | |||
| 2. It will not perform evaluation if the ``start_idx`` is larger than | |||
| current epochs/iters. | |||
| 3. It will not perform evaluation when current epochs/iters is larger than | |||
| the ``start_idx`` but during epoch/iteration interval. | |||
| Returns: | |||
| bool: The flag indicating whether to perform evaluation. | |||
| """ | |||
| if self.by_epoch: | |||
| current = trainer.epoch | |||
| check_time = self.every_n_epochs | |||
| else: | |||
| current = trainer.iter | |||
| check_time = self.every_n_iters | |||
| if self.start_idx is None: | |||
| if not check_time(trainer, self.interval): | |||
| return False | |||
| elif (current + 1) < self.start_idx: | |||
| return False | |||
| else: | |||
| if (current + 1 - self.start_idx) % self.interval: | |||
| return False | |||
| return True | |||
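A worked example of this schedule with `by_epoch=True`:

    # interval=2, start_idx=None -> evaluate after epochs 2, 4, 6, ...
    # interval=2, start_idx=5    -> evaluate after epochs 5, 7, 9, ...
    #   (fires when current + 1 == start_idx, then every `interval` epochs after)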
| @@ -0,0 +1,208 @@ | |||
| # Copyright (c) OpenMMLab. All rights reserved. | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from modelscope.utils.import_utils import is_method_overridden | |||
| from .priority import Priority | |||
| class Hook: | |||
| """ | |||
| The Hook base class of any modelscope trainer. You can build your own hook by inheriting from this class. | |||
| """ | |||
| # TODO @jiangnana.jnn use constant variable for stages | |||
| stages = ('before_run', 'before_train_epoch', 'before_train_iter', | |||
| 'after_train_iter', 'after_train_epoch', 'before_val_epoch', | |||
| 'before_val_iter', 'after_val_iter', 'after_val_epoch', | |||
| 'after_run') | |||
| PRIORITY = Priority.NORMAL | |||
| def before_run(self, trainer): | |||
| """ | |||
| Will be called before any loop begins. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| pass | |||
| def after_run(self, trainer): | |||
| """ | |||
| Will be called after all loops end. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| pass | |||
| def before_epoch(self, trainer): | |||
| """ | |||
| Will be called before every epoch begins. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| pass | |||
| def after_epoch(self, trainer): | |||
| """ | |||
| Will be called after every epoch ends. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| pass | |||
| def before_iter(self, trainer): | |||
| """ | |||
| Will be called before every iteration begins. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| pass | |||
| def after_iter(self, trainer): | |||
| """ | |||
| Will be called after every iteration ends. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| pass | |||
| def before_train_epoch(self, trainer): | |||
| """ | |||
| Will be called before every train epoch begins. Calls ``self.before_epoch`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.before_epoch(trainer) | |||
| def before_val_epoch(self, trainer): | |||
| """ | |||
| Will be called before every validation epoch begins. Calls ``self.before_epoch`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.before_epoch(trainer) | |||
| def after_train_epoch(self, trainer): | |||
| """ | |||
| Will be called after every train epoch ends. Calls ``self.after_epoch`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.after_epoch(trainer) | |||
| def after_val_epoch(self, trainer): | |||
| """ | |||
| Will be called after every validation epoch ends. Calls ``self.after_epoch`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.after_epoch(trainer) | |||
| def before_train_iter(self, trainer): | |||
| """ | |||
| Will be called before every train iteration begins. Calls ``self.before_iter`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.before_iter(trainer) | |||
| def before_val_iter(self, trainer): | |||
| """ | |||
| Will be called before every validation iteration begins. Calls ``self.before_iter`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.before_iter(trainer) | |||
| def after_train_iter(self, trainer): | |||
| """ | |||
| Will be called after every train iteration ends. Calls ``self.after_iter`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.after_iter(trainer) | |||
| def after_val_iter(self, trainer): | |||
| """ | |||
| Will be called after every validation iteration ends. Calls ``self.after_iter`` by default. | |||
| Args: | |||
| trainer: The trainer instance. | |||
| Returns: None | |||
| """ | |||
| self.after_iter(trainer) | |||
| def every_n_epochs(self, trainer, n): | |||
| """ | |||
| Whether the end of every ``n``-th epoch has been reached | |||
| Returns: bool | |||
| """ | |||
| return (trainer.epoch + 1) % n == 0 if n > 0 else False | |||
| def every_n_iters(self, trainer, n): | |||
| """ | |||
| Whether the end of every ``n``-th iteration has been reached | |||
| Returns: bool | |||
| """ | |||
| return (trainer.iter + 1) % n == 0 if n > 0 else False | |||
| def end_of_epoch(self, trainer): | |||
| """ | |||
| Whether the end of the current epoch has been reached | |||
| Returns: bool | |||
| """ | |||
| return trainer.inner_iter + 1 == len(trainer.data_loader) | |||
| def is_last_epoch(self, trainer): | |||
| """ | |||
| Whether the current epoch is the last training epoch | |||
| Returns: bool | |||
| """ | |||
| return trainer.epoch + 1 == trainer._max_epochs | |||
| def is_last_iter(self, trainer): | |||
| """ | |||
| Whether the current iteration is the last iteration of the entire training process | |||
| Returns: bool | |||
| """ | |||
| return trainer.iter + 1 == trainer._max_iters | |||
| def get_triggered_stages(self): | |||
| trigger_stages = set() | |||
| for stage in Hook.stages: | |||
| if is_method_overridden(stage, Hook, self): | |||
| trigger_stages.add(stage) | |||
| return [stage for stage in Hook.stages if stage in trigger_stages] | |||
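A custom hook overrides only the stages it cares about, and `get_triggered_stages` lets the trainer register it for just those callbacks. A hypothetical sketch (it assumes `trainer.model` is the torch module and `trainer.log_buffer` accepts `update`, as other hooks in this diff do):

    from modelscope.trainers.hooks import HOOKS, Hook  # import path assumed

    @HOOKS.register_module()
    class GradNormHook(Hook):
        """Hypothetical hook: log the total gradient norm every 10 train iterations."""

        def after_train_iter(self, trainer):
            if self.every_n_iters(trainer, 10):
                total = sum(p.grad.norm().item()
                            for p in trainer.model.parameters()
                            if p.grad is not None)
                trainer.log_buffer.update({'grad_norm': total})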
| @@ -0,0 +1,22 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import time | |||
| from .builder import HOOKS | |||
| from .hook import Hook | |||
| from .priority import Priority | |||
| @HOOKS.register_module() | |||
| class IterTimerHook(Hook): | |||
| PRIORITY = Priority.LOW | |||
| def before_epoch(self, trainer): | |||
| self.start_time = time.time() | |||
| def before_iter(self, trainer): | |||
| trainer.log_buffer.update( | |||
| {'data_load_time': time.time() - self.start_time}) | |||
| def after_iter(self, trainer): | |||
| trainer.log_buffer.update({'time': time.time() - self.start_time}) | |||
| self.start_time = time.time() | |||
| @@ -0,0 +1,6 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from modelscope.trainers.utils.log_buffer import LogBuffer | |||
| from .base import LoggerHook | |||
| from .text_logger_hook import TextLoggerHook | |||
| __all__ = ['TextLoggerHook', 'LoggerHook', 'LogBuffer'] | |||
| @@ -0,0 +1,115 @@ | |||
| # Copyright (c) OpenMMLab. All rights reserved. | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import numbers | |||
| from abc import ABCMeta, abstractmethod | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.trainers.hooks.hook import Hook | |||
| from ..priority import Priority | |||
| class LoggerHook(Hook): | |||
| """Base class for logger hooks. | |||
| Args: | |||
| interval (int): Logging interval (every k iterations). | |||
| ignore_last (bool): Ignore the log of the last iterations in each epoch | |||
| if they are fewer than `interval`. | |||
| reset_flag (bool): Whether to clear the output buffer after logging. | |||
| by_epoch (bool): Whether EpochBasedTrainer is used. | |||
| """ | |||
| __metaclass__ = ABCMeta | |||
| PRIORITY = Priority.VERY_LOW | |||
| def __init__(self, | |||
| interval=10, | |||
| ignore_last=True, | |||
| reset_flag=False, | |||
| by_epoch=True): | |||
| self.interval = interval | |||
| self.ignore_last = ignore_last | |||
| self.reset_flag = reset_flag | |||
| self.by_epoch = by_epoch | |||
| @abstractmethod | |||
| def log(self, trainer): | |||
| pass | |||
| @staticmethod | |||
| def is_scalar(val, include_np=True, include_torch=True): | |||
| """Tell the input variable is a scalar or not. | |||
| Args: | |||
| val: Input variable. | |||
| include_np (bool): Whether to treat 0-d np.ndarray as a scalar. | |||
| include_torch (bool): Whether to treat 0-d torch.Tensor as a scalar. | |||
| Returns: | |||
| bool: True or False. | |||
| """ | |||
| if isinstance(val, numbers.Number): | |||
| return True | |||
| elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: | |||
| return True | |||
| elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: | |||
| return True | |||
| else: | |||
| return False | |||
| def get_epoch(self, trainer): | |||
| if trainer.mode == 'train': | |||
| epoch = trainer.epoch + 1 | |||
| elif trainer.mode == 'val': | |||
| # normal val mode | |||
| # trainer.epoch += 1 has been done before val workflow | |||
| epoch = trainer.epoch | |||
| else: | |||
| raise ValueError(f"trainer mode should be 'train' or 'val', " | |||
| f'but got {trainer.mode}') | |||
| return epoch | |||
| def get_iter(self, trainer, inner_iter=False): | |||
| """Get the current training iteration step.""" | |||
| if self.by_epoch and inner_iter: | |||
| current_iter = trainer.inner_iter + 1 | |||
| else: | |||
| current_iter = trainer.iter + 1 | |||
| return current_iter | |||
| def before_run(self, trainer): | |||
| for hook in trainer.hooks[::-1]: | |||
| if isinstance(hook, LoggerHook): | |||
| hook.reset_flag = True | |||
| break | |||
| def before_epoch(self, trainer): | |||
| trainer.log_buffer.clear() # clear logs of last epoch | |||
| def after_train_iter(self, trainer): | |||
| if self.by_epoch and (trainer.inner_iter + 1) % self.interval == 0: | |||
| trainer.log_buffer.average(self.interval) | |||
| elif not self.by_epoch and self.every_n_iters(trainer, self.interval): | |||
| trainer.log_buffer.average(self.interval) | |||
| elif self.end_of_epoch(trainer) and not self.ignore_last: | |||
| # not precise but more stable | |||
| trainer.log_buffer.average(self.interval) | |||
| if trainer.log_buffer.ready: | |||
| self.log(trainer) | |||
| if self.reset_flag: | |||
| trainer.log_buffer.clear_output() | |||
| def after_train_epoch(self, trainer): | |||
| if trainer.log_buffer.ready: | |||
| self.log(trainer) | |||
| if self.reset_flag: | |||
| trainer.log_buffer.clear_output() | |||
| def after_val_epoch(self, trainer): | |||
| trainer.log_buffer.average() | |||
| self.log(trainer) | |||
| if self.reset_flag: | |||
| trainer.log_buffer.clear_output() | |||
| @@ -0,0 +1,159 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import datetime | |||
| import os | |||
| import os.path as osp | |||
| from collections import OrderedDict | |||
| import json | |||
| import torch | |||
| from torch import distributed as dist | |||
| from modelscope.utils.torch_utils import get_dist_info | |||
| from ..builder import HOOKS | |||
| from .base import LoggerHook | |||
| @HOOKS.register_module() | |||
| class TextLoggerHook(LoggerHook): | |||
| """Logger hook in text, Output log to both console and local json file. | |||
| Args: | |||
| by_epoch (bool, optional): Whether EpochBasedTrainer is used. | |||
| Default: True. | |||
| interval (int, optional): Logging interval (every k iterations). | |||
| Default: 10. | |||
| ignore_last (bool, optional): Ignore the log of the last iterations in each | |||
| epoch if they are fewer than :attr:`interval`. Default: True. | |||
| reset_flag (bool, optional): Whether to clear the output buffer after | |||
| logging. Default: False. | |||
| out_dir (str): The directory to save logs. If None, use `trainer.work_dir`. | |||
| """ | |||
| def __init__(self, | |||
| by_epoch=True, | |||
| interval=10, | |||
| ignore_last=True, | |||
| reset_flag=False, | |||
| out_dir=None): | |||
| super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, | |||
| by_epoch) | |||
| self.by_epoch = by_epoch | |||
| self.time_sec_tot = 0 | |||
| self.out_dir = out_dir | |||
| self._logged_keys = [] # store the keys that have been logged | |||
| def before_run(self, trainer): | |||
| super(TextLoggerHook, self).before_run(trainer) | |||
| if self.out_dir is None: | |||
| self.out_dir = trainer.work_dir | |||
| if not osp.exists(self.out_dir): | |||
| os.makedirs(self.out_dir) | |||
| trainer.logger.info('Text logs will be saved to {}'.format( | |||
| self.out_dir)) | |||
| self.start_iter = trainer.iter | |||
| self.json_log_path = osp.join(self.out_dir, | |||
| '{}.log.json'.format(trainer.timestamp)) | |||
| if hasattr(trainer, 'meta') and trainer.meta is not None: | |||
| self._dump_log(trainer.meta) | |||
| def _get_max_memory(self, trainer): | |||
| device = getattr(trainer.model, 'output_device', None) | |||
| mem = torch.cuda.max_memory_allocated(device=device) | |||
| mem_mb = torch.tensor([mem / (1024 * 1024)], | |||
| dtype=torch.int, | |||
| device=device) | |||
| _, world_size = get_dist_info() | |||
| if world_size > 1: | |||
| dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) | |||
| return mem_mb.item() | |||
| def _log_info(self, log_dict, trainer): | |||
| if log_dict['mode'] == 'train': | |||
| if isinstance(log_dict['lr'], dict): | |||
| lr_str = [] | |||
| for k, val in log_dict['lr'].items(): | |||
| lr_str.append(f'lr_{k}: {val:.3e}') | |||
| lr_str = ' '.join(lr_str) | |||
| else: | |||
| lr_str = f'lr: {log_dict["lr"]:.3e}' | |||
| if self.by_epoch: | |||
| log_str = f'Epoch [{log_dict["epoch"]}][{log_dict["iter"]}/{len(trainer.data_loader)}]\t' | |||
| else: | |||
| log_str = f'Iter [{log_dict["iter"]}/{trainer.max_iters}]\t' | |||
| log_str += f'{lr_str}, ' | |||
| self._logged_keys.extend(['lr', 'mode', 'iter', 'epoch']) | |||
| if 'time' in log_dict.keys(): | |||
| self.time_sec_tot += (log_dict['time'] * self.interval) | |||
| time_sec_avg = self.time_sec_tot / ( | |||
| trainer.iter - self.start_iter + 1) | |||
| eta_sec = time_sec_avg * (trainer.max_iters - trainer.iter - 1) | |||
| eta_str = str(datetime.timedelta(seconds=int(eta_sec))) | |||
| log_str += f'eta: {eta_str}, ' | |||
| log_str += f'time: {log_dict["time"]:.3f}, data_load_time: {log_dict["data_load_time"]:.3f}, ' | |||
| self._logged_keys.extend([ | |||
| 'time', | |||
| 'data_load_time', | |||
| ]) | |||
| else: | |||
| # val/test time | |||
| # here 1000 is the length of the val dataloader | |||
| # by epoch: Epoch[val] [4][1000] | |||
| # by iter: Iter[val] [1000] | |||
| if self.by_epoch: | |||
| log_str = f'Epoch({log_dict["mode"]}) [{log_dict["epoch"]}][{log_dict["iter"]}]\t' | |||
| else: | |||
| log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' | |||
| self._logged_keys.extend(['mode', 'iter', 'epoch']) | |||
| log_items = [] | |||
| for name, val in log_dict.items(): | |||
| if name in self._logged_keys: | |||
| continue | |||
| if isinstance(val, float): | |||
| val = f'{val:.4f}' | |||
| log_items.append(f'{name}: {val}') | |||
| log_str += ', '.join(log_items) | |||
| trainer.logger.info(log_str) | |||
| def _dump_log(self, log_dict): | |||
| # dump log in json format | |||
| json_log = OrderedDict() | |||
| for k, v in log_dict.items(): | |||
| json_log[k] = self._round_float(v) | |||
| rank, _ = get_dist_info() | |||
| if rank == 0: | |||
| with open(self.json_log_path, 'a+') as f: | |||
| json.dump(json_log, f) | |||
| f.write('\n') | |||
| def _round_float(self, items, ndigits=5): | |||
| if isinstance(items, list): | |||
| return [self._round_float(item) for item in items] | |||
| elif isinstance(items, float): | |||
| return round(items, ndigits) | |||
| else: | |||
| return items | |||
| def log(self, trainer): | |||
| cur_iter = self.get_iter(trainer, inner_iter=True) | |||
| log_dict = OrderedDict( | |||
| mode=trainer.mode, epoch=self.get_epoch(trainer), iter=cur_iter) | |||
| # statistic memory | |||
| if torch.cuda.is_available(): | |||
| log_dict['memory'] = self._get_max_memory(trainer) | |||
| log_dict = dict(log_dict, **trainer.log_buffer.output) | |||
| self._log_info(log_dict, trainer) | |||
| self._dump_log(log_dict) | |||
| return log_dict | |||
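Each call to `_dump_log` appends one JSON object per line, so the resulting `{timestamp}.log.json` file is in JSON-lines format. A minimal sketch of reading it back for offline analysis; the file path below is hypothetical:

```python
import json

# Parse a JSON-lines log written by TextLoggerHook; the path is hypothetical.
with open('./work_dir/20220601_120000.log.json') as f:
    records = [json.loads(line) for line in f if line.strip()]

# Each record corresponds to one logging step; e.g. collect training losses.
train_losses = [r['loss'] for r in records
                if r.get('mode') == 'train' and 'loss' in r]
print(train_losses)
```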
| @@ -0,0 +1,71 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||
| from .builder import HOOKS | |||
| from .hook import Hook | |||
| from .priority import Priority | |||
| @HOOKS.register_module() | |||
| class LrSchedulerHook(Hook): | |||
| """Lr scheduler. | |||
| Args: | |||
| by_epoch (bool): Whether lr changes by epoch | |||
| warmup (dict): warm up config | |||
| """ | |||
| PRIORITY = Priority.VERY_HIGH | |||
| def __init__(self, by_epoch=True, warmup=None) -> None: | |||
| super().__init__() | |||
| self.by_epoch = by_epoch | |||
| if not self.by_epoch: | |||
| raise ValueError('We only support ``by_epoch=True`` now!') | |||
| self.warmup = warmup | |||
| self.warmup_lr_scheduler = None | |||
| def before_run(self, trainer): | |||
| if self.warmup is not None: | |||
| assert isinstance(self.warmup, dict) and 'type' in self.warmup | |||
| self.warmup_lr_scheduler = build_lr_scheduler( | |||
| cfg=self.warmup, | |||
| default_args={'base_scheduler': trainer.lr_scheduler}) | |||
| def get_current_lr(self, trainer): | |||
| import torch | |||
| if isinstance(trainer.optimizer, torch.optim.Optimizer): | |||
| lr = [group['lr'] for group in trainer.optimizer.param_groups] | |||
| elif isinstance(trainer.optimizer, dict): | |||
| lr = dict() | |||
| for name, optim in trainer.optimizer.items(): | |||
| lr[name] = [group['lr'] for group in optim.param_groups] | |||
| else: | |||
| raise RuntimeError( | |||
| 'lr is not applicable because optimizer does not exist.') | |||
| return lr | |||
| def before_train_iter(self, trainer): | |||
| trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) | |||
| def before_train_epoch(self, trainer): | |||
| if self.by_epoch: | |||
| if self.warmup_lr_scheduler is not None: | |||
| self.warmup_lr_scheduler.step() | |||
| else: | |||
| trainer.lr_scheduler.step() | |||
| trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) | |||
| def _get_log_lr(self, trainer): | |||
| cur_lr = self.get_current_lr(trainer) | |||
| # only record lr of the first param group | |||
| if isinstance(cur_lr, list): | |||
| lr = cur_lr[0] | |||
| else: | |||
| assert isinstance(cur_lr, dict) | |||
| lr = {} | |||
| for k, lr_ in cur_lr.items(): | |||
| assert isinstance(lr_, list) | |||
| lr.update({k: lr_[0]}) | |||
| return lr | |||
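For a plain torch optimizer, `get_current_lr` returns one lr per param group; the dict branch only applies when the trainer holds multiple named optimizers. A standalone sketch of the list branch:

```python
import torch

# Mirror of the list branch in LrSchedulerHook.get_current_lr:
# one learning rate per param group of a torch optimizer.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.01)
lr = [group['lr'] for group in optimizer.param_groups]
print(lr)  # [0.01]
```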
| @@ -0,0 +1,37 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from torch.nn.utils import clip_grad | |||
| from .builder import HOOKS | |||
| from .hook import Hook | |||
| from .priority import Priority | |||
| @HOOKS.register_module() | |||
| class OptimizerHook(Hook): | |||
| PRIORITY = Priority.ABOVE_NORMAL | |||
| def __init__(self, grad_clip=None, loss_keys='loss') -> None: | |||
| if isinstance(loss_keys, str): | |||
| loss_keys = [loss_keys] | |||
| assert isinstance(loss_keys, (tuple, list)) | |||
| self.loss_keys = loss_keys | |||
| self.grad_clip = grad_clip | |||
| def clip_grads(self, params, **clip_args): | |||
| params = list( | |||
| filter(lambda p: p.requires_grad and p.grad is not None, params)) | |||
| if len(params) > 0: | |||
| return clip_grad.clip_grad_norm_(params, **clip_args) | |||
| def after_train_iter(self, trainer): | |||
| trainer.optimizer.zero_grad() | |||
| for k in self.loss_keys: | |||
| trainer.train_outputs[k].backward() | |||
| clip_args = self.grad_clip | |||
| if clip_args is not None: | |||
| self.clip_grads(trainer.model.parameters(), **clip_args) | |||
| trainer.optimizer.step() | |||
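The hook condenses a full optimization step into `after_train_iter`: zero the gradients, backpropagate each configured loss key, optionally clip, then step. An equivalent standalone sketch on a toy model (names and values are illustrative):

```python
import torch
from torch import nn
from torch.nn.utils import clip_grad

# Reproduce OptimizerHook.after_train_iter outside the trainer.
model = nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
train_outputs = {'loss': model(torch.randn(2, 4)).mean()}  # trainer.train_outputs

optimizer.zero_grad()
for k in ['loss']:  # self.loss_keys
    train_outputs[k].backward()
# grad_clip such as {'max_norm': 2.0} is forwarded to clip_grad_norm_
params = [p for p in model.parameters()
          if p.requires_grad and p.grad is not None]
if len(params) > 0:
    clip_grad.clip_grad_norm_(params, max_norm=2.0)
optimizer.step()
```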
| @@ -0,0 +1,62 @@ | |||
| # Copyright (c) OpenMMLab. All rights reserved. | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from enum import Enum | |||
| from typing import Union | |||
| class Priority(Enum): | |||
| """Hook priority levels. | |||
| +--------------+------------+ | |||
| | Level | Value | | |||
| +==============+============+ | |||
| | HIGHEST | 0 | | |||
| +--------------+------------+ | |||
| | VERY_HIGH | 10 | | |||
| +--------------+------------+ | |||
| | HIGH | 30 | | |||
| +--------------+------------+ | |||
| | ABOVE_NORMAL | 40 | | |||
| +--------------+------------+ | |||
| | NORMAL | 50 | | |||
| +--------------+------------+ | |||
| | BELOW_NORMAL | 60 | | |||
| +--------------+------------+ | |||
| | LOW | 70 | | |||
| +--------------+------------+ | |||
| | VERY_LOW | 90 | | |||
| +--------------+------------+ | |||
| | LOWEST | 100 | | |||
| +--------------+------------+ | |||
| """ | |||
| HIGHEST = 0 | |||
| VERY_HIGH = 10 | |||
| HIGH = 30 | |||
| ABOVE_NORMAL = 40 | |||
| NORMAL = 50 | |||
| BELOW_NORMAL = 60 | |||
| LOW = 70 | |||
| VERY_LOW = 90 | |||
| LOWEST = 100 | |||
| def get_priority(priority: Union[int, str, Priority]) -> int: | |||
| """Get priority value. | |||
| Args: | |||
| priority (int or str or :obj:`Priority`): Priority. | |||
| Returns: | |||
| int: The priority value. | |||
| """ | |||
| if isinstance(priority, int): | |||
| if priority < 0 or priority > 100: | |||
| raise ValueError('priority must be between 0 and 100') | |||
| return priority | |||
| elif isinstance(priority, Priority): | |||
| return priority.value | |||
| elif isinstance(priority, str): | |||
| return Priority[priority.upper()].value | |||
| else: | |||
| raise TypeError('priority must be an integer or Priority enum value') | |||
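All three accepted representations resolve to the same integer; a quick sketch, assuming the definitions above are in scope:

```python
# get_priority normalizes an int, a Priority member, or a case-insensitive name.
assert get_priority(40) == 40
assert get_priority(Priority.ABOVE_NORMAL) == 40
assert get_priority('above_normal') == 40
```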
| @@ -0,0 +1,8 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .builder import LR_SCHEDULER, build_lr_scheduler | |||
| from .warmup import BaseWarmup, ConstantWarmup, ExponentialWarmup, LinearWarmup | |||
| __all__ = [ | |||
| 'LR_SCHEDULER', 'build_lr_scheduler', 'BaseWarmup', 'ConstantWarmup', | |||
| 'LinearWarmup', 'ExponentialWarmup' | |||
| ] | |||
| @@ -0,0 +1,47 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import inspect | |||
| from modelscope.utils.config import ConfigDict | |||
| from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||
| LR_SCHEDULER = Registry('lr scheduler') | |||
| def build_lr_scheduler(cfg: ConfigDict, default_args: dict = None): | |||
| """ build lr scheduler from given lr scheduler config dict | |||
| Args: | |||
| cfg (:obj:`ConfigDict`): config dict for lr scheduler object. | |||
| default_args (dict, optional): Default initialization arguments. | |||
| """ | |||
| if cfg['type'].lower().endswith('warmup'): | |||
| # build warmup lr scheduler | |||
| if not hasattr(cfg, 'base_scheduler'): | |||
| if default_args is None or ('base_scheduler' not in default_args): | |||
| raise ValueError( | |||
| 'Must provide ``base_scheduler`` which is an instance of ``torch.optim.lr_scheduler._LRScheduler`` ' | |||
| 'to build a warmup lr scheduler.') | |||
| else: | |||
| # build lr scheduler without warmup | |||
| if not hasattr(cfg, 'optimizer'): | |||
| if default_args is None or ('optimizer' not in default_args): | |||
| raise ValueError( | |||
| 'Must provide ``optimizer`` which is an instance of ``torch.optim.Optimizer`` ' | |||
| 'to build an lr scheduler.') | |||
| return build_from_cfg( | |||
| cfg, LR_SCHEDULER, group_key=default_group, default_args=default_args) | |||
| def register_torch_lr_scheduler(): | |||
| from torch.optim import lr_scheduler | |||
| from torch.optim.lr_scheduler import _LRScheduler | |||
| members = inspect.getmembers(lr_scheduler) | |||
| for name, obj in members: | |||
| if inspect.isclass(obj) and issubclass(obj, _LRScheduler): | |||
| LR_SCHEDULER.register_module(module_name=name, module_cls=obj) | |||
| register_torch_lr_scheduler() | |||
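Since `register_torch_lr_scheduler` registers every torch `_LRScheduler` subclass under its class name, a builtin scheduler can be built straight from config. A hedged sketch (config values are illustrative):

```python
import torch

# Build torch's StepLR through the LR_SCHEDULER registry; since 'StepLR'
# does not end with 'warmup', the optimizer must come via default_args.
model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
cfg = ConfigDict({'type': 'StepLR', 'step_size': 2})
lr_scheduler = build_lr_scheduler(cfg, default_args={'optimizer': optimizer})
```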
| @@ -0,0 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .base import BaseWarmup | |||
| from .warmup import ConstantWarmup, ExponentialWarmup, LinearWarmup | |||
| __all__ = ['BaseWarmup', 'ConstantWarmup', 'LinearWarmup', 'ExponentialWarmup'] | |||
| @@ -0,0 +1,75 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from torch.optim.lr_scheduler import _LRScheduler | |||
| class BaseWarmup(_LRScheduler): | |||
| """Base warmup scheduler | |||
| Args: | |||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||
| warmup_iters (int | list): Warmup iterations | |||
| last_epoch (int): The index of last epoch. | |||
| """ | |||
| def __init__(self, | |||
| base_scheduler, | |||
| warmup_iters, | |||
| last_epoch=-1, | |||
| verbose=False): | |||
| self.base_scheduler = base_scheduler | |||
| self.warmup_iters = warmup_iters | |||
| optimizer = self.base_scheduler.optimizer | |||
| self._is_init_step = True | |||
| super(BaseWarmup, self).__init__( | |||
| optimizer, last_epoch=last_epoch, verbose=verbose) | |||
| def get_lr(self): | |||
| return self.base_scheduler.get_lr() | |||
| def state_dict(self): | |||
| return self.base_scheduler.state_dict() | |||
| def load_state_dict(self, state_dict): | |||
| self.base_scheduler.load_state_dict(state_dict) | |||
| def scale(self): | |||
| """Scale the learning rates. | |||
| """ | |||
| scale_value = self.get_warmup_scale(self.base_scheduler._step_count | |||
| - 1) | |||
| if isinstance(scale_value, (int, float)): | |||
| scale_value = [ | |||
| scale_value for _ in range(len(self.optimizer.param_groups)) | |||
| ] | |||
| else: | |||
| assert isinstance( | |||
| scale_value, (list, tuple)), 'Only support list or tuple type!' | |||
| assert len(scale_value) == len( | |||
| self.optimizer.param_groups), ('Size mismatch {} != {}'.format( | |||
| len(scale_value), len(self.optimizer.param_groups))) | |||
| for i, group in enumerate(self.optimizer.param_groups): | |||
| group['lr'] *= scale_value[i] | |||
| def step(self, epoch=None): | |||
| """ | |||
| When ``self.base_scheduler._step_count`` does not exceed ``self.warmup_iters``, restore the base lrs and multiply them by the warmup scale. | |||
| """ | |||
| if self.base_scheduler._step_count > self.warmup_iters: | |||
| return self.base_scheduler.step(epoch=epoch) | |||
| for group, lr in zip(self.optimizer.param_groups, self.base_lrs): | |||
| group['lr'] = lr | |||
| # `base_scheduler` has done step() at init when build | |||
| if self._is_init_step: | |||
| self._is_init_step = False | |||
| else: | |||
| self.base_scheduler.step(epoch=epoch) | |||
| self.scale() | |||
| def get_warmup_scale(self, cur_iter): | |||
| """Compute the warmup scale factor at ``cur_iter``. Subclasses must override this method. | |||
| """ | |||
| raise NotImplementedError | |||
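A concrete warmup only has to supply `get_warmup_scale`; everything else (restoring base lrs, stepping, scaling) lives in `BaseWarmup.step`. A hypothetical square-root warmup as an illustration:

```python
class SqrtWarmup(BaseWarmup):
    """Hypothetical warmup whose scale grows with the square root of progress."""

    def get_warmup_scale(self, cur_iter):
        if cur_iter >= self.warmup_iters:
            return 1.0
        return ((cur_iter + 1) / self.warmup_iters) ** 0.5
```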
| @@ -0,0 +1,79 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from modelscope.trainers.lrscheduler.builder import LR_SCHEDULER | |||
| from .base import BaseWarmup | |||
| @LR_SCHEDULER.register_module() | |||
| class ConstantWarmup(BaseWarmup): | |||
| """Linear warmup scheduler. | |||
| Args: | |||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||
| warmup_ratio (float): Lr used at the warmup stage equals warmup_ratio * initial_lr | |||
| warmup_iters (int | list): Warmup iterations | |||
| last_epoch (int): The index of last epoch. | |||
| """ | |||
| def __init__(self, | |||
| base_scheduler, | |||
| warmup_iters, | |||
| warmup_ratio=0.1, | |||
| last_epoch=-1): | |||
| self.warmup_ratio = warmup_ratio | |||
| super(ConstantWarmup, self).__init__( | |||
| base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||
| def get_warmup_scale(self, cur_iter): | |||
| if cur_iter >= self.warmup_iters: | |||
| return 1.0 | |||
| return self.warmup_ratio | |||
| @LR_SCHEDULER.register_module() | |||
| class LinearWarmup(BaseWarmup): | |||
| """Linear warmup scheduler. | |||
| Args: | |||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||
| warmup_iters (int | list): Warmup iterations | |||
| warmup_ratio (float): Lr used at the beginning of warmup equals warmup_ratio * initial_lr | |||
| last_epoch (int): The index of last epoch. | |||
| """ | |||
| def __init__(self, | |||
| base_scheduler, | |||
| warmup_iters, | |||
| warmup_ratio=0.1, | |||
| last_epoch=-1): | |||
| self.warmup_ratio = warmup_ratio | |||
| super(LinearWarmup, self).__init__( | |||
| base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||
| def get_warmup_scale(self, cur_iter): | |||
| k = (1 - cur_iter / self.warmup_iters) * (1 - self.warmup_ratio) | |||
| return 1 - k | |||
| @LR_SCHEDULER.register_module() | |||
| class ExponentialWarmup(BaseWarmup): | |||
| """Exponential warmup scheduler. | |||
| Args: | |||
| base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type | |||
| warmup_iters (int | list): Warmup iterations | |||
| warmup_ratio (float): Lr used at the beginning of warmup equals warmup_ratio * initial_lr | |||
| last_epoch (int): The index of last epoch. | |||
| """ | |||
| def __init__(self, | |||
| base_scheduler, | |||
| warmup_iters, | |||
| warmup_ratio=0.1, | |||
| last_epoch=-1): | |||
| self.warmup_ratio = warmup_ratio | |||
| super(ExponentialWarmup, self).__init__( | |||
| base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) | |||
| def get_warmup_scale(self, cur_iter): | |||
| k = self.warmup_ratio**(1 - cur_iter / self.warmup_iters) | |||
| return k | |||
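At iteration t the three schedules scale the base lr by warmup_ratio (constant), 1 - (1 - t / warmup_iters) * (1 - warmup_ratio) (linear), and warmup_ratio ** (1 - t / warmup_iters) (exponential), all reaching 1.0 at t = warmup_iters. A standalone check of the curves:

```python
# Scale factors of the three get_warmup_scale implementations, computed
# standalone for warmup_ratio=0.1 and warmup_iters=4.
warmup_ratio, warmup_iters = 0.1, 4
for t in range(warmup_iters + 1):
    constant = 1.0 if t >= warmup_iters else warmup_ratio
    linear = 1 - (1 - t / warmup_iters) * (1 - warmup_ratio)
    exponential = warmup_ratio ** (1 - t / warmup_iters)
    print(t, round(constant, 3), round(linear, 3), round(exponential, 3))
# t=0 -> 0.1 for all three; t=4 -> 1.0 for all three.
```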
| @@ -0,0 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .builder import OPTIMIZERS, build_optimizer | |||
| __all__ = ['OPTIMIZERS', 'build_optimizer'] | |||
| @@ -0,0 +1,39 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import inspect | |||
| import torch | |||
| from modelscope.utils.config import ConfigDict | |||
| from modelscope.utils.registry import Registry, build_from_cfg, default_group | |||
| OPTIMIZERS = Registry('optimizer') | |||
| def build_optimizer(model: torch.nn.Module, | |||
| cfg: ConfigDict, | |||
| default_args: dict = None): | |||
| """ build optimizer from optimizer config dict | |||
| Args: | |||
| cfg (:obj:`ConfigDict`): config dict for optimizer object. | |||
| default_args (dict, optional): Default initialization arguments. | |||
| """ | |||
| if hasattr(model, 'module'): | |||
| model = model.module | |||
| cfg.params = model.parameters() | |||
| return build_from_cfg( | |||
| cfg, OPTIMIZERS, group_key=default_group, default_args=default_args) | |||
| def register_torch_optimizers(): | |||
| for name, module in inspect.getmembers(torch.optim): | |||
| if name.startswith('__'): | |||
| continue | |||
| if inspect.isclass(module) and issubclass(module, | |||
| torch.optim.Optimizer): | |||
| OPTIMIZERS.register_module( | |||
| default_group, module_name=name, module_cls=module) | |||
| register_torch_optimizers() | |||
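`build_optimizer` injects `model.parameters()` into the config before dispatching to the registry, so any torch optimizer registered above can be built by class name. A hedged sketch (config values are illustrative):

```python
import torch

# Build torch's SGD through the OPTIMIZERS registry; build_optimizer
# sets cfg.params = model.parameters() before instantiation.
model = torch.nn.Linear(4, 2)
cfg = ConfigDict({'type': 'SGD', 'lr': 0.01, 'momentum': 0.9})
optimizer = build_optimizer(model, cfg=cfg)
```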
| @@ -0,0 +1,703 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path | |||
| import random | |||
| import time | |||
| from distutils.version import LooseVersion | |||
| from functools import partial | |||
| from typing import Callable, List, Optional, Tuple, Union | |||
| import numpy as np | |||
| import torch | |||
| from addict import Dict | |||
| from torch import distributed as dist | |||
| from torch import nn | |||
| from torch.utils.data import DataLoader, Dataset | |||
| from torch.utils.data.distributed import DistributedSampler | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metrics import build_metric, task_default_metrics | |||
| from modelscope.models.base import Model | |||
| from modelscope.models.base_torch import TorchModel | |||
| from modelscope.msdatasets.ms_dataset import MsDataset | |||
| from modelscope.preprocessors import build_preprocessor | |||
| from modelscope.preprocessors.base import Preprocessor | |||
| from modelscope.task_datasets import TorchTaskDataset, build_task_dataset | |||
| from modelscope.trainers.hooks.builder import HOOKS | |||
| from modelscope.trainers.hooks.priority import Priority, get_priority | |||
| from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||
| from modelscope.trainers.optimizer.builder import build_optimizer | |||
| from modelscope.utils.config import ConfigDict | |||
| from modelscope.utils.constant import Hubs, ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from modelscope.utils.registry import build_from_cfg | |||
| from modelscope.utils.tensor_utils import torch_default_data_collator | |||
| from modelscope.utils.torch_utils import get_dist_info | |||
| from .base import BaseTrainer | |||
| from .builder import TRAINERS | |||
| from .hooks.hook import Hook | |||
| @TRAINERS.register_module() | |||
| class EpochBasedTrainer(BaseTrainer): | |||
| """Epoch based Trainer, a training helper for PyTorch. | |||
| Args: | |||
| cfg_file(str): The local config file. | |||
| model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir | |||
| or a model id. If model is None, build_model method will be called. | |||
| data_collator (`Callable`, *optional*): | |||
| The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. | |||
| train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*): | |||
| The dataset to use for training. | |||
| Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a | |||
| distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a | |||
| `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will | |||
| manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally | |||
| sets the seed of the RNGs used. | |||
| eval_dataset (`torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation. | |||
| preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor. | |||
| NOTE: If the dataset has already been preprocessed by the user's custom code before being fed into this | |||
| trainer, this parameter should be None, and the 'preprocessor' key should be removed from the cfg_file. | |||
| Otherwise the preprocessor will be instantiated from the cfg_file or assigned from this parameter, and | |||
| the preprocessing will be executed every time the dataset's __getitem__ is called. | |||
| optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple | |||
| containing the optimizer and the scheduler to use. | |||
| max_epochs (int, optional): Total training epochs. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| model: Optional[Union[TorchModel, nn.Module, str]] = None, | |||
| cfg_file: Optional[str] = None, | |||
| arg_parse_fn: Optional[Callable] = None, | |||
| data_collator: Optional[Callable] = None, | |||
| train_dataset: Optional[Dataset] = None, | |||
| eval_dataset: Optional[Dataset] = None, | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| optimizers: Tuple[torch.optim.Optimizer, | |||
| torch.optim.lr_scheduler._LRScheduler] = (None, | |||
| None), | |||
| **kwargs): | |||
| if isinstance(model, str): | |||
| if os.path.exists(model): | |||
| self.model_dir = model if os.path.isdir( | |||
| model) else os.path.dirname(model) | |||
| else: | |||
| self.model_dir = snapshot_download(model) | |||
| cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION) | |||
| self.model = self.build_model() | |||
| else: | |||
| assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' | |||
| assert isinstance( | |||
| model, | |||
| (TorchModel, nn.Module | |||
| )), 'model should be either a str, TorchModel or nn.Module.' | |||
| self.model_dir = os.path.dirname(cfg_file) | |||
| self.model = model | |||
| super().__init__(cfg_file, arg_parse_fn) | |||
| if 'work_dir' in kwargs: | |||
| self.work_dir = kwargs['work_dir'] | |||
| else: | |||
| self.work_dir = self.cfg.train.get('work_dir', './work_dir') | |||
| self.preprocessor = None | |||
| if isinstance(preprocessor, Preprocessor): | |||
| self.preprocessor = preprocessor | |||
| elif hasattr(self.cfg, 'preprocessor'): | |||
| self.preprocessor = self.build_preprocessor() | |||
| # TODO @wenmeng.zwm add data collator option | |||
| # TODO how to fill device option? | |||
| self.device = int( | |||
| os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None | |||
| self.train_dataset = self.to_task_dataset( | |||
| train_dataset, mode='train', preprocessor=self.preprocessor) | |||
| self.eval_dataset = self.to_task_dataset( | |||
| eval_dataset, mode='eval', preprocessor=self.preprocessor) | |||
| self.data_collator = data_collator if data_collator is not None else torch_default_data_collator | |||
| self.metrics = self.get_metrics() | |||
| self.optimizers = optimizers | |||
| self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) | |||
| self._mode = 'train' | |||
| self._hooks: List[Hook] = [] | |||
| self._epoch = 0 | |||
| self._iter = 0 | |||
| self._inner_iter = 0 | |||
| if 'max_epochs' not in kwargs: | |||
| assert hasattr( | |||
| self.cfg.train, | |||
| 'max_epochs'), 'max_epochs is missing in configuration file' | |||
| self._max_epochs = self.cfg.train.max_epochs | |||
| else: | |||
| self._max_epochs = kwargs['max_epochs'] | |||
| # TODO @wenmeng.zwm add seed init fn | |||
| self._seed = 0 | |||
| self._dist = get_dist_info()[1] > 1 | |||
| @property | |||
| def mode(self): | |||
| return self._mode | |||
| @property | |||
| def hooks(self) -> List[Hook]: | |||
| """list[:obj:`Hook`]: A list of registered hooks.""" | |||
| return self._hooks | |||
| @property | |||
| def epoch(self) -> int: | |||
| """int: Current epoch.""" | |||
| return self._epoch | |||
| @property | |||
| def iter(self) -> int: | |||
| """int: Current iteration.""" | |||
| return self._iter | |||
| @property | |||
| def inner_iter(self) -> int: | |||
| """int: Iteration in an epoch.""" | |||
| return self._inner_iter | |||
| @property | |||
| def max_epochs(self): | |||
| """int: Maximum training epochs.""" | |||
| return self._max_epochs | |||
| @property | |||
| def max_iters(self): | |||
| """int: Maximum training iterations.""" | |||
| return self._max_epochs * len(self.data_loader) | |||
| def to_task_dataset(self, | |||
| datasets: Tuple[Dataset, List[Dataset]], | |||
| mode: str, | |||
| preprocessor: Optional[Preprocessor] = None): | |||
| """Build the task specific dataset processor for this trainer. | |||
| Returns: The task dataset processor for the task. If no processor is registered for this model type and | |||
| task, the default TorchTaskDataset will be returned. | |||
| """ | |||
| try: | |||
| if not datasets: | |||
| return datasets | |||
| if isinstance(datasets, TorchTaskDataset): | |||
| return datasets | |||
| task_dataset = build_task_dataset( | |||
| ConfigDict({ | |||
| **self.cfg.model, | |||
| 'mode': mode, | |||
| 'preprocessor': preprocessor, | |||
| 'datasets': datasets, | |||
| }), getattr(self.cfg, 'task', None)) | |||
| return task_dataset | |||
| except Exception: | |||
| if isinstance(datasets, (List, Tuple)) or preprocessor is not None: | |||
| return TorchTaskDataset( | |||
| datasets, | |||
| mode=mode, | |||
| preprocessor=preprocessor, | |||
| **(self.cfg.model if hasattr(self.cfg, 'model') else {})) | |||
| else: | |||
| return datasets | |||
| def build_preprocessor(self) -> Preprocessor: | |||
| """Build the preprocessor. | |||
| User can override this method to implement custom logic. | |||
| Returns: The preprocessor instance. | |||
| """ | |||
| # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor | |||
| # when they are different ones in training and evaluation | |||
| cfg = ConfigDict({ | |||
| **getattr(self.cfg, 'preprocessor'), 'model_dir': | |||
| self.model_dir | |||
| }) | |||
| return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||
| def get_metrics(self) -> List[str]: | |||
| """Get the metric class types. | |||
| The first choice will be the metrics configured in the config file; if not found, the default metrics | |||
| for the task will be used. | |||
| If no metrics are found and the eval dataset exists, the method will raise an error. | |||
| Returns: The metric types. | |||
| """ | |||
| metrics = self.cfg.evaluation.metrics if hasattr( | |||
| self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, | |||
| 'metrics') else None | |||
| metrics = metrics if metrics is not None else task_default_metrics.get( | |||
| self.cfg.task) | |||
| if metrics is None and self.eval_dataset is not None: | |||
| raise ValueError( | |||
| f'Metrics are needed in evaluation, please try to either ' | |||
| f'add metrics in configuration.json or add the default metric for {self.cfg.task}.' | |||
| ) | |||
| if isinstance(metrics, str): | |||
| metrics = [metrics] | |||
| return metrics | |||
| def train(self, *args, **kwargs): | |||
| self.model.train() | |||
| self._mode = 'train' | |||
| if self.train_dataset is None: | |||
| self.train_dataloader = self.get_train_dataloader() | |||
| else: | |||
| self.train_dataloader = self._build_dataloader_with_dataset( | |||
| self.train_dataset, **self.cfg.train.get('dataloader', {})) | |||
| self.data_loader = self.train_dataloader | |||
| self.register_optimizers_hook() | |||
| self.register_hook_from_cfg(self.cfg.train.hooks) | |||
| self.train_loop(self.train_dataloader) | |||
| def evaluate(self, checkpoint_path=None): | |||
| self.model.eval() | |||
| self._mode = 'val' | |||
| if self.eval_dataset is None: | |||
| self.eval_dataloader = self.get_eval_data_loader() | |||
| else: | |||
| self.eval_dataloader = self._build_dataloader_with_dataset( | |||
| self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) | |||
| self.data_loader = self.eval_dataloader | |||
| metric_classes = [build_metric(metric) for metric in self.metrics] | |||
| self.evaluation_loop(self.eval_dataloader, checkpoint_path, | |||
| metric_classes) | |||
| metric_values = {} | |||
| for metric_cls in metric_classes: | |||
| metric_values.update(metric_cls.evaluate()) | |||
| return metric_values | |||
| def build_model(self) -> Union[nn.Module, TorchModel]: | |||
| """ Instantiate a pytorch model and return. | |||
| By default, we will create a model using config from configuration file. You can | |||
| subclass and override this method in a subclass. | |||
| """ | |||
| # TODO temp implementation, waiting for @zhangzhicheng | |||
| model = Model.from_pretrained(self.model_dir) | |||
| if not isinstance(model, nn.Module) and hasattr(model, 'model'): | |||
| return model.model | |||
| return model | |||
| def collate_fn(self, data): | |||
| """Prepare the input just before the forward function. | |||
| This method will move the tensors to the right device. | |||
| Usually this method does not need to be overridden. | |||
| Args: | |||
| data: The data out of the dataloader. | |||
| Returns: The processed data. | |||
| """ | |||
| if isinstance(data, dict): | |||
| return type(data)({k: self.collate_fn(v) for k, v in data.items()}) | |||
| elif isinstance(data, (tuple, np.ndarray, list)): | |||
| return type(data)(self.collate_fn(v) for v in data) | |||
| elif isinstance(data, torch.Tensor) and self.device is not None: | |||
| kwargs = dict(device=self.device) | |||
| return data.to(**kwargs) | |||
| return data | |||
| def train_step(self, model, inputs): | |||
| """ Perform a training step on a batch of inputs. | |||
| Subclass and override to inject custom behavior. | |||
| Args: | |||
| model (`TorchModel`): The model to train. | |||
| inputs (`Dict[str, Union[torch.Tensor, Any]]`): | |||
| The inputs and targets of the model. | |||
| The dictionary will be unpacked before being fed to the model. Most models expect the targets under the | |||
| argument `labels`. Check your model's documentation for all accepted arguments. | |||
| Return: | |||
| `torch.Tensor`: The tensor with training loss on this batch. | |||
| """ | |||
| # EvaluationHook will do evaluate and change mode to val, return to train mode | |||
| # TODO: find more pretty way to change mode | |||
| model.train() | |||
| self._mode = 'train' | |||
| inputs = self.collate_fn(inputs) | |||
| if isinstance(inputs, dict): | |||
| train_outputs = model.forward(**inputs) | |||
| else: | |||
| train_outputs = model.forward(inputs) | |||
| if not isinstance(train_outputs, dict): | |||
| raise TypeError( | |||
| '"model.train_step()" and "model.val_step()" must return a dict' | |||
| ) | |||
| # add model output info to log | |||
| if 'log_vars' not in train_outputs: | |||
| default_keys_pattern = ['loss'] | |||
| match_keys = set([]) | |||
| for key_p in default_keys_pattern: | |||
| match_keys.update( | |||
| [key for key in train_outputs.keys() if key_p in key]) | |||
| log_vars = {} | |||
| for key in match_keys: | |||
| value = train_outputs.get(key, None) | |||
| if value is not None: | |||
| if dist.is_available() and dist.is_initialized(): | |||
| value = value.data.clone() | |||
| dist.all_reduce(value.div_(dist.get_world_size())) | |||
| log_vars.update({key: value.item()}) | |||
| self.log_buffer.update(log_vars) | |||
| else: | |||
| self.log_buffer.update(train_outputs['log_vars']) | |||
| self.train_outputs = train_outputs | |||
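`train_step` therefore imposes a simple contract on the model: `forward` must return a dict, and unless the model supplies its own `log_vars`, every key containing 'loss' is reduced across ranks and logged. A toy model satisfying that contract:

```python
import torch
from torch import nn

# A toy model obeying the output contract that train_step expects:
# forward returns a dict whose loss-like entries contain 'loss' in the key.
class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 1)

    def forward(self, feats, labels):
        loss = ((self.linear(feats).squeeze(-1) - labels) ** 2).mean()
        return {'loss': loss, 'mse_loss': loss.detach()}

outputs = ToyModel()(feats=torch.randn(2, 4), labels=torch.zeros(2))
assert isinstance(outputs, dict) and 'loss' in outputs
```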
| def prediction_step(self, model, inputs): | |||
| """ Perform forward step by `model` using `inputs`. | |||
| Args: | |||
| model (`TorchModel`): The model to evaluate. | |||
| inputs (`Dict[str, Union[torch.Tensor, Any]]`): | |||
| The inputs and targets of the model. | |||
| The dictionary will be unpacked before being fed to the model. Most models expect the targets under the | |||
| argument `labels`. Check your model's documentation for all accepted arguments. | |||
| Return: | |||
| Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, | |||
| logits and labels (each being optional). | |||
| """ | |||
| raise NotImplementedError | |||
| def get_train_dataloader(self): | |||
| """ Builder torch dataloader for training. | |||
| We provide a reasonable default that works well. If you want to use something else, you can change | |||
| the config for data.train in configuration file, or subclass and override this method | |||
| (or `get_train_dataloader` in a subclass. | |||
| """ | |||
| train_data = self.cfg.dataset.train | |||
| if self.train_dataset is None: | |||
| self.train_dataset = self.build_dataset(train_data, mode='train') | |||
| data_loader = self._build_dataloader_with_dataset( | |||
| self.train_dataset, **self.cfg.train.get('dataloader', {})) | |||
| return data_loader | |||
| def get_eval_data_loader(self): | |||
| """ Builder torch dataloader for evaluation. | |||
| We provide a reasonable default that works well. If you want to use something else, you can change | |||
| the config for dataset.eval in configuration file, or subclass and override this method in a subclass. | |||
| pass | |||
| """ | |||
| val_data = self.cfg.dataset.val | |||
| if self.eval_dataset is None: | |||
| self.eval_dataset = self.build_dataset(val_data, mode='eval') | |||
| batch_size = self.cfg.evaluation.batch_size | |||
| workers = self.cfg.evaluation.workers | |||
| shuffle = self.cfg.evaluation.get('shuffle', False) | |||
| data_loader = self._build_dataloader_with_dataset( | |||
| self.eval_dataset, | |||
| batch_size_per_gpu=batch_size, | |||
| workers_per_gpu=workers, | |||
| shuffle=shuffle, | |||
| dist=self._dist, | |||
| seed=self._seed, | |||
| persistent_workers=True, | |||
| ) | |||
| return data_loader | |||
| def build_dataset(self, data_cfg, mode): | |||
| """ Build torch dataset object using data config | |||
| """ | |||
| dataset = MsDataset.load( | |||
| dataset_name=data_cfg.name, | |||
| split=data_cfg.split, | |||
| subset_name=data_cfg.subset_name if hasattr( | |||
| data_cfg, 'subset_name') else None, | |||
| hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | |||
| ) | |||
| torch_dataset = dataset.to_torch_dataset( | |||
| preprocessors=self.preprocessor, ) | |||
| dataset = self.to_task_dataset( | |||
| torch_dataset, mode, preprocessor=self.preprocessor) | |||
| return dataset | |||
| def create_optimizer_and_scheduler(self): | |||
| """ Create optimizer and lr scheduler | |||
| We provide a default implementation, if you want to customize your own optimizer | |||
| and lr scheduler, you can either pass a tuple through trainer init function or | |||
| subclass this class and override this method. | |||
| """ | |||
| optimizer, lr_scheduler = self.optimizers | |||
| if optimizer is None: | |||
| optimizer_cfg = self.cfg.train.get('optimizer', None) | |||
| else: | |||
| optimizer_cfg = None | |||
| optim_options = {} | |||
| if optimizer_cfg is not None: | |||
| optim_options = optimizer_cfg.pop('options', {}) | |||
| optimizer = build_optimizer(self.model, cfg=optimizer_cfg) | |||
| if lr_scheduler is None: | |||
| lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) | |||
| else: | |||
| lr_scheduler_cfg = None | |||
| lr_options = {} | |||
| if lr_scheduler_cfg is not None: | |||
| assert optimizer is not None | |||
| lr_options = lr_scheduler_cfg.pop('options', {}) | |||
| lr_scheduler = build_lr_scheduler( | |||
| cfg=lr_scheduler_cfg, default_args={'optimizer': optimizer}) | |||
| self.optimizer = optimizer | |||
| self.lr_scheduler = lr_scheduler | |||
| return self.optimizer, self.lr_scheduler, optim_options, lr_options | |||
| def register_optimizers_hook(self): | |||
| """ Register optimizer hook and lr scheduler hook. | |||
| """ | |||
| optimizer, lr_scheduler = self.optimizers | |||
| opti_error_msg = 'optimizers should be a tuple of `torch.optim.Optimizer`'\ | |||
| ' and `torch.optim.lr_scheduler._LRScheduler`' | |||
| if optimizer is not None: | |||
| assert isinstance(optimizer, torch.optim.Optimizer), opti_error_msg | |||
| if lr_scheduler is not None: | |||
| assert isinstance( | |||
| lr_scheduler, | |||
| torch.optim.lr_scheduler._LRScheduler), opti_error_msg | |||
| _, _, optim_options, lr_options = self.create_optimizer_and_scheduler() | |||
| lr_hook = dict(type='LrSchedulerHook', **lr_options) | |||
| optim_hook = dict(type='OptimizerHook', **optim_options) | |||
| self.register_hook_from_cfg([lr_hook, optim_hook]) | |||
| def _build_dataloader_with_dataset(self, | |||
| dataset: Dataset, | |||
| batch_size_per_gpu: int, | |||
| workers_per_gpu: int, | |||
| dist: bool = False, | |||
| shuffle: bool = True, | |||
| seed: int = 0, | |||
| persistent_workers=False, | |||
| **kwargs) -> DataLoader: | |||
| """Build dataloader using input dataset and cfg. Used by `EpochBasedTrainer.train()` | |||
| and `EpochBasedTrainer.evaluate()`. | |||
| In distributed training, each GPU/process has a dataloader. | |||
| In non-distributed training, there is only one dataloader for all GPUs. | |||
| Args: | |||
| dataset (Dataset): A PyTorch dataset. | |||
| batch_size_per_gpu (int): Number of training samples on each GPU, i.e., | |||
| batch size of each GPU. | |||
| workers_per_gpu (int): How many subprocesses to use for data loading | |||
| for each GPU. | |||
| dist (bool): Distributed training/test or not. Default: False. | |||
| shuffle (bool): Whether to shuffle the data at every epoch. | |||
| Default: True. | |||
| seed (int, Optional): Seed to be used. Default: 0. | |||
| persistent_workers (bool): If True, the data loader will not shutdown | |||
| the worker processes after a dataset has been consumed once. | |||
| This allows to maintain the workers `Dataset` instances alive. | |||
| This argument is only valid when PyTorch>=1.7.0. Default: False. | |||
| kwargs: any keyword argument to be used to initialize DataLoader | |||
| Returns: | |||
| DataLoader: A PyTorch dataloader. | |||
| """ | |||
| rank, world_size = get_dist_info() | |||
| # Whether distributed or not, `batch_size` of the dataloader is the | |||
| # number of samples on each GPU/process, and each GPU/process spawns | |||
| # its own loading workers. | |||
| batch_size = batch_size_per_gpu | |||
| num_workers = workers_per_gpu | |||
| if dist: | |||
| sampler = DistributedSampler( | |||
| dataset, world_size, rank, shuffle=shuffle, seed=seed) | |||
| else: | |||
| sampler = None | |||
| batch_sampler = None | |||
| init_fn = partial( | |||
| worker_init_fn, num_workers=num_workers, rank=rank, | |||
| seed=seed) if seed is not None else None | |||
| if LooseVersion(torch.__version__) >= LooseVersion('1.7.0'): | |||
| kwargs['persistent_workers'] = persistent_workers | |||
| elif persistent_workers is True: | |||
| self.logger.warning( | |||
| 'persistent_workers is invalid because your pytorch ' | |||
| 'version is lower than 1.7.0') | |||
| data_loader = DataLoader( | |||
| dataset, | |||
| batch_size=batch_size, | |||
| sampler=sampler, | |||
| num_workers=num_workers, | |||
| batch_sampler=batch_sampler, | |||
| collate_fn=self.data_collator, | |||
| pin_memory=kwargs.pop('pin_memory', False), | |||
| worker_init_fn=init_fn, | |||
| **kwargs) | |||
| return data_loader | |||
| def train_loop(self, data_loader): | |||
| """ Training loop used by `EpochBasedTrainer.train()` | |||
| """ | |||
| self.invoke_hook('before_run') | |||
| self._epoch = 0 | |||
| kwargs = {} | |||
| for _ in range(self._epoch, self._max_epochs): | |||
| self.invoke_hook('before_train_epoch') | |||
| time.sleep(2) # Prevent possible deadlock during epoch transition | |||
| for i, data_batch in enumerate(data_loader): | |||
| self.data_batch = data_batch | |||
| self._inner_iter = i | |||
| self.invoke_hook('before_train_iter') | |||
| self.train_step(self.model, data_batch, **kwargs) | |||
| self.invoke_hook('after_train_iter') | |||
| del self.data_batch | |||
| self._iter += 1 | |||
| self.invoke_hook('after_train_epoch') | |||
| self._epoch += 1 | |||
| time.sleep(1) # wait for some hooks like loggers to finish | |||
| self.invoke_hook('after_run') | |||
| def evaluation_loop(self, data_loader, checkpoint_path, metric_classes): | |||
| """ Evaluation loop used by `EpochBasedTrainer.evaluate()`. | |||
| """ | |||
| if self._dist: | |||
| from modelscope.trainers.utils.inference import multi_gpu_test | |||
| multi_gpu_test( | |||
| self.model, | |||
| data_loader, | |||
| tmpdir=None, | |||
| gpu_collect=False, | |||
| data_collate_fn=self.collate_fn, | |||
| metric_classes=metric_classes) | |||
| else: | |||
| from modelscope.trainers.utils.inference import single_gpu_test | |||
| single_gpu_test( | |||
| self.model, | |||
| data_loader, | |||
| data_collate_fn=self.collate_fn, | |||
| metric_classes=metric_classes) | |||
| def register_hook(self, hook: Hook) -> None: | |||
| """Register a hook into the hook list. | |||
| The hook will be inserted into a priority queue, with the specified | |||
| priority (See :class:`Priority` for details of priorities). | |||
| For hooks with the same priority, they will be triggered in the same | |||
| order as they are registered. | |||
| Args: | |||
| hook (:obj:`Hook`): The hook to be registered. | |||
| """ | |||
| assert isinstance(hook, Hook) | |||
| # insert the hook to a sorted list | |||
| inserted = False | |||
| for i in range(len(self._hooks) - 1, -1, -1): | |||
| if get_priority(hook.PRIORITY) > get_priority( | |||
| self._hooks[i].PRIORITY): | |||
| self._hooks.insert(i + 1, hook) | |||
| inserted = True | |||
| break | |||
| if not inserted: | |||
| self._hooks.insert(0, hook) | |||
| def register_hook_from_cfg(self, hook_cfg: List) -> None: | |||
| """Register hooks from a list of hook configs. | |||
| Args: | |||
| hook_cfg (list): A list of hook configs. Each config is a dict which | |||
| should have at least a 'type' key indicating the hook type. | |||
| Note: | |||
| The specific hook class to register should not use 'type' and | |||
| 'priority' arguments during initialization. | |||
| """ | |||
| hook_cfg = hook_cfg.copy() | |||
| assert isinstance(hook_cfg, list) | |||
| for cfg_i in hook_cfg: | |||
| hook = build_from_cfg(cfg_i, HOOKS) | |||
| self.register_hook(hook) | |||
| def invoke_hook(self, fn_name: str) -> None: | |||
| """Call all hooks. | |||
| Args: | |||
| fn_name (str): The function name in each hook to be called, such as | |||
| "before_train_epoch". | |||
| """ | |||
| for hook in self._hooks: | |||
| getattr(hook, fn_name)(self) | |||
| def get_hook_info(self) -> str: | |||
| # Get hooks info in each stage | |||
| stage_hook_map = {stage: [] for stage in Hook.stages} | |||
| for hook in self.hooks: | |||
| try: | |||
| priority = Priority(hook.priority).name # type: ignore | |||
| except ValueError: | |||
| priority = hook.priority # type: ignore | |||
| classname = hook.__class__.__name__ | |||
| hook_info = f'({priority:<12}) {classname:<35}' | |||
| for trigger_stage in hook.get_triggered_stages(): | |||
| stage_hook_map[trigger_stage].append(hook_info) | |||
| stage_hook_infos = [] | |||
| for stage in Hook.stages: | |||
| hook_infos = stage_hook_map[stage] | |||
| if len(hook_infos) > 0: | |||
| info = f'{stage}:\n' | |||
| info += '\n'.join(hook_infos) | |||
| info += '\n -------------------- ' | |||
| stage_hook_infos.append(info) | |||
| return '\n'.join(stage_hook_infos) | |||
| def worker_init_fn(worker_id, num_workers, rank, seed): | |||
| # The seed of each worker equals | |||
| # num_workers * rank + worker_id + user_seed | |||
| worker_seed = num_workers * rank + worker_id + seed | |||
| np.random.seed(worker_seed) | |||
| random.seed(worker_seed) | |||
| torch.manual_seed(worker_seed) | |||
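Putting the pieces together, a heavily hedged usage sketch: the model id, datasets and work dir below are placeholders, and the configuration.json inside the downloaded model dir is assumed to provide `cfg.train` (max_epochs, dataloader, optimizer, lr_scheduler, hooks) and `cfg.evaluation`:

```python
# Hypothetical end-to-end usage of EpochBasedTrainer; 'damo/some-model',
# my_train_dataset and my_eval_dataset are placeholders.
trainer = EpochBasedTrainer(
    model='damo/some-model',
    train_dataset=my_train_dataset,
    eval_dataset=my_eval_dataset,
    work_dir='./work_dir')
trainer.train()
metric_values = trainer.evaluate()
```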
| @@ -0,0 +1,208 @@ | |||
| # Copyright (c) OpenMMLab. All rights reserved. | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import pickle | |||
| import shutil | |||
| import tempfile | |||
| import time | |||
| import torch | |||
| from torch import distributed as dist | |||
| from tqdm import tqdm | |||
| from modelscope.utils.torch_utils import get_dist_info | |||
| def single_gpu_test(model, | |||
| data_loader, | |||
| data_collate_fn=None, | |||
| metric_classes=None): | |||
| """Test model with a single gpu. | |||
| Args: | |||
| model (nn.Module): Model to be tested. | |||
| data_loader (DataLoader): Pytorch data loader. | |||
| data_collate_fn: An optional collate function applied to each batch before it is fed into the model. | |||
| metric_classes (List): List of Metric classes used to collect metrics. | |||
| Returns: | |||
| list: The prediction results. | |||
| """ | |||
| model.eval() | |||
| dataset = data_loader.dataset | |||
| with tqdm(total=len(dataset), desc='test samples') as pbar: | |||
| for data in data_loader: | |||
| if data_collate_fn is not None: | |||
| data = data_collate_fn(data) | |||
| with torch.no_grad(): | |||
| result = model(**data) | |||
| if metric_classes is not None: | |||
| for metric_cls in metric_classes: | |||
| metric_cls.add(result, data) | |||
| batch_size = len(result) | |||
| for _ in range(batch_size): | |||
| pbar.update() | |||
| def multi_gpu_test(model, | |||
| data_loader, | |||
| tmpdir=None, | |||
| gpu_collect=False, | |||
| data_collate_fn=None, | |||
| metric_classes=None): | |||
| """Test model with multiple gpus. | |||
| This method tests model with multiple gpus and collects the results | |||
| under two different modes: gpu and cpu modes. By setting | |||
| ``gpu_collect=True``, it encodes results to gpu tensors and use gpu | |||
| communication for results collection. On cpu mode it saves the results on | |||
| different gpus to ``tmpdir`` and collects them by the rank 0 worker. | |||
| Args: | |||
| model (nn.Module): Model to be tested. | |||
| data_loader (DataLoader): Pytorch data loader. | |||
| data_collate_fn: An optional collate function applied to each batch before it is fed into the model. | |||
| tmpdir (str): Path of directory to save the temporary results from | |||
| different gpus under cpu mode. | |||
| gpu_collect (bool): Option to use either gpu or cpu to collect results. | |||
| metric_classes (List): List of Metric classes used to collect metrics. | |||
| Returns: | |||
| list: The prediction results. | |||
| """ | |||
| model.eval() | |||
| results = [] | |||
| dataset = data_loader.dataset | |||
| time.sleep(2) # This line can prevent deadlock problem in some cases. | |||
| count = 0 | |||
| with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: | |||
| for _, data in enumerate(data_loader): | |||
| if data_collate_fn is not None: | |||
| data = data_collate_fn(data) | |||
| with torch.no_grad(): | |||
| result = model(**data) | |||
| results.extend(result) | |||
| rank, world_size = get_dist_info() | |||
| if rank == 0: | |||
| batch_size = len(result) | |||
| batch_size_all = batch_size * world_size | |||
| count += batch_size_all | |||
| if count > len(dataset): | |||
| batch_size_all = len(dataset) - (count - batch_size_all) | |||
| for _ in range(batch_size_all): | |||
| pbar.update() | |||
| # collect results from all ranks | |||
| if gpu_collect: | |||
| results = collect_results_gpu(results, len(dataset)) | |||
| else: | |||
| results = collect_results_cpu(results, len(dataset), tmpdir) | |||
| ground_truths = [dataset[i] for i in range(len(dataset))] | |||
| if metric_classes is not None: | |||
| for metric_cls in metric_classes: | |||
| metric_cls.add(results, ground_truths) | |||
| def collect_results_cpu(result_part, size, tmpdir=None): | |||
| """Collect results under cpu mode. | |||
| On cpu mode, this function will save the results on different gpus to | |||
| ``tmpdir`` and collect them by the rank 0 worker. | |||
| Args: | |||
| result_part (list): Result list containing result parts | |||
| to be collected. | |||
| size (int): Size of the results, commonly equal to length of | |||
| the results. | |||
| tmpdir (str | None): Temporary directory for the collected results to | |||
| be stored in. If set to None, a shared temporary directory will be | |||
| created automatically. | |||
| Returns: | |||
| list: The collected results. | |||
| """ | |||
| rank, world_size = get_dist_info() | |||
| # use a dedicated sub-directory of the system temp dir if tmpdir is not | |||
| # specified, so that removing it later cannot touch unrelated files | |||
| if tmpdir is None: | |||
| tmpdir = os.path.join(tempfile.gettempdir(), 'modelscope_test_results') | |||
| if not os.path.exists(tmpdir): | |||
| os.makedirs(tmpdir, exist_ok=True) | |||
| # dump the part result to the dir; pickle.dump requires a file object | |||
| with open(os.path.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f: | |||
| pickle.dump(result_part, f) | |||
| dist.barrier() | |||
| # collect all parts | |||
| if rank != 0: | |||
| return None | |||
| else: | |||
| # load results of all parts from tmp dir | |||
| part_list = [] | |||
| for i in range(world_size): | |||
| part_file = os.path.join(tmpdir, f'part_{i}.pkl') | |||
| with open(part_file, 'rb') as f: | |||
| part_result = pickle.load(f) | |||
| # When data is severely insufficient, an empty part_result | |||
| # on a certain gpu could make the overall outputs empty. | |||
| if part_result: | |||
| part_list.append(part_result) | |||
| # sort the results | |||
| ordered_results = [] | |||
| for res in zip(*part_list): | |||
| ordered_results.extend(list(res)) | |||
| # the dataloader may pad some samples | |||
| ordered_results = ordered_results[:size] | |||
| # remove tmp dir | |||
| shutil.rmtree(tmpdir) | |||
| return ordered_results | |||
| def collect_results_gpu(result_part, size): | |||
| """Collect results under gpu mode. | |||
| On gpu mode, this function will encode results to gpu tensors and use gpu | |||
| communication for results collection. | |||
| Args: | |||
| result_part (list): Result list containing result parts | |||
| to be collected. | |||
| size (int): Size of the results, commonly equal to length of | |||
| the results. | |||
| Returns: | |||
| list: The collected results. | |||
| """ | |||
| rank, world_size = get_dist_info() | |||
| # dump result part to tensor with pickle | |||
| part_tensor = torch.tensor( | |||
| bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') | |||
| # gather all result part tensor shape | |||
| shape_tensor = torch.tensor(part_tensor.shape, device='cuda') | |||
| shape_list = [shape_tensor.clone() for _ in range(world_size)] | |||
| dist.all_gather(shape_list, shape_tensor) | |||
| # padding result part tensor to max length | |||
| shape_max = torch.tensor(shape_list).max() | |||
| part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') | |||
| part_send[:shape_tensor[0]] = part_tensor | |||
| part_recv_list = [ | |||
| part_tensor.new_zeros(shape_max) for _ in range(world_size) | |||
| ] | |||
| # gather all result part | |||
| dist.all_gather(part_recv_list, part_send) | |||
| if rank == 0: | |||
| part_list = [] | |||
| for recv, shape in zip(part_recv_list, shape_list): | |||
| part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) | |||
| # When data is severely insufficient, an empty part_result | |||
| # on a certain gpu could make the overall outputs empty. | |||
| if part_result: | |||
| part_list.append(part_result) | |||
| # sort the results | |||
| ordered_results = [] | |||
| for res in zip(*part_list): | |||
| ordered_results.extend(list(res)) | |||
| # the dataloader may pad some samples | |||
| ordered_results = ordered_results[:size] | |||
| return ordered_results | |||
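Both collectors rebuild the original dataset order with `zip(*part_list)` because a `DistributedSampler` deals samples out round-robin: rank r holds samples r, r + world_size, and so on; the trailing `[:size]` slice drops sampler padding. A tiny standalone check:

```python
# Round-robin merge used by collect_results_cpu/gpu: rank 0 held samples
# 0, 2, 4 and rank 1 held 1, 3, 5 (the last one being sampler padding).
part_list = [[0, 2, 4], [1, 3, 5]]
ordered_results = []
for res in zip(*part_list):
    ordered_results.extend(list(res))
size = 5  # true dataset length
print(ordered_results[:size])  # [0, 1, 2, 3, 4]
```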
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) OpenMMLab. All rights reserved. | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from collections import OrderedDict | |||
| import numpy as np | |||
| class LogBuffer: | |||
| def __init__(self): | |||
| self.val_history = OrderedDict() | |||
| self.n_history = OrderedDict() | |||
| self.output = OrderedDict() | |||
| self.ready = False | |||
| def clear(self) -> None: | |||
| self.val_history.clear() | |||
| self.n_history.clear() | |||
| self.clear_output() | |||
| def clear_output(self) -> None: | |||
| self.output.clear() | |||
| self.ready = False | |||
| def update(self, vars: dict, count: int = 1) -> None: | |||
| assert isinstance(vars, dict) | |||
| for key, var in vars.items(): | |||
| if key not in self.val_history: | |||
| self.val_history[key] = [] | |||
| self.n_history[key] = [] | |||
| self.val_history[key].append(var) | |||
| self.n_history[key].append(count) | |||
| def average(self, n: int = 0) -> None: | |||
| """Average latest n values or all values.""" | |||
| assert n >= 0 | |||
| for key in self.val_history: | |||
| values = np.array(self.val_history[key][-n:]) | |||
| nums = np.array(self.n_history[key][-n:]) | |||
| avg = np.sum(values * nums) / np.sum(nums) | |||
| self.output[key] = avg | |||
| self.ready = True | |||
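`update` stores each value together with a count, and `average` computes count-weighted means over the latest n entries (or all of them for n=0). A quick sketch:

```python
# Count-weighted averaging in LogBuffer.
buf = LogBuffer()
buf.update({'loss': 1.0}, count=2)  # e.g. averaged over a batch of 2
buf.update({'loss': 4.0}, count=1)
buf.average()                       # n=0 averages the whole history
print(buf.output['loss'])           # (1.0 * 2 + 4.0 * 1) / 3 = 2.0
```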
| @@ -0,0 +1,74 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import io | |||
| import time | |||
| from collections import OrderedDict | |||
| from typing import Optional | |||
| import torch | |||
| from torch.optim import Optimizer | |||
| from modelscope import __version__ | |||
| from modelscope.fileio import File | |||
| def weights_to_cpu(state_dict): | |||
| """Copy a model state_dict to cpu. | |||
| Args: | |||
| state_dict (OrderedDict): Model weights on GPU. | |||
| Returns: | |||
| OrderedDict: Model weights on CPU. | |||
| """ | |||
| state_dict_cpu = OrderedDict() | |||
| for key, val in state_dict.items(): | |||
| state_dict_cpu[key] = val.cpu() | |||
| # Keep metadata in state_dict | |||
| state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) | |||
| return state_dict_cpu | |||
| def save_checkpoint(model: torch.nn.Module, | |||
| filename: str, | |||
| optimizer: Optional[Optimizer] = None, | |||
| meta: Optional[dict] = None) -> None: | |||
| """Save checkpoint to file. | |||
| The checkpoint will have 3 fields: ``meta``, ``state_dict`` and | |||
| ``optimizer``. By default ``meta`` will contain version and time info. | |||
| Args: | |||
| model (Module): Module whose params are to be saved. | |||
| filename (str): Checkpoint filename. | |||
| optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. | |||
| meta (dict, optional): Metadata to be saved in checkpoint. | |||
| """ | |||
| if meta is None: | |||
| meta = {} | |||
| elif not isinstance(meta, dict): | |||
| raise TypeError(f'meta must be a dict or None, but got {type(meta)}') | |||
| meta.update(modelscope=__version__, time=time.asctime()) | |||
| if isinstance(model, torch.nn.parallel.DistributedDataParallel): | |||
| model = model.module | |||
| if hasattr(model, 'CLASSES') and model.CLASSES is not None: | |||
| # save class name to the meta | |||
| meta.update(CLASSES=model.CLASSES) | |||
| checkpoint = { | |||
| 'meta': meta, | |||
| 'state_dict': weights_to_cpu(model.state_dict()) | |||
| } | |||
| # save optimizer state dict in the checkpoint | |||
| if isinstance(optimizer, Optimizer): | |||
| checkpoint['optimizer'] = optimizer.state_dict() | |||
| elif isinstance(optimizer, dict): | |||
| checkpoint['optimizer'] = {} | |||
| for name, optim in optimizer.items(): | |||
| checkpoint['optimizer'][name] = optim.state_dict() | |||
| with io.BytesIO() as f: | |||
| torch.save(checkpoint, f) | |||
| File.write(f.getvalue(), filename) | |||
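The saved payload is an ordinary torch checkpoint, so it can be restored with `torch.load`. A hedged round-trip sketch, assuming `File.write` accepts a plain local path and using an illustrative file name:

```python
import torch

# Save and restore a checkpoint produced by save_checkpoint.
model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
save_checkpoint(model, 'epoch_1.pth', optimizer=optimizer)

checkpoint = torch.load('epoch_1.pth', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
print(checkpoint['meta'])  # version and save-time info
```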
| @@ -13,12 +13,7 @@ class Fields(object): | |||
| multi_modal = 'multi-modal' | |||
| class Tasks(object): | |||
| """ Names for tasks supported by modelscope. | |||
| Holds the standard task name to use for identifying different tasks. | |||
| This should be used to register models, pipelines, trainers. | |||
| """ | |||
| class CVTasks(object): | |||
| # vision tasks | |||
| image_to_text = 'image-to-text' | |||
| pose_estimation = 'pose-estimation' | |||
| @@ -33,6 +28,8 @@ class Tasks(object): | |||
| action_recognition = 'action-recognition' | |||
| video_embedding = 'video-embedding' | |||
| class NLPTasks(object): | |||
| # nlp tasks | |||
| word_segmentation = 'word-segmentation' | |||
| nli = 'nli' | |||
| @@ -56,11 +53,15 @@ class Tasks(object): | |||
| question_answering = 'question-answering' | |||
| zero_shot_classification = 'zero-shot-classification' | |||
| class AudioTasks(object): | |||
| # audio tasks | |||
| auto_speech_recognition = 'auto-speech-recognition' | |||
| text_to_speech = 'text-to-speech' | |||
| speech_signal_process = 'speech-signal-process' | |||
| class MultiModalTasks(object): | |||
| # multi-modal tasks | |||
| image_captioning = 'image-captioning' | |||
| visual_grounding = 'visual-grounding' | |||
| @@ -69,6 +70,47 @@ class Tasks(object): | |||
| visual_question_answering = 'visual-question-answering' | |||
| class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): | |||
| """ Names for tasks supported by modelscope. | |||
| Holds the standard task name to use for identifying different tasks. | |||
| This should be used to register models, pipelines, trainers. | |||
| """ | |||
| reverse_field_index = {} | |||
| @staticmethod | |||
| def find_field_by_task(task_name): | |||
| if len(Tasks.reverse_field_index) == 0: | |||
| # Lazy init, not thread safe | |||
| field_dict = { | |||
| Fields.cv: [ | |||
| getattr(Tasks, attr) for attr in dir(CVTasks) | |||
| if not attr.startswith('__') | |||
| ], | |||
| Fields.nlp: [ | |||
| getattr(Tasks, attr) for attr in dir(NLPTasks) | |||
| if not attr.startswith('__') | |||
| ], | |||
| Fields.audio: [ | |||
| getattr(Tasks, attr) for attr in dir(AudioTasks) | |||
| if not attr.startswith('__') | |||
| ], | |||
| Fields.multi_modal: [ | |||
| getattr(Tasks, attr) for attr in dir(MultiModalTasks) | |||
| if not attr.startswith('__') | |||
| ], | |||
| } | |||
| for field, tasks in field_dict.items(): | |||
| for task in tasks: | |||
| if task in Tasks.reverse_field_index: | |||
| raise ValueError(f'Duplicate task: {task}') | |||
| Tasks.reverse_field_index[task] = field | |||
| return Tasks.reverse_field_index.get(task_name) | |||
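| # Usage sketch: the first call lazily builds the reverse index, later calls are dict lookups. | |||
| # The task names below are taken from the constants defined above: | |||
| assert Tasks.find_field_by_task('word-segmentation') == Fields.nlp | |||
| assert Tasks.find_field_by_task('image-captioning') == Fields.multi_modal | |||
| assert Tasks.find_field_by_task('not-a-task') is None | |||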
| class InputFields(object): | |||
| """ Names for input data fields in the input data for pipelines | |||
| """ | |||
| @@ -100,6 +142,7 @@ class ModelFile(object): | |||
| TF_CKPT_PREFIX = 'ckpt-' | |||
| TORCH_MODEL_FILE = 'pytorch_model.pt' | |||
| TORCH_MODEL_BIN_FILE = 'pytorch_model.bin' | |||
| LABEL_MAPPING = 'label_mapping.json' | |||
| class Requirements(object): | |||
| @@ -86,3 +86,16 @@ def get_model_type(model_dir): | |||
| return cfg.model_type if hasattr(cfg, 'model_type') else None | |||
| except Exception as e: | |||
| logger.error(f'parse config file failed with error: {e}') | |||
| def parse_label_mapping(model_dir): | |||
| """Parse the label-to-id mapping from ``label_mapping.json`` under ``model_dir``. | |||
| Returns None if the mapping file does not exist. | |||
| """ | |||
| import os | |||
| import json | |||
| label2id = None | |||
| label_path = os.path.join(model_dir, ModelFile.LABEL_MAPPING) | |||
| if os.path.exists(label_path): | |||
| with open(label_path) as f: | |||
| label_mapping = json.load(f) | |||
| label2id = {name: idx for name, idx in label_mapping.items()} | |||
| return label2id | |||
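| # Usage sketch with a hypothetical mapping file (path and contents are illustrative): | |||
| # /path/to/model/label_mapping.json -> {"negative": 0, "positive": 1} | |||
| label2id = parse_label_mapping('/path/to/model') | |||
| # label2id == {'negative': 0, 'positive': 1}; None when the file is absent | |||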
| @@ -54,6 +54,38 @@ def import_modules_from_file(py_file: str): | |||
| return module_name, mod | |||
| def is_method_overridden(method, base_class, derived_class): | |||
| """Check if a method of base class is overridden in derived class. | |||
| Args: | |||
| method (str): the method name to check. | |||
| base_class (type): the class of the base class. | |||
| derived_class (type | Any): the class or instance of the derived class. | |||
| """ | |||
| assert isinstance(base_class, type), \ | |||
| "base_class doesn't accept instance, Please pass class instead." | |||
| if not isinstance(derived_class, type): | |||
| derived_class = derived_class.__class__ | |||
| base_method = getattr(base_class, method) | |||
| derived_method = getattr(derived_class, method) | |||
| return derived_method != base_method | |||
| def has_method(obj: object, method: str) -> bool: | |||
| """Check whether the object has a method. | |||
| Args: | |||
| obj (object): The object to check. | |||
| method (str): The method name to check. | |||
| Returns: | |||
| bool: True if the object has the method else False. | |||
| """ | |||
| return hasattr(obj, method) and callable(getattr(obj, method)) | |||
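| # A quick sketch of both helpers on a toy hierarchy: | |||
| class Base: | |||
| def run(self): | |||
| pass | |||
| class Derived(Base): | |||
| def run(self): | |||
| return 1 | |||
| assert is_method_overridden('run', Base, Derived) | |||
| assert is_method_overridden('run', Base, Derived())  # instances are accepted too | |||
| assert has_method(Derived(), 'run') | |||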
| def import_modules(imports, allow_failed_imports=False): | |||
| """Import modules from the given list of strings. | |||
| @@ -0,0 +1,77 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| # Part of the implementation is borrowed from huggingface/transformers. | |||
| def torch_nested_numpify(tensors): | |||
| """Numpify `tensors` (even if it's a nested list/tuple of tensors).""" | |||
| import torch | |||
| if isinstance(tensors, (list, tuple)): | |||
| return type(tensors)(torch_nested_numpify(t) for t in tensors) | |||
| if isinstance(tensors, torch.Tensor): | |||
| t = tensors.cpu() | |||
| return t.numpy() | |||
| return tensors | |||
| def torch_nested_detach(tensors): | |||
| """Detach `tensors` (even if it's a nested list/tuple of tensors).""" | |||
| import torch | |||
| if isinstance(tensors, (list, tuple)): | |||
| return type(tensors)(torch_nested_detach(t) for t in tensors) | |||
| if isinstance(tensors, torch.Tensor): | |||
| return tensors.detach() | |||
| return tensors | |||
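| # Usage sketch: detach first so tensors that require grad convert to numpy cleanly. | |||
| import torch | |||
| nested = (torch.tensor([1.0], requires_grad=True), [torch.tensor([2.0])]) | |||
| arrays = torch_nested_numpify(torch_nested_detach(nested)) | |||
| # arrays == (array([1.], dtype=float32), [array([2.], dtype=float32)]) | |||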
| def torch_default_data_collator(features): | |||
| # TODO @jiangnana.jnn refine this default data collator | |||
| import torch | |||
| # if not isinstance(features[0], (dict, BatchEncoding)): | |||
| # features = [vars(f) for f in features] | |||
| first = features[0] | |||
| if isinstance(first, dict): | |||
| batch = {} | |||
| # Special handling for labels. | |||
| # Ensure that tensor is created with the correct type | |||
| # (it should be automatically the case, but let's make sure of it.) | |||
| if 'label' in first and first['label'] is not None: | |||
| label = first['label'].item() if isinstance( | |||
| first['label'], torch.Tensor) else first['label'] | |||
| dtype = torch.long if isinstance(label, int) else torch.float | |||
| batch['labels'] = torch.tensor([f['label'] for f in features], | |||
| dtype=dtype) | |||
| elif 'label_ids' in first and first['label_ids'] is not None: | |||
| if isinstance(first['label_ids'], torch.Tensor): | |||
| batch['labels'] = torch.stack( | |||
| [f['label_ids'] for f in features]) | |||
| else: | |||
| dtype = torch.long if type( | |||
| first['label_ids'][0]) is int else torch.float | |||
| batch['labels'] = torch.tensor( | |||
| [f['label_ids'] for f in features], dtype=dtype) | |||
| # Handling of all other possible keys. | |||
| # Again, we will use the first element to figure out which key/values are not None for this model. | |||
| for k, v in first.items(): | |||
| if k not in ('label', 'label_ids' | |||
| ) and v is not None and not isinstance(v, str): | |||
| if isinstance(v, torch.Tensor): | |||
| batch[k] = torch.stack([f[k] for f in features]) | |||
| else: | |||
| batch[k] = torch.tensor([f[k] for f in features]) | |||
| elif isinstance(first, tuple): | |||
| batch = [] | |||
| for idx in range(len(first)): | |||
| if isinstance(first[idx], torch.Tensor): | |||
| batch.append(torch.stack([f[idx] for f in features])) | |||
| else: | |||
| batch.append(torch.tensor([f[idx] for f in features])) | |||
| else: | |||
| if isinstance(first, torch.Tensor): | |||
| batch = torch.stack(features) | |||
| else: | |||
| batch = torch.tensor(features) | |||
| return batch | |||
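| # A brief sketch of the dict path (shapes illustrative): integer labels are gathered into a | |||
| # long tensor under 'labels', and tensor-valued keys are stacked along a new batch dimension. | |||
| import torch | |||
| features = [ | |||
| {'input_ids': torch.tensor([1, 2, 3]), 'label': 0}, | |||
| {'input_ids': torch.tensor([4, 5, 6]), 'label': 1}, | |||
| ] | |||
| batch = torch_default_data_collator(features) | |||
| # batch['labels'] == tensor([0, 1]); batch['input_ids'].shape == (2, 3) | |||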
| @@ -0,0 +1,127 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| # Following code is partially borrowed from openmmlab/mmcv | |||
| import functools | |||
| import os | |||
| import socket | |||
| import subprocess | |||
| from collections import OrderedDict | |||
| from typing import Callable, List, Optional, Tuple | |||
| import torch | |||
| import torch.multiprocessing as mp | |||
| from torch import distributed as dist | |||
| from torch._utils import (_flatten_dense_tensors, _take_tensors, | |||
| _unflatten_dense_tensors) | |||
| def _find_free_port() -> int: | |||
| # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 | |||
| sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | |||
| # Binding to port 0 will cause the OS to find an available port for us | |||
| sock.bind(('', 0)) | |||
| port = sock.getsockname()[1] | |||
| sock.close() | |||
| # NOTE: there is still a chance the port could be taken by other processes. | |||
| return port | |||
| def _is_free_port(port: int) -> bool: | |||
| ips = socket.gethostbyname_ex(socket.gethostname())[-1] | |||
| ips.append('localhost') | |||
| with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: | |||
| return all(s.connect_ex((ip, port)) != 0 for ip in ips) | |||
| def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None: | |||
| if mp.get_start_method(allow_none=True) is None: | |||
| mp.set_start_method('spawn') | |||
| if launcher == 'pytorch': | |||
| _init_dist_pytorch(backend, **kwargs) | |||
| elif launcher == 'mpi': | |||
| _init_dist_mpi(backend, **kwargs) | |||
| elif launcher == 'slurm': | |||
| _init_dist_slurm(backend, **kwargs) | |||
| else: | |||
| raise ValueError(f'Invalid launcher type: {launcher}') | |||
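| # Usage sketch: with the 'pytorch' launcher, RANK/LOCAL_RANK/WORLD_SIZE are expected in the | |||
| # environment, e.g. when launched via `torchrun --nproc_per_node=8 train.py`: | |||
| init_dist('pytorch', backend='nccl') | |||
| rank, world_size = get_dist_info() | |||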
| def _init_dist_pytorch(backend: str, **kwargs) -> None: | |||
| # rank = int(os.environ['RANK']) | |||
| local_rank = int(os.environ['LOCAL_RANK']) | |||
| torch.cuda.set_device(local_rank) | |||
| dist.init_process_group(backend=backend, **kwargs) | |||
| def _init_dist_mpi(backend: str, **kwargs) -> None: | |||
| local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) | |||
| torch.cuda.set_device(local_rank) | |||
| if 'MASTER_PORT' not in os.environ: | |||
| # 29500 is torch.distributed default port | |||
| os.environ['MASTER_PORT'] = '29500' | |||
| if 'MASTER_ADDR' not in os.environ: | |||
| raise KeyError('The environment variable MASTER_ADDR is not set') | |||
| os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] | |||
| os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] | |||
| dist.init_process_group(backend=backend, **kwargs) | |||
| def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None: | |||
| """Initialize slurm distributed training environment. | |||
| If argument ``port`` is not specified, then the master port will be system | |||
| environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system | |||
| environment variable, then a default port ``29500`` will be used. | |||
| Args: | |||
| backend (str): Backend of torch.distributed. | |||
| port (int, optional): Master port. Defaults to None. | |||
| """ | |||
| proc_id = int(os.environ['SLURM_PROCID']) | |||
| ntasks = int(os.environ['SLURM_NTASKS']) | |||
| node_list = os.environ['SLURM_NODELIST'] | |||
| num_gpus = torch.cuda.device_count() | |||
| torch.cuda.set_device(proc_id % num_gpus) | |||
| addr = subprocess.getoutput( | |||
| f'scontrol show hostname {node_list} | head -n1') | |||
| # specify master port | |||
| if port is not None: | |||
| os.environ['MASTER_PORT'] = str(port) | |||
| elif 'MASTER_PORT' in os.environ: | |||
| pass # use MASTER_PORT in the environment variable | |||
| else: | |||
| # if torch.distributed default port(29500) is available | |||
| # then use it, else find a free port | |||
| if _is_free_port(29500): | |||
| os.environ['MASTER_PORT'] = '29500' | |||
| else: | |||
| os.environ['MASTER_PORT'] = str(_find_free_port()) | |||
| # use MASTER_ADDR in the environment variable if it already exists | |||
| if 'MASTER_ADDR' not in os.environ: | |||
| os.environ['MASTER_ADDR'] = addr | |||
| os.environ['WORLD_SIZE'] = str(ntasks) | |||
| os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) | |||
| os.environ['RANK'] = str(proc_id) | |||
| dist.init_process_group(backend=backend) | |||
| def get_dist_info() -> Tuple[int, int]: | |||
| if dist.is_available() and dist.is_initialized(): | |||
| rank = dist.get_rank() | |||
| world_size = dist.get_world_size() | |||
| else: | |||
| rank = 0 | |||
| world_size = 1 | |||
| return rank, world_size | |||
| def master_only(func: Callable) -> Callable: | |||
| @functools.wraps(func) | |||
| def wrapper(*args, **kwargs): | |||
| rank, _ = get_dist_info() | |||
| if rank == 0: | |||
| return func(*args, **kwargs) | |||
| return wrapper | |||
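| # Usage sketch: the wrapped function runs only on rank 0, e.g. for logging or checkpointing: | |||
| @master_only | |||
| def log_status(msg): | |||
| print(msg) | |||
| log_status('printed once, by the master process only') | |||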
| @@ -1,3 +0,0 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:b4153d9ffc0b72eeaf162b5c9f4426f95dcea2bb0da9e7b5e1b72fd2643b1915 | |||
| size 50444 | |||
| @@ -0,0 +1,60 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import unittest | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from modelscope.models.base_torch import TorchModel | |||
| class TorchBaseTest(unittest.TestCase): | |||
| def test_custom_model(self): | |||
| class MyTorchModel(TorchModel): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.conv1 = nn.Conv2d(1, 20, 5) | |||
| self.conv2 = nn.Conv2d(20, 20, 5) | |||
| def forward(self, x): | |||
| x = F.relu(self.conv1(x)) | |||
| return F.relu(self.conv2(x)) | |||
| model = MyTorchModel() | |||
| model.train() | |||
| model.eval() | |||
| out = model.forward(torch.rand(1, 1, 10, 10)) | |||
| self.assertEqual((1, 20, 2, 2), out.shape) | |||
| def test_custom_model_with_postprocess(self): | |||
| add_bias = 200 | |||
| class MyTorchModel(TorchModel): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.conv1 = nn.Conv2d(1, 20, 5) | |||
| self.conv2 = nn.Conv2d(20, 20, 5) | |||
| def forward(self, x): | |||
| x = F.relu(self.conv1(x)) | |||
| return F.relu(self.conv2(x)) | |||
| def postprocess(self, x): | |||
| return x + add_bias | |||
| model = MyTorchModel() | |||
| model.train() | |||
| model.eval() | |||
| out = model(torch.rand(1, 1, 10, 10)) | |||
| self.assertEqual((1, 20, 2, 2), out.shape) | |||
| self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10))) | |||
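| # The two tests above rely on __call__ running forward and then postprocess. A hypothetical | |||
| # sketch of such a wrapper (an assumption for illustration, not the actual TorchModel code): | |||
| class SketchModel(nn.Module): | |||
| def __call__(self, *args, **kwargs): | |||
| out = super().__call__(*args, **kwargs)  # nn.Module dispatches to forward() | |||
| return self.postprocess(out) | |||
| def postprocess(self, x): | |||
| return x  # identity by default; subclasses override | |||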
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -6,9 +6,9 @@ from typing import Any, Dict, List, Tuple, Union | |||
| import numpy as np | |||
| import PIL | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import Pipeline, pipeline | |||
| from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from modelscope.utils.registry import default_group | |||
| @@ -2,8 +2,8 @@ | |||
| import unittest | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| @@ -7,8 +7,8 @@ import cv2 | |||
| from modelscope.fileio import File | |||
| from modelscope.msdatasets import MsDataset | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| @@ -5,9 +5,9 @@ import unittest | |||
| import cv2 | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.pipelines.base import Pipeline | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| @@ -5,8 +5,8 @@ import unittest | |||
| import numpy as np | |||
| from modelscope.models import Model | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| @@ -9,8 +9,8 @@ from scipy.io.wavfile import write | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models import Model | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.pipelines.outputs import OutputKeys | |||
| from modelscope.utils.constant import Fields, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from modelscope.utils.test_utils import test_level | |||
| @@ -0,0 +1,108 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import shutil | |||
| import tempfile | |||
| import unittest | |||
| from abc import ABCMeta | |||
| import json | |||
| import torch | |||
| from torch import nn | |||
| from torch.utils.data import Dataset | |||
| from modelscope.trainers import build_trainer | |||
| from modelscope.utils.constant import ModelFile | |||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||
| def __len__(self): | |||
| return 20 | |||
| def __getitem__(self, idx): | |||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
| class DummyModel(nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.linear = nn.Linear(5, 4) | |||
| self.bn = nn.BatchNorm1d(4) | |||
| def forward(self, feat, labels): | |||
| x = self.linear(feat) | |||
| x = self.bn(x) | |||
| loss = torch.sum(x) | |||
| return dict(logits=x, loss=loss) | |||
| class CheckpointHookTest(unittest.TestCase): | |||
| def setUp(self): | |||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||
| if not os.path.exists(self.tmp_dir): | |||
| os.makedirs(self.tmp_dir) | |||
| def tearDown(self): | |||
| super().tearDown() | |||
| shutil.rmtree(self.tmp_dir) | |||
| def test_checkpoint_hook(self): | |||
| json_cfg = { | |||
| 'task': 'image_classification', | |||
| 'train': { | |||
| 'work_dir': self.tmp_dir, | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1 | |||
| }, | |||
| 'optimizer': { | |||
| 'type': 'SGD', | |||
| 'lr': 0.01, | |||
| 'options': { | |||
| 'grad_clip': { | |||
| 'max_norm': 2.0 | |||
| } | |||
| } | |||
| }, | |||
| 'lr_scheduler': { | |||
| 'type': 'StepLR', | |||
| 'step_size': 2, | |||
| 'options': { | |||
| 'warmup': { | |||
| 'type': 'LinearWarmup', | |||
| 'warmup_iters': 2 | |||
| } | |||
| } | |||
| }, | |||
| 'hooks': [{ | |||
| 'type': 'CheckpointHook', | |||
| 'interval': 1 | |||
| }] | |||
| } | |||
| } | |||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
| with open(config_path, 'w') as f: | |||
| json.dump(json_cfg, f) | |||
| trainer_name = 'EpochBasedTrainer' | |||
| kwargs = dict( | |||
| cfg_file=config_path, | |||
| model=DummyModel(), | |||
| data_collator=None, | |||
| train_dataset=DummyDataset(), | |||
| max_epochs=2) | |||
| trainer = build_trainer(trainer_name, kwargs) | |||
| trainer.train() | |||
| results_files = os.listdir(self.tmp_dir) | |||
| self.assertIn('epoch_1.pth', results_files) | |||
| self.assertIn('epoch_2.pth', results_files) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,188 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import shutil | |||
| import tempfile | |||
| import unittest | |||
| from abc import ABCMeta | |||
| import json | |||
| import torch | |||
| from torch import nn | |||
| from torch.optim import SGD | |||
| from torch.optim.lr_scheduler import MultiStepLR | |||
| from torch.utils.data import Dataset | |||
| from modelscope.trainers import build_trainer | |||
| from modelscope.utils.constant import ModelFile | |||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||
| """Base Dataset | |||
| """ | |||
| def __len__(self): | |||
| return 10 | |||
| def __getitem__(self, idx): | |||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
| class DummyModel(nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.linear = nn.Linear(5, 4) | |||
| self.bn = nn.BatchNorm1d(4) | |||
| def forward(self, feat, labels): | |||
| x = self.linear(feat) | |||
| x = self.bn(x) | |||
| loss = torch.sum(x) | |||
| return dict(logits=x, loss=loss) | |||
| class LrSchedulerHookTest(unittest.TestCase): | |||
| def setUp(self): | |||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||
| if not os.path.exists(self.tmp_dir): | |||
| os.makedirs(self.tmp_dir) | |||
| def tearDown(self): | |||
| super().tearDown() | |||
| shutil.rmtree(self.tmp_dir) | |||
| def test_lr_scheduler_hook(self): | |||
| json_cfg = { | |||
| 'task': 'image_classification', | |||
| 'train': { | |||
| 'work_dir': self.tmp_dir, | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1 | |||
| } | |||
| } | |||
| } | |||
| config_path = os.path.join(self.tmp_dir, 'config.json') | |||
| with open(config_path, 'w') as f: | |||
| json.dump(json_cfg, f) | |||
| model = DummyModel() | |||
| optimizer = SGD(model.parameters(), lr=0.01) | |||
| lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) | |||
| trainer_name = 'EpochBasedTrainer' | |||
| kwargs = dict( | |||
| cfg_file=config_path, | |||
| model=model, | |||
| train_dataset=DummyDataset(), | |||
| optimizers=(optimizer, lr_scheduler), | |||
| max_epochs=5) | |||
| trainer = build_trainer(trainer_name, kwargs) | |||
| train_dataloader = trainer._build_dataloader_with_dataset( | |||
| trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||
| trainer.register_optimizers_hook() | |||
| trainer.invoke_hook('before_run') | |||
| log_lrs = [] | |||
| optim_lrs = [] | |||
| for _ in range(trainer._epoch, trainer._max_epochs): | |||
| trainer.invoke_hook('before_train_epoch') | |||
| for _, data_batch in enumerate(train_dataloader): | |||
| trainer.invoke_hook('before_train_iter') | |||
| log_lrs.append(trainer.log_buffer.output['lr']) | |||
| optim_lrs.append(optimizer.param_groups[0]['lr']) | |||
| trainer.train_step(trainer.model, data_batch) | |||
| trainer.invoke_hook('after_train_iter') | |||
| trainer.invoke_hook('after_train_epoch') | |||
| trainer._epoch += 1 | |||
| trainer.invoke_hook('after_run') | |||
| iters = 5 | |||
| target_lrs = [0.01] * iters * 1 + [0.001] * iters * 2 + [0.0001] * iters * 2 | |||
| self.assertListEqual(log_lrs, target_lrs) | |||
| self.assertListEqual(optim_lrs, target_lrs) | |||
| def test_warmup_lr_scheduler_hook(self): | |||
| json_cfg = { | |||
| 'task': 'image_classification', | |||
| 'train': { | |||
| 'work_dir': self.tmp_dir, | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1 | |||
| }, | |||
| 'optimizer': { | |||
| 'type': 'SGD', | |||
| 'lr': 0.01 | |||
| }, | |||
| 'lr_scheduler': { | |||
| 'type': 'MultiStepLR', | |||
| 'milestones': [4, 6], | |||
| 'options': { | |||
| 'warmup': { | |||
| 'type': 'LinearWarmup', | |||
| 'warmup_iters': 3 | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
| with open(config_path, 'w') as f: | |||
| json.dump(json_cfg, f) | |||
| model = DummyModel() | |||
| # optimizer = SGD(model.parameters(), lr=0.01) | |||
| # lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) | |||
| trainer_name = 'EpochBasedTrainer' | |||
| kwargs = dict( | |||
| cfg_file=config_path, | |||
| model=model, | |||
| train_dataset=DummyDataset(), | |||
| # optimizers=(optimizer, lr_scheduler), | |||
| max_epochs=7) | |||
| trainer = build_trainer(trainer_name, kwargs) | |||
| train_dataloader = trainer._build_dataloader_with_dataset( | |||
| trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||
| trainer.register_optimizers_hook() | |||
| trainer.invoke_hook('before_run') | |||
| log_lrs = [] | |||
| optim_lrs = [] | |||
| for _ in range(trainer._epoch, trainer._max_epochs): | |||
| trainer.invoke_hook('before_train_epoch') | |||
| for _, data_batch in enumerate(train_dataloader): | |||
| trainer.invoke_hook('before_train_iter') | |||
| log_lrs.append(round(trainer.log_buffer.output['lr'], 5)) | |||
| optim_lrs.append( | |||
| round(trainer.optimizer.param_groups[0]['lr'], 5)) | |||
| trainer.train_step(trainer.model, data_batch) | |||
| trainer.invoke_hook('after_train_iter') | |||
| trainer.invoke_hook('after_train_epoch') | |||
| trainer.invoke_hook('after_run') | |||
| iters = 5 | |||
| target_lrs = ([0.004] * iters * 1 + [0.007] * iters * 1 + [0.01] * iters * 1 | |||
| + [0.001] * iters * 2 + [0.0001] * iters * 2) | |||
| self.assertListEqual(log_lrs, target_lrs) | |||
| self.assertListEqual(optim_lrs, target_lrs) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,128 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import shutil | |||
| import tempfile | |||
| import unittest | |||
| from abc import ABCMeta | |||
| import json | |||
| import torch | |||
| from torch import nn | |||
| from torch.optim import SGD | |||
| from torch.optim.lr_scheduler import MultiStepLR | |||
| from torch.utils.data import Dataset | |||
| from modelscope.trainers import build_trainer | |||
| from modelscope.utils.constant import ModelFile | |||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||
| """Base Dataset | |||
| """ | |||
| def __len__(self): | |||
| return 10 | |||
| def __getitem__(self, idx): | |||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
| class DummyModel(nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.linear = nn.Linear(5, 4) | |||
| self.bn = nn.BatchNorm1d(4) | |||
| def forward(self, feat, labels): | |||
| x = self.linear(feat) | |||
| x = self.bn(x) | |||
| loss = torch.sum(x) | |||
| return dict(logits=x, loss=loss) | |||
| class IterTimerHookTest(unittest.TestCase): | |||
| def setUp(self): | |||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||
| if not os.path.exists(self.tmp_dir): | |||
| os.makedirs(self.tmp_dir) | |||
| def tearDown(self): | |||
| super().tearDown() | |||
| shutil.rmtree(self.tmp_dir) | |||
| def test_iter_time_hook(self): | |||
| json_cfg = { | |||
| 'task': 'image_classification', | |||
| 'train': { | |||
| 'work_dir': self.tmp_dir, | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1 | |||
| }, | |||
| 'hooks': [{ | |||
| 'type': 'IterTimerHook', | |||
| }] | |||
| } | |||
| } | |||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
| with open(config_path, 'w') as f: | |||
| json.dump(json_cfg, f) | |||
| model = DummyModel() | |||
| optimizer = SGD(model.parameters(), lr=0.01) | |||
| lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) | |||
| trainer_name = 'EpochBasedTrainer' | |||
| kwargs = dict( | |||
| cfg_file=config_path, | |||
| model=model, | |||
| train_dataset=DummyDataset(), | |||
| optimizers=(optimizer, lr_scheduler), | |||
| max_epochs=5) | |||
| trainer = build_trainer(trainer_name, kwargs) | |||
| train_dataloader = trainer._build_dataloader_with_dataset( | |||
| trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) | |||
| trainer.register_optimizers_hook() | |||
| trainer.register_hook_from_cfg(trainer.cfg.train.hooks) | |||
| trainer.invoke_hook('before_run') | |||
| for i in range(trainer._epoch, trainer._max_epochs): | |||
| trainer.invoke_hook('before_train_epoch') | |||
| for _, data_batch in enumerate(train_dataloader): | |||
| trainer.invoke_hook('before_train_iter') | |||
| trainer.train_step(trainer.model, data_batch) | |||
| trainer.invoke_hook('after_train_iter') | |||
| self.assertIn('data_load_time', trainer.log_buffer.val_history) | |||
| self.assertIn('time', trainer.log_buffer.val_history) | |||
| self.assertIn('loss', trainer.log_buffer.val_history) | |||
| trainer.invoke_hook('after_train_epoch') | |||
| target_len = 5 * (i + 1) | |||
| self.assertEqual( | |||
| len(trainer.log_buffer.val_history['data_load_time']), | |||
| target_len) | |||
| self.assertEqual( | |||
| len(trainer.log_buffer.val_history['time']), target_len) | |||
| self.assertEqual( | |||
| len(trainer.log_buffer.val_history['loss']), target_len) | |||
| self.assertEqual( | |||
| len(trainer.log_buffer.n_history['data_load_time']), | |||
| target_len) | |||
| self.assertEqual( | |||
| len(trainer.log_buffer.n_history['time']), target_len) | |||
| self.assertEqual( | |||
| len(trainer.log_buffer.n_history['loss']), target_len) | |||
| trainer.invoke_hook('after_run') | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,79 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import unittest | |||
| import torch | |||
| from torch import nn | |||
| from torch.optim.lr_scheduler import MultiStepLR | |||
| class WarmupTest(unittest.TestCase): | |||
| def setUp(self): | |||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
| def test_constant_warmup(self): | |||
| from modelscope.trainers.lrscheduler.warmup import ConstantWarmup | |||
| net = nn.Linear(2, 2) | |||
| base_lr = 0.02 | |||
| warmup_iters = 3 | |||
| warmup_ratio = 0.2 | |||
| optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||
| lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||
| lr_scheduler_with_warmup = ConstantWarmup( | |||
| lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||
| res = [] | |||
| for _ in range(10): | |||
| lr_scheduler_with_warmup.step() | |||
| for _, group in enumerate(optimizer.param_groups): | |||
| res.append(group['lr']) | |||
| base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||
| self.assertListEqual(res, [0.004, 0.004, 0.02] + base_lrs) | |||
| def test_linear_warmup(self): | |||
| from modelscope.trainers.lrscheduler.warmup import LinearWarmup | |||
| net = nn.Linear(2, 2) | |||
| base_lr = 0.02 | |||
| warmup_iters = 3 | |||
| warmup_ratio = 0.1 | |||
| optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||
| lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||
| lr_scheduler_with_warmup = LinearWarmup( | |||
| lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||
| res = [] | |||
| for _ in range(10): | |||
| lr_scheduler_with_warmup.step() | |||
| for _, group in enumerate(optimizer.param_groups): | |||
| res.append(round(group['lr'], 5)) | |||
| base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||
| self.assertListEqual(res, [0.0080, 0.0140, 0.02] + base_lrs) | |||
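| # The expected warmup values follow the usual linear formula (assuming mmcv-style | |||
| # semantics): lr_i = base_lr * (1 - (1 - i / warmup_iters) * (1 - warmup_ratio)). | |||
| lrs = [0.02 * (1 - (1 - i / 3) * (1 - 0.1)) for i in (1, 2, 3)] | |||
| assert [round(lr, 5) for lr in lrs] == [0.008, 0.014, 0.02] | |||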
| def test_exp_warmup(self): | |||
| from modelscope.trainers.lrscheduler.warmup import ExponentialWarmup | |||
| net = nn.Linear(2, 2) | |||
| base_lr = 0.02 | |||
| warmup_iters = 3 | |||
| warmup_ratio = 0.1 | |||
| optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) | |||
| lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) | |||
| lr_scheduler_with_warmup = ExponentialWarmup( | |||
| lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) | |||
| res = [] | |||
| for _ in range(10): | |||
| lr_scheduler_with_warmup.step() | |||
| for _, group in enumerate(optimizer.param_groups): | |||
| res.append(round(group['lr'], 5)) | |||
| base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] | |||
| self.assertListEqual(res, [0.00431, 0.00928, 0.02] + base_lrs) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,209 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import shutil | |||
| import tempfile | |||
| import unittest | |||
| from abc import ABCMeta | |||
| import json | |||
| import torch | |||
| from torch import nn | |||
| from torch.optim import SGD | |||
| from torch.optim.lr_scheduler import StepLR | |||
| from torch.utils.data import Dataset | |||
| from modelscope.trainers import build_trainer | |||
| from modelscope.utils.constant import ModelFile | |||
| from modelscope.utils.test_utils import test_level | |||
| class DummyMetric: | |||
| def __call__(self, ground_truth, predict_results): | |||
| return {'accuracy': 0.5} | |||
| class DummyDataset(Dataset, metaclass=ABCMeta): | |||
| """Base Dataset | |||
| """ | |||
| def __len__(self): | |||
| return 20 | |||
| def __getitem__(self, idx): | |||
| return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) | |||
| class DummyModel(nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.linear = nn.Linear(5, 4) | |||
| self.bn = nn.BatchNorm1d(4) | |||
| def forward(self, feat, labels): | |||
| x = self.linear(feat) | |||
| x = self.bn(x) | |||
| loss = torch.sum(x) | |||
| return dict(logits=x, loss=loss) | |||
| class TrainerTest(unittest.TestCase): | |||
| def setUp(self): | |||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||
| if not os.path.exists(self.tmp_dir): | |||
| os.makedirs(self.tmp_dir) | |||
| def tearDown(self): | |||
| super().tearDown() | |||
| shutil.rmtree(self.tmp_dir) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_train_0(self): | |||
| json_cfg = { | |||
| 'train': { | |||
| 'work_dir': self.tmp_dir, | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1 | |||
| }, | |||
| 'optimizer': { | |||
| 'type': 'SGD', | |||
| 'lr': 0.01, | |||
| 'options': { | |||
| 'grad_clip': { | |||
| 'max_norm': 2.0 | |||
| } | |||
| } | |||
| }, | |||
| 'lr_scheduler': { | |||
| 'type': 'StepLR', | |||
| 'step_size': 2, | |||
| 'options': { | |||
| 'warmup': { | |||
| 'type': 'LinearWarmup', | |||
| 'warmup_iters': 2 | |||
| } | |||
| } | |||
| }, | |||
| 'hooks': [{ | |||
| 'type': 'CheckpointHook', | |||
| 'interval': 1 | |||
| }, { | |||
| 'type': 'TextLoggerHook', | |||
| 'interval': 1 | |||
| }, { | |||
| 'type': 'IterTimerHook' | |||
| }, { | |||
| 'type': 'EvaluationHook', | |||
| 'interval': 1 | |||
| }] | |||
| }, | |||
| 'evaluation': { | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1, | |||
| 'shuffle': False | |||
| }, | |||
| 'metrics': ['seq_cls_metric'] | |||
| } | |||
| } | |||
| config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) | |||
| with open(config_path, 'w') as f: | |||
| json.dump(json_cfg, f) | |||
| trainer_name = 'EpochBasedTrainer' | |||
| kwargs = dict( | |||
| cfg_file=config_path, | |||
| model=DummyModel(), | |||
| data_collator=None, | |||
| train_dataset=DummyDataset(), | |||
| eval_dataset=DummyDataset(), | |||
| max_epochs=3) | |||
| trainer = build_trainer(trainer_name, kwargs) | |||
| trainer.train() | |||
| results_files = os.listdir(self.tmp_dir) | |||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
| self.assertIn('epoch_1.pth', results_files) | |||
| self.assertIn('epoch_2.pth', results_files) | |||
| self.assertIn('epoch_3.pth', results_files) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_train_1(self): | |||
| json_cfg = { | |||
| 'train': { | |||
| 'work_dir': self.tmp_dir, | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1 | |||
| }, | |||
| 'hooks': [{ | |||
| 'type': 'CheckpointHook', | |||
| 'interval': 1 | |||
| }, { | |||
| 'type': 'TextLoggerHook', | |||
| 'interval': 1 | |||
| }, { | |||
| 'type': 'IterTimerHook' | |||
| }, { | |||
| 'type': 'EvaluationHook', | |||
| 'interval': 1 | |||
| }] | |||
| }, | |||
| 'evaluation': { | |||
| 'dataloader': { | |||
| 'batch_size_per_gpu': 2, | |||
| 'workers_per_gpu': 1, | |||
| 'shuffle': False | |||
| }, | |||
| 'metrics': ['seq_cls_metric'] | |||
| } | |||
| } | |||
| config_path = os.path.join(self.tmp_dir, 'config.json') | |||
| with open(config_path, 'w') as f: | |||
| json.dump(json_cfg, f) | |||
| model = DummyModel() | |||
| optimizer = SGD(model.parameters(), lr=0.01) | |||
| lr_scheduler = StepLR(optimizer, 2) | |||
| trainer_name = 'EpochBasedTrainer' | |||
| kwargs = dict( | |||
| cfg_file=config_path, | |||
| model=model, | |||
| data_collator=None, | |||
| train_dataset=DummyDataset(), | |||
| eval_dataset=DummyDataset(), | |||
| optimizers=(optimizer, lr_scheduler), | |||
| max_epochs=3) | |||
| trainer = build_trainer(trainer_name, kwargs) | |||
| trainer.train() | |||
| results_files = os.listdir(self.tmp_dir) | |||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
| self.assertIn('epoch_1.pth', results_files) | |||
| self.assertIn('epoch_2.pth', results_files) | |||
| self.assertIn('epoch_3.pth', results_files) | |||
| class DummyTrainerTest(unittest.TestCase): | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_dummy(self): | |||
| default_args = dict(cfg_file='configs/examples/train.json') | |||
| trainer = build_trainer('dummy', default_args) | |||
| trainer.train() | |||
| trainer.evaluate() | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -1,19 +0,0 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import unittest | |||
| from modelscope.trainers import build_trainer | |||
| class DummyTrainerTest(unittest.TestCase): | |||
| def test_dummy(self): | |||
| default_args = dict(cfg_file='configs/examples/train.json') | |||
| trainer = build_trainer('dummy', default_args) | |||
| trainer.train() | |||
| trainer.evaluate() | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,91 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import shutil | |||
| import tempfile | |||
| import unittest | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.models.nlp.sbert_for_sequence_classification import \ | |||
| SbertTextClassfier | |||
| from modelscope.msdatasets import MsDataset | |||
| from modelscope.trainers import build_trainer | |||
| from modelscope.utils.constant import ModelFile | |||
| from modelscope.utils.test_utils import test_level | |||
| class TestTrainerWithNlp(unittest.TestCase): | |||
| def setUp(self): | |||
| print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
| self.tmp_dir = tempfile.TemporaryDirectory().name | |||
| if not os.path.exists(self.tmp_dir): | |||
| os.makedirs(self.tmp_dir) | |||
| from datasets import Dataset | |||
| dataset_dict = { | |||
| 'sentence1': [ | |||
| 'This is test sentence1-1', 'This is test sentence2-1', | |||
| 'This is test sentence3-1' | |||
| ], | |||
| 'sentence2': [ | |||
| 'This is test sentence1-2', 'This is test sentence2-2', | |||
| 'This is test sentence3-2' | |||
| ], | |||
| 'label': [0, 1, 1] | |||
| } | |||
| dataset = Dataset.from_dict(dataset_dict) | |||
| class MsDatasetDummy(MsDataset): | |||
| def __len__(self): | |||
| return len(self._hf_ds) | |||
| self.dataset = MsDatasetDummy(dataset) | |||
| def tearDown(self): | |||
| shutil.rmtree(self.tmp_dir) | |||
| super().tearDown() | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_trainer(self): | |||
| model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | |||
| kwargs = dict( | |||
| model=model_id, | |||
| train_dataset=self.dataset, | |||
| eval_dataset=self.dataset, | |||
| work_dir=self.tmp_dir) | |||
| trainer = build_trainer(default_args=kwargs) | |||
| trainer.train() | |||
| results_files = os.listdir(self.tmp_dir) | |||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
| for i in range(10): | |||
| self.assertIn(f'epoch_{i+1}.pth', results_files) | |||
| @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
| def test_trainer_with_model_and_args(self): | |||
| tmp_dir = tempfile.TemporaryDirectory().name | |||
| if not os.path.exists(tmp_dir): | |||
| os.makedirs(tmp_dir) | |||
| model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | |||
| cache_path = snapshot_download(model_id) | |||
| model = SbertTextClassfier.from_pretrained(cache_path) | |||
| kwargs = dict( | |||
| cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | |||
| model=model, | |||
| train_dataset=self.dataset, | |||
| eval_dataset=self.dataset, | |||
| max_epochs=2, | |||
| work_dir=self.tmp_dir) | |||
| trainer = build_trainer(default_args=kwargs) | |||
| trainer.train() | |||
| results_files = os.listdir(self.tmp_dir) | |||
| self.assertIn(f'{trainer.timestamp}.log.json', results_files) | |||
| for i in range(2): | |||
| self.assertIn(f'epoch_{i+1}.pth', results_files) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||