From 231f4001338cd05869f6ed021fb6e9ec530dda33 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm"
Date: Thu, 14 Jul 2022 16:25:55 +0800
Subject: [PATCH] [to #43112534] finetune support and first case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

co-contributed with 夕陌&雨泓

* add torch epoch based trainer and dist utils
* add hooks including optimizer, lrscheduler, logging, checkpoint, evaluation, time profiling
* add torch model base and test
* add optimizer and lrscheduler module
* add sbert for text classification example
* add task_dataset for dataset-level processor

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9338412
---
 configs/cv/configuration.json | 175 +++++
 configs/nlp/sbert_sentence_similarity.json | 77 ++
 docs/source/api/modelscope.pipelines.rst | 4 +-
 modelscope/metainfo.py | 13 +
 modelscope/metrics/__init__.py | 3 +
 modelscope/metrics/base.py | 37 +
 modelscope/metrics/builder.py | 35 +
 .../metrics/sequence_classification_metric.py | 40 +
 modelscope/models/base_torch.py | 23 +
 .../models/multi_modal/clip/clip_model.py | 4 +-
 .../multi_modal/image_captioning_model.py | 2 +-
 .../nlp/sbert_for_sequence_classification.py | 13 +-
 modelscope/{pipelines => }/outputs.py | 1 +
 modelscope/pipelines/audio/ans_pipeline.py | 2 +-
 .../pipelines/audio/linear_aec_pipeline.py | 2 +-
 .../audio/text_to_speech_pipeline.py | 2 +-
 modelscope/pipelines/base.py | 2 +-
 .../cv/action_recognition_pipeline.py | 2 +-
 .../pipelines/cv/animal_recog_pipeline.py | 2 +-
 .../cv/cmdssl_video_embedding_pipleline.py | 2 +-
 .../pipelines/cv/image_cartoon_pipeline.py | 2 +-
 .../pipelines/cv/image_matting_pipeline.py | 2 +-
 .../pipelines/cv/ocr_detection_pipeline.py | 2 +-
 .../text_to_image_synthesis_pipeline.py | 2 +-
 .../nlp/dialog_intent_prediction_pipeline.py | 2 +-
 .../pipelines/nlp/dialog_modeling_pipeline.py | 2 +-
 .../nlp/dialog_state_tracking_pipeline.py | 2 +-
 .../pipelines/nlp/fill_mask_pipeline.py | 2 +-
 modelscope/pipelines/nlp/nli_pipeline.py | 2 +-
 .../nlp/sentence_similarity_pipeline.py | 2 +-
 .../nlp/sentiment_classification_pipeline.py | 2 +-
 .../nlp/sequence_classification_pipeline.py | 2 +-
 .../pipelines/nlp/text_generation_pipeline.py | 2 +-
 .../pipelines/nlp/translation_pipeline.py | 2 +-
 .../nlp/word_segmentation_pipeline.py | 2 +-
 .../nlp/zero_shot_classification_pipeline.py | 2 +-
 modelscope/preprocessors/nlp.py | 30 +-
 modelscope/task_datasets/__init__.py | 3 +
 modelscope/task_datasets/base.py | 48 ++
 modelscope/task_datasets/builder.py | 21 +
 .../task_datasets/torch_base_dataset.py | 63 ++
 modelscope/trainers/__init__.py | 1 +
 modelscope/trainers/base.py | 4 +
 modelscope/trainers/builder.py | 5 +-
 modelscope/trainers/hooks/__init__.py | 16 +
 modelscope/trainers/hooks/builder.py | 9 +
 modelscope/trainers/hooks/checkpoint_hook.py | 92 +++
 modelscope/trainers/hooks/evaluation_hook.py | 74 ++
 modelscope/trainers/hooks/hook.py | 208 ++++++
 modelscope/trainers/hooks/iter_timer_hook.py | 22 +
 modelscope/trainers/hooks/logger/__init__.py | 6 +
 modelscope/trainers/hooks/logger/base.py | 115 +++
 .../trainers/hooks/logger/text_logger_hook.py | 159 ++++
 .../trainers/hooks/lr_scheduler_hook.py | 71 ++
 modelscope/trainers/hooks/optimizer_hook.py | 37 +
 modelscope/trainers/hooks/priority.py | 62 ++
 modelscope/trainers/lrscheduler/__init__.py | 8 +
 modelscope/trainers/lrscheduler/builder.py | 47 ++
 .../trainers/lrscheduler/warmup/__init__.py | 5 +
 .../trainers/lrscheduler/warmup/base.py | 75 ++
 .../trainers/lrscheduler/warmup/warmup.py | 79 ++
modelscope/trainers/optimizer/__init__.py | 4 + modelscope/trainers/optimizer/builder.py | 39 + modelscope/trainers/trainer.py | 703 ++++++++++++++++++ modelscope/trainers/utils/inference.py | 208 ++++++ modelscope/trainers/utils/log_buffer.py | 42 ++ modelscope/utils/checkpoint.py | 74 ++ modelscope/utils/constant.py | 55 +- modelscope/utils/hub.py | 13 + modelscope/utils/import_utils.py | 32 + modelscope/utils/tensor_utils.py | 77 ++ modelscope/utils/torch_utils.py | 127 ++++ output.wav | 3 - tests/models/__init__.py | 0 tests/models/test_base_torch.py | 60 ++ tests/pipelines/test_base.py | 2 +- tests/pipelines/test_image_captioning.py | 2 +- tests/pipelines/test_image_matting.py | 2 +- tests/pipelines/test_person_image_cartoon.py | 2 +- .../pipelines/test_text_to_image_synthesis.py | 2 +- tests/pipelines/test_text_to_speech.py | 2 +- tests/trainers/hooks/__init__.py | 0 tests/trainers/hooks/test_checkpoint_hook.py | 108 +++ .../trainers/hooks/test_lr_scheduler_hook.py | 188 +++++ tests/trainers/hooks/test_timer_hook.py | 128 ++++ tests/trainers/lrscheduler/__init__.py | 0 tests/trainers/lrscheduler/warmup/__init__.py | 0 .../lrscheduler/warmup/test_warmup_base.py | 79 ++ tests/trainers/test_trainer.py | 209 ++++++ tests/trainers/test_trainer_base.py | 19 - tests/trainers/test_trainer_with_nlp.py | 91 +++ 91 files changed, 3936 insertions(+), 68 deletions(-) create mode 100644 configs/cv/configuration.json create mode 100644 configs/nlp/sbert_sentence_similarity.json create mode 100644 modelscope/metrics/__init__.py create mode 100644 modelscope/metrics/base.py create mode 100644 modelscope/metrics/builder.py create mode 100644 modelscope/metrics/sequence_classification_metric.py create mode 100644 modelscope/models/base_torch.py rename modelscope/{pipelines => }/outputs.py (99%) create mode 100644 modelscope/task_datasets/__init__.py create mode 100644 modelscope/task_datasets/base.py create mode 100644 modelscope/task_datasets/builder.py create mode 100644 modelscope/task_datasets/torch_base_dataset.py create mode 100644 modelscope/trainers/hooks/__init__.py create mode 100644 modelscope/trainers/hooks/builder.py create mode 100644 modelscope/trainers/hooks/checkpoint_hook.py create mode 100644 modelscope/trainers/hooks/evaluation_hook.py create mode 100644 modelscope/trainers/hooks/hook.py create mode 100644 modelscope/trainers/hooks/iter_timer_hook.py create mode 100644 modelscope/trainers/hooks/logger/__init__.py create mode 100644 modelscope/trainers/hooks/logger/base.py create mode 100644 modelscope/trainers/hooks/logger/text_logger_hook.py create mode 100644 modelscope/trainers/hooks/lr_scheduler_hook.py create mode 100644 modelscope/trainers/hooks/optimizer_hook.py create mode 100644 modelscope/trainers/hooks/priority.py create mode 100644 modelscope/trainers/lrscheduler/__init__.py create mode 100644 modelscope/trainers/lrscheduler/builder.py create mode 100644 modelscope/trainers/lrscheduler/warmup/__init__.py create mode 100644 modelscope/trainers/lrscheduler/warmup/base.py create mode 100644 modelscope/trainers/lrscheduler/warmup/warmup.py create mode 100644 modelscope/trainers/optimizer/__init__.py create mode 100644 modelscope/trainers/optimizer/builder.py create mode 100644 modelscope/trainers/trainer.py create mode 100644 modelscope/trainers/utils/inference.py create mode 100644 modelscope/trainers/utils/log_buffer.py create mode 100644 modelscope/utils/checkpoint.py create mode 100644 modelscope/utils/tensor_utils.py create mode 100644 modelscope/utils/torch_utils.py 
delete mode 100644 output.wav create mode 100644 tests/models/__init__.py create mode 100644 tests/models/test_base_torch.py create mode 100644 tests/trainers/hooks/__init__.py create mode 100644 tests/trainers/hooks/test_checkpoint_hook.py create mode 100644 tests/trainers/hooks/test_lr_scheduler_hook.py create mode 100644 tests/trainers/hooks/test_timer_hook.py create mode 100644 tests/trainers/lrscheduler/__init__.py create mode 100644 tests/trainers/lrscheduler/warmup/__init__.py create mode 100644 tests/trainers/lrscheduler/warmup/test_warmup_base.py create mode 100644 tests/trainers/test_trainer.py delete mode 100644 tests/trainers/test_trainer_base.py create mode 100644 tests/trainers/test_trainer_with_nlp.py diff --git a/configs/cv/configuration.json b/configs/cv/configuration.json new file mode 100644 index 00000000..fb9ff064 --- /dev/null +++ b/configs/cv/configuration.json @@ -0,0 +1,175 @@ +{ + "framework": "pytorch", + + "task": "image_classification", + "work_dir": "./work_dir", + + "model": { + "type": "classification", + "pretrained": null, + "backbone": { + "type": "ResNet", + "depth": 50, + "out_indices": [ + 4 + ], + "norm_cfg": { + "type": "BN" + } + }, + "head": { + "type": "ClsHead", + "with_avg_pool": true, + "in_channels": 2048, + "loss_config": { + "type": "CrossEntropyLossWithLabelSmooth", + "label_smooth": 0 + }, + "num_classes": 1000 + } + }, + + "dataset": { + "train": { + "type": "ClsDataset", + "data_source": { + "list_file": "data/imagenet_raw/meta/train_labeled.txt", + "root": "data/imagenet_raw/train/", + "type": "ClsSourceImageList" + } + }, + "val": { + "type": "ClsDataset", + "data_source": { + "list_file": "data/imagenet_raw/meta/val_labeled.txt", + "root": "data/imagenet_raw/validation/", + "type": "ClsSourceImageList" + } + }, + "test": {} + }, + + + "preprocessor":{ + "train": [ + { + "type": "RandomResizedCrop", + "size": 224 + }, + { + "type": "RandomHorizontalFlip" + }, + { + "type": "ToTensor" + }, + { + "type": "Normalize", + "mean": [ + 0.485, + 0.456, + 0.406 + ], + "std": [ + 0.229, + 0.224, + 0.225 + ] + }, + { + "type": "Collect", + "keys": [ + "img", + "gt_labels" + ] + } + ], + "val": [ + { + "type": "Resize", + "size": 256 + }, + { + "type": "CenterCrop", + "size": 224 + }, + { + "type": "ToTensor" + }, + { + "type": "Normalize", + "mean": [ + 0.485, + 0.456, + 0.406 + ], + "std": [ + 0.229, + 0.224, + 0.225 + ] + }, + { + "type": "Collect", + "keys": [ + "img", + "gt_labels" + ] + } + ] + }, + + "train": { + "dataloader": { + "batch_size_per_gpu": 2, + "workers_per_gpu": 1 + }, + "optimizer": { + "type": "SGD", + "lr": 0.01, + "options": { + "grad_clip": { + "max_norm": 2.0 + } + } + }, + "lr_scheduler": { + "type": "StepLR", + "step_size": 2, + "options": { + "warmup": { + "type": "LinearWarmup", + "warmup_iters": 2 + + } + } + }, + "hooks": + [ + { + "type": "CheckpointHook", + "interval": 2 + }, + { + "type": "TextLoggerHook", + "interval": 1 + }, + { + "type": "IterTimerHook" + }, + { + "type": "EvaluationHook", + "interval": 1 + } + ] + }, + + "evaluation": { + "dataloader": { + "batch_size_per_gpu": 2, + "workers_per_gpu": 1, + "shuffle": false + }, + "metrics": ["accuracy", "precision", "recall"] + } + +} diff --git a/configs/nlp/sbert_sentence_similarity.json b/configs/nlp/sbert_sentence_similarity.json new file mode 100644 index 00000000..2d8eafdd --- /dev/null +++ b/configs/nlp/sbert_sentence_similarity.json @@ -0,0 +1,77 @@ +{ + "task": "sentence-similarity", + "preprocessor": { + "type": "bert-seq-cls-tokenizer-finetune", + 
"first_sequence": "sentence1", + "second_sequence": "sentence2" + }, + "model": { + "type": "structbert", + "attention_probs_dropout_prob": 0.1, + "easynlp_version": "0.0.3", + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embedding_type": "absolute", + "transformers_version": "4.6.0.dev0", + "type_vocab_size": 2, + "use_cache": true, + "vocab_size": 30522 + }, + "pipeline": { + "type": "sentence-similarity" + }, + "work_dir": "/tmp", + "train": { + "dataloader": { + "batch_size_per_gpu": 2, + "workers_per_gpu": 1 + }, + "optimizer": { + "type": "SGD", + "lr": 0.01, + "options": { + "grad_clip": { + "max_norm": 2.0 + } + } + }, + "lr_scheduler": { + "type": "StepLR", + "step_size": 2, + "options": { + "warmup": { + "type": "LinearWarmup", + "warmup_iters": 2 + } + } + }, + "hooks": [{ + "type": "CheckpointHook", + "interval": 1 + }, { + "type": "TextLoggerHook", + "interval": 1 + }, { + "type": "IterTimerHook" + }, { + "type": "EvaluationHook", + "interval": 1 + }] + }, + "evaluation": { + "dataloader": { + "batch_size_per_gpu": 2, + "workers_per_gpu": 1, + "shuffle": false + } + } + } diff --git a/docs/source/api/modelscope.pipelines.rst b/docs/source/api/modelscope.pipelines.rst index 3e6c2694..e56a9a87 100644 --- a/docs/source/api/modelscope.pipelines.rst +++ b/docs/source/api/modelscope.pipelines.rst @@ -36,10 +36,10 @@ modelscope.pipelines.base module :undoc-members: :show-inheritance: -modelscope.pipelines.outputs module +modelscope.outputs module ----------------------------------- -.. automodule:: modelscope.pipelines.outputs +.. automodule:: modelscope.outputs :members: :undoc-members: :show-inheritance: diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index f03c0dab..79fd3b4f 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -127,3 +127,16 @@ class Preprocessors(object): # multi-modal ofa_image_caption = 'ofa-image-caption' mplug_visual_question_answering = 'mplug-visual-question-answering' + + +class Metrics(object): + """ Names for different metrics. + """ + + # accuracy + accuracy = 'accuracy' + + # metrics for sequence classification task + seq_cls_metric = 'seq_cls_metric' + # metrics for token-classification task + token_cls_metric = 'token-cls-metric' diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py new file mode 100644 index 00000000..681c65c4 --- /dev/null +++ b/modelscope/metrics/__init__.py @@ -0,0 +1,3 @@ +from .base import Metric +from .builder import METRICS, build_metric, task_default_metrics +from .sequence_classification_metric import SequenceClassificationMetric diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py new file mode 100644 index 00000000..1b9db825 --- /dev/null +++ b/modelscope/metrics/base.py @@ -0,0 +1,37 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from abc import ABC, abstractmethod +from typing import Dict + + +class Metric(ABC): + """The metric base class for computing metrics. + + The subclasses can either compute a single metric like 'accuracy', or compute the + complex metrics for a specific task with or without other Metric subclasses. + """ + + @abstractmethod + def add(self, outputs: Dict, inputs: Dict): + """ Append logits and labels within an eval loop. 
+ + Will be called after every batch finished to gather the model predictions and the labels. + + Args: + outputs: The model prediction outputs. + inputs: The mini batch inputs from the dataloader. + + Returns: None + + """ + pass + + @abstractmethod + def evaluate(self): + """Evaluate the metrics after the eval finished. + + Will be called after the whole validation finished. + + Returns: The actual metric dict with standard names. + + """ + pass diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py new file mode 100644 index 00000000..03657b89 --- /dev/null +++ b/modelscope/metrics/builder.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from ..metainfo import Metrics +from ..utils.config import ConfigDict +from ..utils.constant import Tasks +from ..utils.registry import Registry, build_from_cfg, default_group + +METRICS = Registry('metrics') + + +class MetricKeys(object): + ACCURACY = 'accuracy' + F1 = 'f1' + PRECISION = 'precision' + RECALL = 'recall' + + +task_default_metrics = { + Tasks.sentence_similarity: [Metrics.seq_cls_metric], +} + + +def build_metric(metric_name: str, + field: str = default_group, + default_args: dict = None): + """ Build metric given metric_name and field. + + Args: + metric_name (:obj:`str`): The metric name. + field (str, optional): The field of this metric, default value: 'default' for all fields. + default_args (dict, optional): Default initialization arguments. + """ + cfg = ConfigDict({'type': metric_name}) + return build_from_cfg( + cfg, METRICS, group_key=field, default_args=default_args) diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py new file mode 100644 index 00000000..e41c7851 --- /dev/null +++ b/modelscope/metrics/sequence_classification_metric.py @@ -0,0 +1,40 @@ +from typing import Dict, List, Union + +import numpy as np + +from modelscope.outputs import OutputKeys +from ..metainfo import Metrics +from ..utils.registry import default_group +from ..utils.tensor_utils import torch_nested_detach, torch_nested_numpify +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.seq_cls_metric) +class SequenceClassificationMetric(Metric): + """The metric computation class for sequence classification classes. + """ + + label_name = 'labels' + + def __init__(self): + self.preds = [] + self.labels = [] + + def add(self, outputs: Dict, inputs: Dict): + ground_truths = inputs[SequenceClassificationMetric.label_name] + eval_results = outputs[OutputKeys.LOGITS] + self.preds.append( + torch_nested_numpify(torch_nested_detach(eval_results))) + self.labels.append( + torch_nested_numpify(torch_nested_detach(ground_truths))) + + def evaluate(self): + preds = np.concatenate(self.preds, axis=0) + labels = np.concatenate(self.labels, axis=0) + preds = np.argmax(preds, axis=1) + return { + MetricKeys.ACCURACY: + (preds == labels).astype(np.float32).mean().item() + } diff --git a/modelscope/models/base_torch.py b/modelscope/models/base_torch.py new file mode 100644 index 00000000..10899221 --- /dev/null +++ b/modelscope/models/base_torch.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
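For reference, a minimal sketch of how the metric API introduced above is meant to be driven inside an evaluation loop; the toy logits/labels tensors are invented for illustration, while the registry and key names mirror the code added in this patch:

import torch
from modelscope.metainfo import Metrics
from modelscope.metrics import build_metric

metric = build_metric(Metrics.seq_cls_metric)    # -> SequenceClassificationMetric
for _ in range(2):                               # pretend these are two eval batches
    outputs = {'logits': torch.randn(4, 3)}      # model logits: 4 samples, 3 classes
    inputs = {'labels': torch.randint(0, 3, (4,))}
    metric.add(outputs, inputs)
print(metric.evaluate())                         # e.g. {'accuracy': 0.375}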
+ +from typing import Dict + +import torch + +from .base import Model + + +class TorchModel(Model, torch.nn.Module): + """ Base model interface for pytorch + + """ + + def __init__(self, model_dir=None, *args, **kwargs): + # init reference: https://stackoverflow.com/questions\ + # /9575409/calling-parent-class-init-with-multiple-inheritance-whats-the-right-way + super().__init__(model_dir) + super(Model, self).__init__() + + def forward(self, inputs: Dict[str, + torch.Tensor]) -> Dict[str, torch.Tensor]: + raise NotImplementedError diff --git a/modelscope/models/multi_modal/clip/clip_model.py b/modelscope/models/multi_modal/clip/clip_model.py index 839b8d0e..79f7ae44 100644 --- a/modelscope/models/multi_modal/clip/clip_model.py +++ b/modelscope/models/multi_modal/clip/clip_model.py @@ -108,7 +108,7 @@ class CLIPForMultiModalEmbedding(Model): return text_ids_tensor, text_mask_tensor def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - from modelscope.pipelines.outputs import OutputKeys + from modelscope.outputs import OutputKeys output = { OutputKeys.IMG_EMBEDDING: None, OutputKeys.TEXT_EMBEDDING: None @@ -134,7 +134,7 @@ class CLIPForMultiModalEmbedding(Model): img_embedding = self.clip_model( input_data=img_tensor, input_type='img') - from modelscope.pipelines.outputs import OutputKeys + from modelscope.outputs import OutputKeys output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy() if 'text' in input and input['text'] is not None: diff --git a/modelscope/models/multi_modal/image_captioning_model.py b/modelscope/models/multi_modal/image_captioning_model.py index 5c0a3ddf..05fc44d3 100644 --- a/modelscope/models/multi_modal/image_captioning_model.py +++ b/modelscope/models/multi_modal/image_captioning_model.py @@ -76,7 +76,7 @@ class OfaForImageCaptioning(Model): input = fairseq.utils.move_to_cuda(input, device=self._device) results, _ = self.eval_caption(self.task, self.generator, self.models, input) - from ...pipelines.outputs import OutputKeys + from modelscope.outputs import OutputKeys return { 'image_id': results[0]['image_id'], OutputKeys.CAPTION: results[0][OutputKeys.CAPTION] diff --git a/modelscope/models/nlp/sbert_for_sequence_classification.py b/modelscope/models/nlp/sbert_for_sequence_classification.py index 861b6fe2..a685fcf7 100644 --- a/modelscope/models/nlp/sbert_for_sequence_classification.py +++ b/modelscope/models/nlp/sbert_for_sequence_classification.py @@ -7,7 +7,10 @@ import torch from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel from torch import nn +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks from ..base import Model +from ..builder import MODELS class SbertTextClassfier(SbertPreTrainedModel): @@ -20,7 +23,11 @@ class SbertTextClassfier(SbertPreTrainedModel): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - def forward(self, input_ids=None, token_type_ids=None): + def forward(self, + input_ids=None, + token_type_ids=None, + labels=None, + **kwargs): outputs = self.encoder( input_ids, token_type_ids=token_type_ids, @@ -29,6 +36,10 @@ class SbertTextClassfier(SbertPreTrainedModel): pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return {'logits': logits, 'loss': loss} return {'logits': logits} diff --git 
a/modelscope/pipelines/outputs.py b/modelscope/outputs.py similarity index 99% rename from modelscope/pipelines/outputs.py rename to modelscope/outputs.py index b1b0e86c..a169f40b 100644 --- a/modelscope/pipelines/outputs.py +++ b/modelscope/outputs.py @@ -4,6 +4,7 @@ from modelscope.utils.constant import Tasks class OutputKeys(object): + LOGITS = 'logits' SCORES = 'scores' LABEL = 'label' LABELS = 'labels' diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py index 30d129ca..cb37c343 100644 --- a/modelscope/pipelines/audio/ans_pipeline.py +++ b/modelscope/pipelines/audio/ans_pipeline.py @@ -7,10 +7,10 @@ import soundfile as sf import torch from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks from ..base import Input, Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys def audio_norm(x): diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py index ea07565e..c0e58ca0 100644 --- a/modelscope/pipelines/audio/linear_aec_pipeline.py +++ b/modelscope/pipelines/audio/linear_aec_pipeline.py @@ -9,12 +9,12 @@ import yaml from modelscope.fileio import File from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.preprocessors.audio import LinearAECAndFbank from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys logger = get_logger() diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py index 23380e25..d8aefd57 100644 --- a/modelscope/pipelines/audio/text_to_speech_pipeline.py +++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py @@ -5,9 +5,9 @@ import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.models.audio.tts import SambertHifigan +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, InputModel, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.constant import Fields, Tasks __all__ = ['TextToSpeechSambertHifiganPipeline'] diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index b2d17777..8c260ece 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -7,10 +7,10 @@ from typing import Any, Dict, Generator, List, Union from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.base import Model from modelscope.msdatasets import MsDataset +from modelscope.outputs import TASK_OUTPUTS from modelscope.preprocessors import Preprocessor from modelscope.utils.config import Config from modelscope.utils.logger import get_logger -from .outputs import TASK_OUTPUTS from .util import is_model, is_official_hub_path Tensor = Union['torch.Tensor', 'tf.Tensor'] diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py index ad453fd8..a53acd26 100644 --- a/modelscope/pipelines/cv/action_recognition_pipeline.py +++ b/modelscope/pipelines/cv/action_recognition_pipeline.py @@ -6,6 +6,7 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.cv.action_recognition.models import BaseVideoModel +from modelscope.outputs import 
OutputKeys from modelscope.pipelines.base import Input from modelscope.preprocessors.video import ReadVideoData from modelscope.utils.config import Config @@ -13,7 +14,6 @@ from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys logger = get_logger() diff --git a/modelscope/pipelines/cv/animal_recog_pipeline.py b/modelscope/pipelines/cv/animal_recog_pipeline.py index 2a5f3094..ddc3acea 100644 --- a/modelscope/pipelines/cv/animal_recog_pipeline.py +++ b/modelscope/pipelines/cv/animal_recog_pipeline.py @@ -10,13 +10,13 @@ from torchvision import transforms from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Pipelines from modelscope.models.cv.animal_recognition import resnet +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input from modelscope.preprocessors import load_image from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys logger = get_logger() diff --git a/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py b/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py index 850f2914..47d90d71 100644 --- a/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py +++ b/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py @@ -11,8 +11,8 @@ from PIL import Image from modelscope.metainfo import Pipelines from modelscope.models.cv.cmdssl_video_embedding.resnet2p1d import \ resnet26_2p1d +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index 1b064e66..377e0235 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -11,13 +11,13 @@ from modelscope.models.cv.cartoon.facelib.facer import FaceAna from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import ( get_reference_facial_points, warp_and_crop_face) from modelscope.models.cv.cartoon.utils import get_f5p, padTo16x, resize_size +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input from modelscope.preprocessors import load_image from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys if tf.__version__ >= '2.0': tf = tf.compat.v1 diff --git a/modelscope/pipelines/cv/image_matting_pipeline.py b/modelscope/pipelines/cv/image_matting_pipeline.py index c645daa2..5716a1f5 100644 --- a/modelscope/pipelines/cv/image_matting_pipeline.py +++ b/modelscope/pipelines/cv/image_matting_pipeline.py @@ -6,13 +6,13 @@ import numpy as np import PIL from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input from modelscope.preprocessors import load_image from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys logger = 
get_logger() diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index 0333400d..d8b31389 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -7,13 +7,13 @@ import PIL import tensorflow as tf from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input from modelscope.preprocessors import load_image from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils if tf.__version__ >= '2.0': diff --git a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py index 5b8d43d1..d78cb5c7 100644 --- a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py +++ b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py @@ -3,12 +3,12 @@ from typing import Any, Dict import torch from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from ..base import Model, Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys logger = get_logger() diff --git a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py index 0fd863a6..842ae5a2 100644 --- a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Union +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SpaceForDialogIntent @@ -9,7 +10,6 @@ from ...preprocessors import DialogIntentPredictionPreprocessor from ...utils.constant import Tasks from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['DialogIntentPredictionPipeline'] diff --git a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py index ed7a826b..6d91bd67 100644 --- a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py @@ -2,6 +2,7 @@ from typing import Dict, Union +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SpaceForDialogModeling @@ -9,7 +10,6 @@ from ...preprocessors import DialogModelingPreprocessor from ...utils.constant import Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['DialogModelingPipeline'] diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 9c2c9b0d..dc6df061 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -1,12 +1,12 @@ from typing import Any, Dict, Union +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model, SpaceForDialogStateTracking from ...preprocessors import 
DialogStateTrackingPreprocessor from ...utils.constant import Tasks from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['DialogStateTrackingPipeline'] diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 9ed48a30..5fddea44 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -3,6 +3,7 @@ from typing import Any, Dict, Optional, Union import torch +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp.masked_language import MaskedLanguageModelBase @@ -11,7 +12,6 @@ from ...utils.config import Config from ...utils.constant import ModelFile, Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['FillMaskPipeline'] _type_map = {'veco': 'roberta', 'sbert': 'bert'} diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py index 7ed050be..6c4d9135 100644 --- a/modelscope/pipelines/nlp/nli_pipeline.py +++ b/modelscope/pipelines/nlp/nli_pipeline.py @@ -4,6 +4,7 @@ from typing import Any, Dict, Union import numpy as np import torch +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SbertForNLI @@ -11,7 +12,6 @@ from ...preprocessors import NLIPreprocessor from ...utils.constant import Tasks from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['NLIPipeline'] diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py index c8484521..8ed7d0f9 100644 --- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py @@ -3,6 +3,7 @@ from typing import Any, Dict, Union import numpy as np import torch +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SbertForSentenceSimilarity @@ -10,7 +11,6 @@ from ...preprocessors import SentenceSimilarityPreprocessor from ...utils.constant import Tasks from ..base import Input, Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['SentenceSimilarityPipeline'] diff --git a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py index 7bd75218..f77281a6 100644 --- a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py @@ -3,6 +3,7 @@ from typing import Any, Dict, Union import numpy as np import torch +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SbertForSentimentClassification @@ -10,7 +11,6 @@ from ...preprocessors import SentimentClassificationPreprocessor from ...utils.constant import Tasks from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['SentimentClassificationPipeline'] diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py index ec765f55..3ade16e9 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ 
-4,12 +4,12 @@ import numpy as np from modelscope.metainfo import Pipelines from modelscope.models.nlp import BertForSequenceClassification +from modelscope.outputs import OutputKeys from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from ...models import Model from ..base import Input, Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['SequenceClassificationPipeline'] diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index d383838e..d68f0039 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union import torch +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import PalmForTextGeneration @@ -9,7 +10,6 @@ from ...preprocessors import TextGenerationPreprocessor from ...utils.constant import Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['TextGenerationPipeline'] diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index 339428ff..6ae28c31 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -4,6 +4,7 @@ from typing import Any, Dict import numpy as np import tensorflow as tf +from modelscope.outputs import OutputKeys from ...hub.snapshot_download import snapshot_download from ...metainfo import Pipelines from ...models.nlp import CsanmtForTranslation @@ -11,7 +12,6 @@ from ...utils.constant import ModelFile, Tasks from ...utils.logger import get_logger from ..base import Pipeline, Tensor from ..builder import PIPELINES -from ..outputs import OutputKeys if tf.__version__ >= '2.0': tf = tf.compat.v1 diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index c220adbb..65f28e7b 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union import torch +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SbertForTokenClassification @@ -9,7 +10,6 @@ from ...preprocessors import TokenClassificationPreprocessor from ...utils.constant import Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = ['WordSegmentationPipeline'] diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 818a968f..2cd788bc 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -3,6 +3,7 @@ from typing import Any, Dict, Union import torch from scipy.special import softmax +from modelscope.outputs import OutputKeys from ...metainfo import Pipelines from ...models import Model from ...models.nlp import SbertForZeroShotClassification @@ -10,7 +11,6 @@ from ...preprocessors import ZeroShotClassificationPreprocessor from ...utils.constant import Tasks from ..base import Pipeline from ..builder import PIPELINES -from ..outputs import OutputKeys __all__ = 
['ZeroShotClassificationPipeline'] diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 360d97aa..c4c3aa71 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -8,6 +8,7 @@ from transformers import AutoTokenizer from ..metainfo import Preprocessors from ..models import Model from ..utils.constant import Fields, InputFields +from ..utils.hub import parse_label_mapping from ..utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS @@ -115,7 +116,8 @@ class SentenceSimilarityPreprocessor(NLPPreprocessorBase): def __init__(self, model_dir: str, *args, **kwargs): kwargs['truncation'] = True - kwargs['padding'] = False + kwargs['padding'] = False if 'padding' not in kwargs else kwargs[ + 'padding'] kwargs['return_tensors'] = 'pt' kwargs['max_length'] = kwargs.pop('sequence_length', 128) super().__init__(model_dir, *args, **kwargs) @@ -143,6 +145,7 @@ class SequenceClassificationPreprocessor(Preprocessor): self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) print(f'this is the tokenzier {self.tokenizer}') + self.label2id = parse_label_mapping(self.model_dir) @type_assert(object, (str, tuple, Dict)) def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: @@ -164,7 +167,7 @@ class SequenceClassificationPreprocessor(Preprocessor): 'id': [], 'input_ids': [], 'attention_mask': [], - 'token_type_ids': [] + 'token_type_ids': [], } max_seq_length = self.sequence_length @@ -186,6 +189,29 @@ class SequenceClassificationPreprocessor(Preprocessor): return rst +@PREPROCESSORS.register_module( + Fields.nlp, module_name='bert-seq-cls-tokenizer-finetune') +class SentenceSimilarityFinetunePreprocessor(SentenceSimilarityPreprocessor): + """Sentence similarity preprocessor in the finetune scenario + + Mainly added the label mapping procedure. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + kwargs['padding'] = 'max_length' + super().__init__(model_dir, *args, **kwargs) + self.label2id = parse_label_mapping(self.model_dir) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: + rst = super().__call__(data) + rst = {k: v.squeeze() for k, v in rst.items()} + if self.label2id is not None and 'label' in data: + rst['labels'] = [] + rst['labels'].append(self.label2id[str(data['label'])]) + return rst + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer) class TextGenerationPreprocessor(NLPPreprocessorBase): diff --git a/modelscope/task_datasets/__init__.py b/modelscope/task_datasets/__init__.py new file mode 100644 index 00000000..3576e0da --- /dev/null +++ b/modelscope/task_datasets/__init__.py @@ -0,0 +1,3 @@ +from .base import TaskDataset +from .builder import build_task_dataset +from .torch_base_dataset import TorchTaskDataset diff --git a/modelscope/task_datasets/base.py b/modelscope/task_datasets/base.py new file mode 100644 index 00000000..90888a4c --- /dev/null +++ b/modelscope/task_datasets/base.py @@ -0,0 +1,48 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from abc import ABC, abstractmethod +from typing import Any, List, Tuple + + +class TaskDataset(ABC): + """The task dataset base class for all the task specific dataset processors. 
+ """ + + def __init__(self, + datasets: Tuple[Any, List[Any]], + mode, + preprocessor=None, + **kwargs): + super().__init__() + self.mode = mode + self.preprocessor = preprocessor + self._inner_dataset = self.compose_dataset(datasets) + + @abstractmethod + def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + """Prepare a dataset. + + User can process the input datasets in a whole dataset perspective. + This method also helps to merge several datasets to one. + + Args: + datasets: The original dataset(s) + + Returns: A single dataset, which may be created after merging. + + """ + pass + + @abstractmethod + def preprocess_dataset(self, data): + """Preprocess the data fetched from the inner_dataset. + + If the preprocessor is None, the original data will be returned, else the preprocessor will be called. + User can override this method to implement custom logics. + + Args: + data: The data fetched from the dataset. + + Returns: The processed data. + + """ + pass diff --git a/modelscope/task_datasets/builder.py b/modelscope/task_datasets/builder.py new file mode 100644 index 00000000..683bec8f --- /dev/null +++ b/modelscope/task_datasets/builder.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.utils.config import ConfigDict +from modelscope.utils.registry import Registry, build_from_cfg + +TASK_DATASETS = Registry('task_datasets') + + +def build_task_dataset(cfg: ConfigDict, + task_name: str = None, + default_args: dict = None): + """ Build task specific dataset processor given model config dict and the task name. + + Args: + cfg (:obj:`ConfigDict`): config dict for model object. + task_name (str, optional): task name, refer to + :obj:`Tasks` for more details + default_args (dict, optional): Default initialization arguments. + """ + return build_from_cfg( + cfg, TASK_DATASETS, group_key=task_name, default_args=default_args) diff --git a/modelscope/task_datasets/torch_base_dataset.py b/modelscope/task_datasets/torch_base_dataset.py new file mode 100644 index 00000000..e2fb7417 --- /dev/null +++ b/modelscope/task_datasets/torch_base_dataset.py @@ -0,0 +1,63 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, List, Tuple + +from torch.utils.data import ConcatDataset, Dataset + +from .base import TaskDataset + + +class TorchTaskDataset(TaskDataset, Dataset): + """The task dataset base class for all the torch-based task processors. + + This base class is enough for most cases, except there are procedures which can not be executed in + preprocessors and Datasets like dataset merging. + """ + + def __init__(self, + datasets: Tuple[Any, List[Any]], + mode, + preprocessor=None, + **kwargs): + TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs) + + def __getitem__(self, index) -> Any: + return self.preprocess_dataset(self._inner_dataset[index]) + + def __len__(self): + return len(self._inner_dataset) + + def compose_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + """Prepare a dataset. + + User can process the input datasets in a whole dataset perspective. + This method gives a default implementation of datasets merging, user can override this + method to write custom logics. + + Args: + datasets: The original dataset(s) + + Returns: A single dataset, which may be created after merging. 
+ + """ + if isinstance(datasets, List): + if len(datasets) == 1: + return datasets[0] + elif len(datasets) > 1: + return ConcatDataset(datasets) + else: + return datasets + + def preprocess_dataset(self, data): + """Preprocess the data fetched from the inner_dataset. + + If the preprocessor is None, the original data will be returned, else the preprocessor will be called. + User can override this method to implement custom logics. + + Args: + data: The data fetched from the dataset. + + Returns: The processed data. + + """ + return self.preprocessor( + data) if self.preprocessor is not None else data diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index 589f325d..74d96e59 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -1,3 +1,4 @@ from .base import DummyTrainer from .builder import build_trainer from .nlp import SequenceClassificationTrainer +from .trainer import EpochBasedTrainer diff --git a/modelscope/trainers/base.py b/modelscope/trainers/base.py index 372938b4..c0bf51f3 100644 --- a/modelscope/trainers/base.py +++ b/modelscope/trainers/base.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import time from abc import ABC, abstractmethod from typing import Callable, Dict, List, Optional, Tuple, Union from modelscope.trainers.builder import TRAINERS from modelscope.utils.config import Config +from .utils.log_buffer import LogBuffer class BaseTrainer(ABC): @@ -27,6 +29,8 @@ class BaseTrainer(ABC): self.args = self.cfg.to_args(arg_parse_fn) else: self.args = None + self.log_buffer = LogBuffer() + self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) @abstractmethod def train(self, *args, **kwargs): diff --git a/modelscope/trainers/builder.py b/modelscope/trainers/builder.py index 2192d46c..7f787011 100644 --- a/modelscope/trainers/builder.py +++ b/modelscope/trainers/builder.py @@ -5,9 +5,10 @@ from modelscope.utils.constant import Tasks from modelscope.utils.registry import Registry, build_from_cfg TRAINERS = Registry('trainers') +HOOKS = Registry('hooks') -def build_trainer(name: str = None, default_args: dict = None): +def build_trainer(name: str = 'EpochBasedTrainer', default_args: dict = None): """ build trainer given a trainer name Args: @@ -15,7 +16,5 @@ def build_trainer(name: str = None, default_args: dict = None): will be used. default_args (dict, optional): Default initialization arguments. """ - if name is None: - name = 'Trainer' cfg = dict(type=name) return build_from_cfg(cfg, TRAINERS, default_args=default_args) diff --git a/modelscope/trainers/hooks/__init__.py b/modelscope/trainers/hooks/__init__.py new file mode 100644 index 00000000..d54c110a --- /dev/null +++ b/modelscope/trainers/hooks/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from .builder import HOOKS, build_hook +from .checkpoint_hook import CheckpointHook +from .evaluation_hook import EvaluationHook +from .hook import Hook +from .iter_timer_hook import IterTimerHook +from .logger.text_logger_hook import TextLoggerHook +from .lr_scheduler_hook import LrSchedulerHook +from .optimizer_hook import OptimizerHook +from .priority import Priority + +__all__ = [ + 'Hook', 'HOOKS', 'CheckpointHook', 'EvaluationHook', 'LrSchedulerHook', + 'OptimizerHook', 'Priority', 'build_hook', 'TextLoggerHook', + 'IterTimerHook' +] diff --git a/modelscope/trainers/hooks/builder.py b/modelscope/trainers/hooks/builder.py new file mode 100644 index 00000000..1948e481 --- /dev/null +++ b/modelscope/trainers/hooks/builder.py @@ -0,0 +1,9 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.utils.registry import Registry, build_from_cfg, default_group + +HOOKS = Registry('hooks') + + +def build_hook(cfg, default_args=None): + return build_from_cfg( + cfg, HOOKS, group_key=default_group, default_args=default_args) diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py new file mode 100644 index 00000000..6892ac9c --- /dev/null +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -0,0 +1,92 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os + +from modelscope import __version__ +from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import get_dist_info +from .builder import HOOKS +from .hook import Hook +from .priority import Priority + + +@HOOKS.register_module() +class CheckpointHook(Hook): + """Save checkpoints periodically. + + Args: + interval (int): The frequency to save model. If `by_epoch=True`, + it means the number of epochs, else means the number of iterations + by_epoch (bool): Saving checkpoints by epoch or by iteration. + save_optimizer (bool): Whether to save optimizer state dict. Default: True. + save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir` + save_last (bool): Whether to save the last checkpoint. Default: True. 
+ """ + + PRIORITY = Priority.NORMAL + + def __init__(self, + interval=0, + by_epoch=True, + save_optimizer=True, + save_dir=None, + save_last=True): + self.interval = interval + self.by_epoch = by_epoch + self.save_optimizer = save_optimizer + self.save_dir = save_dir + self.save_last = save_last + + def before_run(self, trainer): + if not self.save_dir: + self.save_dir = trainer.work_dir + + if not hasattr(trainer, 'logger'): + self.logger = get_logger(__name__) + else: + self.logger = trainer.logger + + self.logger.info(f'Checkpoints will be saved to {self.save_dir}') + + def after_train_epoch(self, trainer): + if not self.by_epoch: + return + + if self._should_save(trainer): + self.logger.info(f'Saving checkpoint at {trainer.epoch + 1} epoch') + self._save_checkpoint(trainer) + + def _save_checkpoint(self, trainer): + if self.by_epoch: + cur_save_name = os.path.join(self.save_dir, + f'epoch_{trainer.epoch + 1}.pth') + else: + cur_save_name = os.path.join(self.save_dir, + f'iter_{trainer.epoch + 1}.pth') + + rank, _ = get_dist_info() + if rank == 0: + save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) + + def after_train_iter(self, trainer): + if self.by_epoch: + return + + if self._should_save(trainer): + self.logger.info( + f'Saving checkpoint at {trainer.iter + 1} iterations') + self._save_checkpoint(trainer) + + def _should_save(self, trainer): + if self.by_epoch: + check_last = self.is_last_epoch + check_frequency = self.every_n_epochs + else: + check_last = self.is_last_iter + check_frequency = self.every_n_iters + + if check_frequency(trainer, + self.interval) or (self.save_last + and check_last(trainer)): + return True + return False diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py new file mode 100644 index 00000000..7b06a170 --- /dev/null +++ b/modelscope/trainers/hooks/evaluation_hook.py @@ -0,0 +1,74 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .builder import HOOKS +from .hook import Hook +from .priority import Priority + + +@HOOKS.register_module() +class EvaluationHook(Hook): + """Evaluation hook. + Args: + interval (int): Evaluation interval. + by_epoch (bool): Evaluate by epoch or by iteration. + start_idx (int | None, optional): The epoch/iterations validation begins. + Default: None, validate every interval epochs/iterations from scratch. + """ + + PRIORITY = Priority.NORMAL + + def __init__(self, interval=1, by_epoch=True, start_idx=None): + + assert interval > 0, 'interval must be a positive number' + + self.interval = interval + self.start_idx = start_idx + self.by_epoch = by_epoch + + def after_train_iter(self, trainer): + """Called after every training iter to evaluate the results.""" + if not self.by_epoch and self._should_evaluate(trainer): + self.do_evaluate(trainer) + + def after_train_epoch(self, trainer): + """Called after every training epoch to evaluate the results.""" + if self.by_epoch and self._should_evaluate(trainer): + self.do_evaluate(trainer) + + def do_evaluate(self, trainer): + """Evaluate the results.""" + eval_res = trainer.evaluate() + for name, val in eval_res.items(): + trainer.log_buffer.output[name] = val + + trainer.log_buffer.ready = True + + def _should_evaluate(self, trainer): + """Judge whether to perform evaluation. + + Here is the rule to judge whether to perform evaluation: + 1. It will not perform evaluation during the epoch/iteration interval, + which is determined by ``self.interval``. + 2. 
It will not perform evaluation if the ``start_idx`` is larger than + current epochs/iters. + 3. It will not perform evaluation when current epochs/iters is larger than + the ``start_idx`` but during epoch/iteration interval. + + Returns: + bool: The flag indicating whether to perform evaluation. + """ + if self.by_epoch: + current = trainer.epoch + check_time = self.every_n_epochs + else: + current = trainer.iter + check_time = self.every_n_iters + + if self.start_idx is None: + if not check_time(trainer, self.interval): + return False + elif (current + 1) < self.start_idx: + return False + else: + if (current + 1 - self.start_idx) % self.interval: + return False + return True diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py new file mode 100644 index 00000000..e7ad2c37 --- /dev/null +++ b/modelscope/trainers/hooks/hook.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.utils.import_utils import is_method_overridden +from .priority import Priority + + +class Hook: + """ + The Hook base class of any modelscope trainer. You can build your own hook inherited from this class. + """ + + # TODO @jiangnana.jnn use constant variable for stages + stages = ('before_run', 'before_train_epoch', 'before_train_iter', + 'after_train_iter', 'after_train_epoch', 'before_val_epoch', + 'before_val_iter', 'after_val_iter', 'after_val_epoch', + 'after_run') + PRIORITY = Priority.NORMAL + + def before_run(self, trainer): + """ + Will be called before any loop begins. + Args: + trainer: The trainer instance. + + Returns: None + + """ + pass + + def after_run(self, trainer): + """ + Will be called after all loops end. + Args: + trainer: The trainer instance. + + Returns: None + + """ + pass + + def before_epoch(self, trainer): + """ + Will be called before every epoch begins. + Args: + trainer: The trainer instance. + + Returns: None + + """ + pass + + def after_epoch(self, trainer): + """ + Will be called after every epoch ends. + Args: + trainer: The trainer instance. + + Returns: None + + """ + pass + + def before_iter(self, trainer): + """ + Will be called before every loop begins. + Args: + trainer: The trainer instance. + + Returns: None + """ + pass + + def after_iter(self, trainer): + """ + Will be called after every loop ends. + Args: + trainer: The trainer instance. + + Returns: None + """ + pass + + def before_train_epoch(self, trainer): + """ + Will be called before every train epoch begins. Default call ``self.before_epoch`` + Args: + trainer: The trainer instance. + + Returns: None + + """ + self.before_epoch(trainer) + + def before_val_epoch(self, trainer): + """ + Will be called before every validation epoch begins. Default call ``self.before_epoch`` + Args: + trainer: The trainer instance. + + Returns: None + + """ + self.before_epoch(trainer) + + def after_train_epoch(self, trainer): + """ + Will be called after every train epoch ends. Default call ``self.after_epoch`` + Args: + trainer: The trainer instance. + + Returns: None + + """ + self.after_epoch(trainer) + + def after_val_epoch(self, trainer): + """ + Will be called after every validation epoch ends. Default call ``self.after_epoch`` + Args: + trainer: The trainer instance. + + Returns: None + + """ + self.after_epoch(trainer) + + def before_train_iter(self, trainer): + """ + Will be called before every train loop begins. Default call ``self.before_iter`` + Args: + trainer: The trainer instance. 
+ + Returns: None + """ + self.before_iter(trainer) + + def before_val_iter(self, trainer): + """ + Will be called before every validation loop begins. Default call ``self.before_iter`` + Args: + trainer: The trainer instance. + + Returns: None + """ + self.before_iter(trainer) + + def after_train_iter(self, trainer): + """ + Will be called after every train loop ends. Default call ``self.after_iter`` + Args: + trainer: The trainer instance. + + Returns: None + """ + self.after_iter(trainer) + + def after_val_iter(self, trainer): + """ + Will be called after every validation loop ends. Default call ``self.after_iter`` + Args: + trainer: The trainer instance. + + Returns: None + """ + self.after_iter(trainer) + + def every_n_epochs(self, trainer, n): + """ + Whether to reach every ``n`` epochs + Returns: bool + """ + return (trainer.epoch + 1) % n == 0 if n > 0 else False + + def every_n_iters(self, trainer, n): + """ + Whether to reach every ``n`` iterations + Returns: bool + """ + return (trainer.iter + 1) % n == 0 if n > 0 else False + + def end_of_epoch(self, trainer): + """ + Whether to reach the end of every epoch + Returns: bool + """ + return trainer.inner_iter + 1 == len(trainer.data_loader) + + def is_last_epoch(self, trainer): + """ + Whether to reach the last epoch + Returns: bool + """ + return trainer.epoch + 1 == trainer._max_epochs + + def is_last_iter(self, trainer): + """ + Whether to reach the last iteration in the entire training process + Returns: bool + """ + return trainer.iter + 1 == trainer._max_iters + + def get_triggered_stages(self): + trigger_stages = set() + for stage in Hook.stages: + if is_method_overridden(stage, Hook, self): + trigger_stages.add(stage) + + return [stage for stage in Hook.stages if stage in trigger_stages] diff --git a/modelscope/trainers/hooks/iter_timer_hook.py b/modelscope/trainers/hooks/iter_timer_hook.py new file mode 100644 index 00000000..b57d8653 --- /dev/null +++ b/modelscope/trainers/hooks/iter_timer_hook.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import time + +from .builder import HOOKS +from .hook import Hook +from .priority import Priority + + +@HOOKS.register_module() +class IterTimerHook(Hook): + PRIORITY = Priority.LOW + + def before_epoch(self, trainer): + self.start_time = time.time() + + def before_iter(self, trainer): + trainer.log_buffer.update( + {'data_load_time': time.time() - self.start_time}) + + def after_iter(self, trainer): + trainer.log_buffer.update({'time': time.time() - self.start_time}) + self.start_time = time.time() diff --git a/modelscope/trainers/hooks/logger/__init__.py b/modelscope/trainers/hooks/logger/__init__.py new file mode 100644 index 00000000..16eb8797 --- /dev/null +++ b/modelscope/trainers/hooks/logger/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.trainers.utils.log_buffer import LogBuffer +from .base import LoggerHook +from .text_logger_hook import TextLoggerHook + +__all__ = ['TextLoggerHook', 'LoggerHook', 'LogBuffer'] diff --git a/modelscope/trainers/hooks/logger/base.py b/modelscope/trainers/hooks/logger/base.py new file mode 100644 index 00000000..98c5e421 --- /dev/null +++ b/modelscope/trainers/hooks/logger/base.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. 
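# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of this patch. The Hook base
# class above only defines empty stage callbacks, and the trainer discovers
# which ones a concrete hook actually overrides via ``get_triggered_stages()``.
# A minimal custom hook could therefore look like the following; the hook name
# and the ``trainer.model.embeddings`` attribute are hypothetical, while the
# registry, base class and priorities come from this patch.

from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.hook import Hook
from modelscope.trainers.hooks.priority import Priority


@HOOKS.register_module()
class FreezeEmbeddingsHook(Hook):
    PRIORITY = Priority.HIGH

    def before_run(self, trainer):
        # Freeze a (hypothetical) embeddings sub-module before training starts.
        for param in trainer.model.embeddings.parameters():
            param.requires_grad = False
# ---------------------------------------------------------------------------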
+import numbers +from abc import ABCMeta, abstractmethod + +import numpy as np +import torch + +from modelscope.trainers.hooks.hook import Hook +from ..priority import Priority + + +class LoggerHook(Hook): + """Base class for logger hooks. + + Args: + interval (int): Logging interval (every k iterations). + ignore_last (bool): Ignore the log of last iterations in each epoch + if less than `interval`. + reset_flag (bool): Whether to clear the output buffer after logging. + by_epoch (bool): Whether EpochBasedtrainer is used. + """ + + __metaclass__ = ABCMeta + PRIORITY = Priority.VERY_LOW + + def __init__(self, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True): + self.interval = interval + self.ignore_last = ignore_last + self.reset_flag = reset_flag + self.by_epoch = by_epoch + + @abstractmethod + def log(self, trainer): + pass + + @staticmethod + def is_scalar(val, include_np=True, include_torch=True): + """Tell the input variable is a scalar or not. + + Args: + val: Input variable. + include_np (bool): Whether to treat 0-d np.ndarray as a scalar. + include_torch (bool): Whether to treat 0-d torch.Tensor as a scalar. + + Returns: + bool: True or False. + """ + if isinstance(val, numbers.Number): + return True + elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: + return True + elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: + return True + else: + return False + + def get_epoch(self, trainer): + if trainer.mode == 'train': + epoch = trainer.epoch + 1 + elif trainer.mode == 'val': + # normal val mode + # trainer.epoch += 1 has been done before val workflow + epoch = trainer.epoch + else: + raise ValueError(f"trainer mode should be 'train' or 'val', " + f'but got {trainer.mode}') + return epoch + + def get_iter(self, trainer, inner_iter=False): + """Get the current training iteration step.""" + if self.by_epoch and inner_iter: + current_iter = trainer.inner_iter + 1 + else: + current_iter = trainer.iter + 1 + return current_iter + + def before_run(self, trainer): + for hook in trainer.hooks[::-1]: + if isinstance(hook, LoggerHook): + hook.reset_flag = True + break + + def before_epoch(self, trainer): + trainer.log_buffer.clear() # clear logs of last epoch + + def after_train_iter(self, trainer): + if self.by_epoch and self.every_n_epochs(trainer, self.interval): + trainer.log_buffer.average(self.interval) + elif not self.by_epoch and self.every_n_iters(trainer, self.interval): + trainer.log_buffer.average(self.interval) + elif self.end_of_epoch(trainer) and not self.ignore_last: + # not precise but more stable + trainer.log_buffer.average(self.interval) + + if trainer.log_buffer.ready: + self.log(trainer) + if self.reset_flag: + trainer.log_buffer.clear_output() + + def after_train_epoch(self, trainer): + if trainer.log_buffer.ready: + self.log(trainer) + if self.reset_flag: + trainer.log_buffer.clear_output() + + def after_val_epoch(self, trainer): + trainer.log_buffer.average() + self.log(trainer) + if self.reset_flag: + trainer.log_buffer.clear_output() diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py new file mode 100644 index 00000000..c6e39400 --- /dev/null +++ b/modelscope/trainers/hooks/logger/text_logger_hook.py @@ -0,0 +1,159 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
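# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of this patch. ``LoggerHook``
# above averages ``trainer.log_buffer`` on its logging interval and then calls
# the abstract ``log()`` method, so a concrete logger only has to read the
# averaged values from ``trainer.log_buffer.output``. A bare-bones console
# logger (hypothetical name, no json file output) could look like this:

from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.logger.base import LoggerHook


@HOOKS.register_module()
class PrintLoggerHook(LoggerHook):

    def log(self, trainer):
        # ``output`` holds values already averaged over ``self.interval`` iters.
        scalars = {
            key: val
            for key, val in trainer.log_buffer.output.items()
            if self.is_scalar(val)
        }
        print(f'mode={trainer.mode} epoch={self.get_epoch(trainer)} '
              f'iter={self.get_iter(trainer, inner_iter=True)} {scalars}')
# ---------------------------------------------------------------------------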
+import datetime +import os +import os.path as osp +from collections import OrderedDict + +import json +import torch +from torch import distributed as dist + +from modelscope.utils.torch_utils import get_dist_info +from ..builder import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class TextLoggerHook(LoggerHook): + """Logger hook in text, Output log to both console and local json file. + + Args: + by_epoch (bool, optional): Whether EpochBasedtrainer is used. + Default: True. + interval (int, optional): Logging interval (every k iterations). + Default: 10. + ignore_last (bool, optional): Ignore the log of last iterations in each + epoch if less than :attr:`interval`. Default: True. + reset_flag (bool, optional): Whether to clear the output buffer after + logging. Default: False. + out_dir (str): The directory to save log. If is None, use `trainer.work_dir` + """ + + def __init__(self, + by_epoch=True, + interval=10, + ignore_last=True, + reset_flag=False, + out_dir=None): + super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, + by_epoch) + self.by_epoch = by_epoch + self.time_sec_tot = 0 + self.out_dir = out_dir + self._logged_keys = [] # store the key has been logged + + def before_run(self, trainer): + super(TextLoggerHook, self).before_run(trainer) + + if self.out_dir is None: + self.out_dir = trainer.work_dir + + if not osp.exists(self.out_dir): + os.makedirs(self.out_dir) + + trainer.logger.info('Text logs will be saved to {}'.format( + self.out_dir)) + + self.start_iter = trainer.iter + self.json_log_path = osp.join(self.out_dir, + '{}.log.json'.format(trainer.timestamp)) + if hasattr(trainer, 'meta') and trainer.meta is not None: + self._dump_log(trainer.meta, trainer) + + def _get_max_memory(self, trainer): + device = getattr(trainer.model, 'output_device', None) + mem = torch.cuda.max_memory_allocated(device=device) + mem_mb = torch.tensor([mem / (1024 * 1024)], + dtype=torch.int, + device=device) + _, world_size = get_dist_info() + if world_size > 1: + dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) + return mem_mb.item() + + def _log_info(self, log_dict, trainer): + if log_dict['mode'] == 'train': + if isinstance(log_dict['lr'], dict): + lr_str = [] + for k, val in log_dict['lr'].items(): + lr_str.append(f'lr_{k}: {val:.3e}') + lr_str = ' '.join(lr_str) + else: + lr_str = f'lr: {log_dict["lr"]:.3e}' + + if self.by_epoch: + log_str = f'Epoch [{log_dict["epoch"]}][{log_dict["iter"]}/{len(trainer.data_loader)}]\t' + else: + log_str = f'Iter [{log_dict["iter"]}/{trainer.max_iters}]\t' + log_str += f'{lr_str}, ' + self._logged_keys.extend(['lr', 'mode', 'iter', 'epoch']) + + if 'time' in log_dict.keys(): + self.time_sec_tot += (log_dict['time'] * self.interval) + time_sec_avg = self.time_sec_tot / ( + trainer.iter - self.start_iter + 1) + eta_sec = time_sec_avg * (trainer.max_iters - trainer.iter - 1) + eta_str = str(datetime.timedelta(seconds=int(eta_sec))) + log_str += f'eta: {eta_str}, ' + log_str += f'time: {log_dict["time"]:.3f}, data_load_time: {log_dict["data_load_time"]:.3f}, ' + self._logged_keys.extend([ + 'time', + 'data_load_time', + ]) + else: + # val/test time + # here 1000 is the length of the val dataloader + # by epoch: Epoch[val] [4][1000] + # by iter: Iter[val] [1000] + if self.by_epoch: + log_str = f'Epoch({log_dict["mode"]}) [{log_dict["epoch"]}][{log_dict["iter"]}]\t' + else: + log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' + self._logged_keys.extend(['mode', 'iter', 'epoch']) + + log_items = [] + for name, val 
in log_dict.items(): + if name in self._logged_keys: + continue + if isinstance(val, float): + val = f'{val:.4f}' + log_items.append(f'{name}: {val}') + log_str += ', '.join(log_items) + + trainer.logger.info(log_str) + + def _dump_log(self, log_dict): + # dump log in json format + json_log = OrderedDict() + for k, v in log_dict.items(): + json_log[k] = self._round_float(v) + + rank, _ = get_dist_info() + if rank == 0: + with open(self.json_log_path, 'a+') as f: + json.dump(json_log, f) + f.write('\n') + + def _round_float(self, items, ndigits=5): + if isinstance(items, list): + return [self._round_float(item) for item in items] + elif isinstance(items, float): + return round(items, ndigits) + else: + return items + + def log(self, trainer): + cur_iter = self.get_iter(trainer, inner_iter=True) + + log_dict = OrderedDict( + mode=trainer.mode, epoch=self.get_epoch(trainer), iter=cur_iter) + + # statistic memory + if torch.cuda.is_available(): + log_dict['memory'] = self._get_max_memory(trainer) + + log_dict = dict(log_dict, **trainer.log_buffer.output) + + self._log_info(log_dict, trainer) + self._dump_log(log_dict) + return log_dict diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py new file mode 100644 index 00000000..08545ffc --- /dev/null +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -0,0 +1,71 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.trainers.lrscheduler.builder import build_lr_scheduler +from .builder import HOOKS +from .hook import Hook +from .priority import Priority + + +@HOOKS.register_module() +class LrSchedulerHook(Hook): + """Lr scheduler. + + Args: + by_epoch (bool): Whether lr changes by epoch + warmup (dict): warm up config + """ + PRIORITY = Priority.VERY_HIGH + + def __init__(self, by_epoch=True, warmup=None) -> None: + super().__init__() + self.by_epoch = by_epoch + if not self.by_epoch: + raise ValueError('We only support ``by_epoch=True`` now!') + + self.warmup = warmup + self.warmup_lr_scheduler = None + + def before_run(self, trainer): + if self.warmup is not None: + assert isinstance(self.warmup, dict) and 'type' in self.warmup + self.warmup_lr_scheduler = build_lr_scheduler( + cfg=self.warmup, + default_args={'base_scheduler': trainer.lr_scheduler}) + + def get_current_lr(self, trainer): + import torch + + if isinstance(trainer.optimizer, torch.optim.Optimizer): + lr = [group['lr'] for group in trainer.optimizer.param_groups] + elif isinstance(trainer.optimizer, dict): + lr = dict() + for name, optim in trainer.optimizer.items(): + lr[name] = [group['lr'] for group in optim.param_groups] + else: + raise RuntimeError( + 'lr is not applicable because optimizer does not exist.') + return lr + + def before_train_iter(self, trainer): + trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) + + def before_train_epoch(self, trainer): + if self.by_epoch: + if self.warmup_lr_scheduler is not None: + self.warmup_lr_scheduler.step() + else: + trainer.lr_scheduler.step() + trainer.log_buffer.output['lr'] = self._get_log_lr(trainer) + + def _get_log_lr(self, trainer): + cur_lr = self.get_current_lr(trainer) + # only record lr of the first param group + if isinstance(cur_lr, list): + lr = cur_lr[0] + else: + assert isinstance(cur_lr, dict) + lr = {} + for k, lr_ in cur_lr.items(): + assert isinstance(lr_, list) + lr.update({k: lr_[0]}) + + return lr diff --git a/modelscope/trainers/hooks/optimizer_hook.py b/modelscope/trainers/hooks/optimizer_hook.py new file mode 100644 index 
00000000..28bd9492 --- /dev/null +++ b/modelscope/trainers/hooks/optimizer_hook.py @@ -0,0 +1,37 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from torch.nn.utils import clip_grad + +from .builder import HOOKS +from .hook import Hook +from .priority import Priority + + +@HOOKS.register_module() +class OptimizerHook(Hook): + + PRIORITY = Priority.ABOVE_NORMAL + + def __init__(self, grad_clip=None, loss_keys='loss') -> None: + if isinstance(loss_keys, str): + loss_keys = [loss_keys] + assert isinstance(loss_keys, (tuple, list)) + self.loss_keys = loss_keys + self.grad_clip = grad_clip + + def clip_grads(self, params, **clip_args): + params = list( + filter(lambda p: p.requires_grad and p.grad is not None, params)) + if len(params) > 0: + return clip_grad.clip_grad_norm_(params, **clip_args) + + def after_train_iter(self, trainer): + trainer.optimizer.zero_grad() + + for k in self.loss_keys: + trainer.train_outputs[k].backward() + + clip_args = self.grad_clip + if clip_args is not None: + self.clip_grads(trainer.model.parameters(), **clip_args) + + trainer.optimizer.step() diff --git a/modelscope/trainers/hooks/priority.py b/modelscope/trainers/hooks/priority.py new file mode 100644 index 00000000..db749652 --- /dev/null +++ b/modelscope/trainers/hooks/priority.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. +from enum import Enum +from typing import Union + + +class Priority(Enum): + """Hook priority levels. + + +--------------+------------+ + | Level | Value | + +==============+============+ + | HIGHEST | 0 | + +--------------+------------+ + | VERY_HIGH | 10 | + +--------------+------------+ + | HIGH | 30 | + +--------------+------------+ + | ABOVE_NORMAL | 40 | + +--------------+------------+ + | NORMAL | 50 | + +--------------+------------+ + | BELOW_NORMAL | 60 | + +--------------+------------+ + | LOW | 70 | + +--------------+------------+ + | VERY_LOW | 90 | + +--------------+------------+ + | LOWEST | 100 | + +--------------+------------+ + """ + + HIGHEST = 0 + VERY_HIGH = 10 + HIGH = 30 + ABOVE_NORMAL = 40 + NORMAL = 50 + BELOW_NORMAL = 60 + LOW = 70 + VERY_LOW = 90 + LOWEST = 100 + + +def get_priority(priority: Union[int, str, Priority]) -> int: + """Get priority value. + + Args: + priority (int or str or :obj:`Priority`): Priority. + + Returns: + int: The priority value. + """ + if isinstance(priority, int): + if priority < 0 or priority > 100: + raise ValueError('priority must be between 0 and 100') + return priority + elif isinstance(priority, Priority): + return priority.value + elif isinstance(priority, str): + return Priority[priority.upper()].value + else: + raise TypeError('priority must be an integer or Priority enum value') diff --git a/modelscope/trainers/lrscheduler/__init__.py b/modelscope/trainers/lrscheduler/__init__.py new file mode 100644 index 00000000..336a45f0 --- /dev/null +++ b/modelscope/trainers/lrscheduler/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
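# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of this patch. Hook execution
# order is resolved through ``get_priority`` defined above, where a smaller
# value means the hook runs earlier; integers, enum names and ``Priority``
# members all resolve to the same value:

from modelscope.trainers.hooks.priority import Priority, get_priority

assert get_priority(Priority.ABOVE_NORMAL) == 40
assert get_priority('above_normal') == 40  # enum name, case-insensitive
assert get_priority(40) == 40  # plain integers in [0, 100] pass through
# ---------------------------------------------------------------------------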
+from .builder import LR_SCHEDULER, build_lr_scheduler +from .warmup import BaseWarmup, ConstantWarmup, ExponentialWarmup, LinearWarmup + +__all__ = [ + 'LR_SCHEDULER', 'build_lr_scheduler', 'BaseWarmup', 'ConstantWarmup', + 'LinearWarmup', 'ExponentialWarmup' +] diff --git a/modelscope/trainers/lrscheduler/builder.py b/modelscope/trainers/lrscheduler/builder.py new file mode 100644 index 00000000..f284b1a9 --- /dev/null +++ b/modelscope/trainers/lrscheduler/builder.py @@ -0,0 +1,47 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import inspect + +from modelscope.utils.config import ConfigDict +from modelscope.utils.registry import Registry, build_from_cfg, default_group + +LR_SCHEDULER = Registry('lr scheduler') + + +def build_lr_scheduler(cfg: ConfigDict, default_args: dict = None): + """ build lr scheduler from given lr scheduler config dict + + Args: + cfg (:obj:`ConfigDict`): config dict for lr scheduler object. + default_args (dict, optional): Default initialization arguments. + """ + if cfg['type'].lower().endswith('warmup'): + # build warmup lr scheduler + if not hasattr(cfg, 'base_scheduler'): + if default_args is None or ('base_scheduler' not in default_args): + raise ValueError( + 'Must provide ``base_scheduler`` which is an instance of ``torch.optim.lr_scheduler._LRScheduler`` ' + 'for build warmup lr scheduler.') + else: + # build lr scheduler without warmup + if not hasattr(cfg, 'optimizer'): + if default_args is None or ('optimizer' not in default_args): + raise ValueError( + 'Must provide ``optimizer`` which is an instance of ``torch.optim.Optimizer`` ' + 'for build lr scheduler') + + return build_from_cfg( + cfg, LR_SCHEDULER, group_key=default_group, default_args=default_args) + + +def register_torch_lr_scheduler(): + from torch.optim import lr_scheduler + from torch.optim.lr_scheduler import _LRScheduler + + members = inspect.getmembers(lr_scheduler) + + for name, obj in members: + if inspect.isclass(obj) and issubclass(obj, _LRScheduler): + LR_SCHEDULER.register_module(module_name=name, module_cls=obj) + + +register_torch_lr_scheduler() diff --git a/modelscope/trainers/lrscheduler/warmup/__init__.py b/modelscope/trainers/lrscheduler/warmup/__init__.py new file mode 100644 index 00000000..ad8e11c0 --- /dev/null +++ b/modelscope/trainers/lrscheduler/warmup/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .base import BaseWarmup +from .warmup import ConstantWarmup, ExponentialWarmup, LinearWarmup + +__all__ = ['BaseWarmup', 'ConstantWarmup', 'LinearWarmup', 'ExponentialWarmup'] diff --git a/modelscope/trainers/lrscheduler/warmup/base.py b/modelscope/trainers/lrscheduler/warmup/base.py new file mode 100644 index 00000000..78e9b5fb --- /dev/null +++ b/modelscope/trainers/lrscheduler/warmup/base.py @@ -0,0 +1,75 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from torch.optim.lr_scheduler import _LRScheduler + + +class BaseWarmup(_LRScheduler): + """Base warmup scheduler + + Args: + base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type + warmup_iters (int | list): Warmup iterations + last_epoch (int): The index of last epoch. 
+ """ + + def __init__(self, + base_scheduler, + warmup_iters, + last_epoch=-1, + verbose=False): + self.base_scheduler = base_scheduler + self.warmup_iters = warmup_iters + optimizer = self.base_scheduler.optimizer + self._is_init_step = True + + super(BaseWarmup, self).__init__( + optimizer, last_epoch=last_epoch, verbose=verbose) + + def get_lr(self): + return self.base_scheduler.get_lr() + + def state_dict(self): + self.base_scheduler.state_dict() + + def load_state_dict(self, state_dict): + self.base_scheduler.load_state_dict(state_dict) + + def scale(self): + """Scale the learning rates. + """ + scale_value = self.get_warmup_scale(self.base_scheduler._step_count + - 1) + if isinstance(scale_value, (int, float)): + scale_value = [ + scale_value for _ in range(len(self.optimizer.param_groups)) + ] + else: + assert isinstance( + scale_value, (list, tuple)), 'Only support list or tuple type!' + assert len(scale_value) == len( + self.optimizer.param_groups), ('Size mismatch {} != {}'.format( + len(scale_value), len(self.optimizer.param_groups))) + + for i, group in enumerate(self.optimizer.param_groups): + group['lr'] *= scale_value[i] + + def step(self, epoch=None): + """ + When ``self.base_scheduler._step_count`` is less than ``self.warmup_iters``, multiply lr by scale + """ + if self.base_scheduler._step_count > self.warmup_iters: + return self.base_scheduler.step(epoch=epoch) + + for group, lr in zip(self.optimizer.param_groups, self.base_lrs): + group['lr'] = lr + + # `base_scheduler` has done step() at init when build + if self._is_init_step: + self._is_init_step = False + else: + self.base_scheduler.step(epoch=epoch) + + self.scale() + + @classmethod + def get_warmup_scale(self, cur_iter): + pass diff --git a/modelscope/trainers/lrscheduler/warmup/warmup.py b/modelscope/trainers/lrscheduler/warmup/warmup.py new file mode 100644 index 00000000..d00c83b0 --- /dev/null +++ b/modelscope/trainers/lrscheduler/warmup/warmup.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.trainers.lrscheduler.builder import LR_SCHEDULER +from .base import BaseWarmup + + +@LR_SCHEDULER.register_module() +class ConstantWarmup(BaseWarmup): + """Linear warmup scheduler. + + Args: + base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type + warmup_ratio (float): Lr used at warmup stage equals to warmup_ratio * initial_lr + warmup_iters (int | list): Warmup iterations + last_epoch (int): The index of last epoch. + """ + + def __init__(self, + base_scheduler, + warmup_iters, + warmup_ratio=0.1, + last_epoch=-1): + self.warmup_ratio = warmup_ratio + super(ConstantWarmup, self).__init__( + base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) + + def get_warmup_scale(self, cur_iter): + if cur_iter >= self.warmup_iters: + return 1.0 + return self.warmup_ratio + + +@LR_SCHEDULER.register_module() +class LinearWarmup(BaseWarmup): + """Linear warmup scheduler. + + Args: + base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type + warmup_iters (int | list): Warmup iterations + warmup_ratio (float): Lr used at the beginning of warmup equals to warmup_ratio * initial_lr + last_epoch (int): The index of last epoch. 
+ """ + + def __init__(self, + base_scheduler, + warmup_iters, + warmup_ratio=0.1, + last_epoch=-1): + self.warmup_ratio = warmup_ratio + super(LinearWarmup, self).__init__( + base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) + + def get_warmup_scale(self, cur_iter): + k = (1 - cur_iter / self.warmup_iters) * (1 - self.warmup_ratio) + return 1 - k + + +@LR_SCHEDULER.register_module() +class ExponentialWarmup(BaseWarmup): + """Exponential warmup scheduler. + + Args: + base_scheduler (torch.optim._LRScheduler): an instance of torch.optim._LRScheduler type + warmup_iters (int | list): Warmup iterations + warmup_ratio (float): Lr used at the beginning of warmup equals to warmup_ratio * initial_lr + last_epoch (int): The index of last epoch. + """ + + def __init__(self, + base_scheduler, + warmup_iters, + warmup_ratio=0.1, + last_epoch=-1): + self.warmup_ratio = warmup_ratio + super(ExponentialWarmup, self).__init__( + base_scheduler, warmup_iters=warmup_iters, last_epoch=last_epoch) + + def get_warmup_scale(self, cur_iter): + k = self.warmup_ratio**(1 - cur_iter / self.warmup_iters) + return k diff --git a/modelscope/trainers/optimizer/__init__.py b/modelscope/trainers/optimizer/__init__.py new file mode 100644 index 00000000..884f3043 --- /dev/null +++ b/modelscope/trainers/optimizer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .builder import OPTIMIZERS, build_optimizer + +__all__ = ['OPTIMIZERS', 'build_optimizer'] diff --git a/modelscope/trainers/optimizer/builder.py b/modelscope/trainers/optimizer/builder.py new file mode 100644 index 00000000..4d772dd9 --- /dev/null +++ b/modelscope/trainers/optimizer/builder.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import inspect + +import torch + +from modelscope.utils.config import ConfigDict +from modelscope.utils.registry import Registry, build_from_cfg, default_group + +OPTIMIZERS = Registry('optimizer') + + +def build_optimizer(model: torch.nn.Module, + cfg: ConfigDict, + default_args: dict = None): + """ build optimizer from optimizer config dict + + Args: + cfg (:obj:`ConfigDict`): config dict for optimizer object. + default_args (dict, optional): Default initialization arguments. + """ + if hasattr(model, 'module'): + model = model.module + cfg.params = model.parameters() + + return build_from_cfg( + cfg, OPTIMIZERS, group_key=default_group, default_args=default_args) + + +def register_torch_optimizers(): + for name, module in inspect.getmembers(torch.optim): + if name.startswith('__'): + continue + if inspect.isclass(module) and issubclass(module, + torch.optim.Optimizer): + OPTIMIZERS.register_module( + default_group, module_name=name, module_cls=module) + + +register_torch_optimizers() diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py new file mode 100644 index 00000000..6be0be29 --- /dev/null +++ b/modelscope/trainers/trainer.py @@ -0,0 +1,703 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os.path +import random +import time +from distutils.version import LooseVersion +from functools import partial +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np +import torch +from addict import Dict +from torch import distributed as dist +from torch import nn +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.distributed import DistributedSampler + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metrics import build_metric, task_default_metrics +from modelscope.models.base import Model +from modelscope.models.base_torch import TorchModel +from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors import build_preprocessor +from modelscope.preprocessors.base import Preprocessor +from modelscope.task_datasets import TorchTaskDataset, build_task_dataset +from modelscope.trainers.hooks.builder import HOOKS +from modelscope.trainers.hooks.priority import Priority, get_priority +from modelscope.trainers.lrscheduler.builder import build_lr_scheduler +from modelscope.trainers.optimizer.builder import build_optimizer +from modelscope.utils.config import ConfigDict +from modelscope.utils.constant import Hubs, ModelFile, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.registry import build_from_cfg +from modelscope.utils.tensor_utils import torch_default_data_collator +from modelscope.utils.torch_utils import get_dist_info +from .base import BaseTrainer +from .builder import TRAINERS +from .hooks.hook import Hook + + +@TRAINERS.register_module() +class EpochBasedTrainer(BaseTrainer): + """Epoch based Trainer, a training helper for PyTorch. + + Args: + cfg_file(str): The local config file. + model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir + or a model id. If model is None, build_model method will be called. + data_collator (`Callable`, *optional*): + The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. + train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*): + The dataset to use for training. + + Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a + distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a + `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will + manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally + sets the seed of the RNGs used. + eval_dataset (`torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation. + preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor. + NOTE: If the preprocessor has been called before the dataset fed into this trainer by user's custom code, + this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file. + Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and + this preprocessing action will be executed every time the dataset's __getitem__ is called. + optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple + containing the optimizer and the scheduler to use. + max_epochs: (int, optional): Total training epochs. 
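    Example (editor's illustrative sketch, not part of the original patch; the
    model directory, datasets and epoch count below are hypothetical)::

        >>> trainer = EpochBasedTrainer(
        ...     model='/path/to/model_dir',
        ...     train_dataset=train_dataset,
        ...     eval_dataset=eval_dataset,
        ...     max_epochs=3,
        ...     work_dir='./work_dir')
        >>> trainer.train()
        >>> metrics = trainer.evaluate()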
+ """ + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + preprocessor: Optional[Preprocessor] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + **kwargs): + if isinstance(model, str): + if os.path.exists(model): + self.model_dir = model if os.path.isdir( + model) else os.path.dirname(model) + else: + self.model_dir = snapshot_download(model) + cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION) + self.model = self.build_model() + else: + assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' + assert isinstance( + model, + (TorchModel, nn.Module + )), 'model should be either str, TorchMode or nn.Module.' + self.model_dir = os.path.dirname(cfg_file) + self.model = model + + super().__init__(cfg_file, arg_parse_fn) + if 'work_dir' in kwargs: + self.work_dir = kwargs['work_dir'] + else: + self.work_dir = self.cfg.train.get('work_dir', './work_dir') + + self.preprocessor = None + if isinstance(preprocessor, Preprocessor): + self.preprocessor = preprocessor + elif hasattr(self.cfg, 'preprocessor'): + self.preprocessor = self.build_preprocessor() + # TODO @wenmeng.zwm add data collator option + # TODO how to fill device option? + self.device = int( + os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None + self.train_dataset = self.to_task_dataset( + train_dataset, mode='train', preprocessor=self.preprocessor) + self.eval_dataset = self.to_task_dataset( + eval_dataset, mode='eval', preprocessor=self.preprocessor) + self.data_collator = data_collator if data_collator is not None else torch_default_data_collator + self.metrics = self.get_metrics() + self.optimizers = optimizers + self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) + self._mode = 'train' + self._hooks: List[Hook] = [] + self._epoch = 0 + self._iter = 0 + self._inner_iter = 0 + if 'max_epochs' not in kwargs: + assert hasattr( + self.cfg.train, + 'max_epochs'), 'max_epochs is missing in configuration file' + self._max_epochs = self.cfg.train.max_epochs + else: + self._max_epochs = kwargs['max_epochs'] + + # TODO @wenmeng.zwm add seed init fn + self._seed = 0 + + self._dist = get_dist_info()[1] > 1 + + @property + def mode(self): + return self._mode + + @property + def hooks(self) -> List[Hook]: + """list[:obj:`Hook`]: A list of registered hooks.""" + return self._hooks + + @property + def epoch(self) -> int: + """int: Current epoch.""" + return self._epoch + + @property + def iter(self) -> int: + """int: Current iteration.""" + return self._iter + + @property + def inner_iter(self) -> int: + """int: Iteration in an epoch.""" + return self._inner_iter + + @property + def max_epochs(self): + """int: Maximum training epochs.""" + return self._max_epochs + + @property + def max_iters(self): + """int: Maximum training iterations.""" + return self._max_epochs * len(self.data_loader) + + def to_task_dataset(self, + datasets: Tuple[Dataset, List[Dataset]], + mode: str, + preprocessor: Optional[Preprocessor] = None): + """Build the task specific dataset processor for this trainer. + + Returns: The task dataset processor for the task. If no result for the very model-type and task, + the default TaskDataset will be returned. 
+ """ + try: + if not datasets: + return datasets + if isinstance(datasets, TorchTaskDataset): + return datasets + task_dataset = build_task_dataset( + ConfigDict({ + **self.cfg.model, + 'mode': mode, + 'preprocessor': preprocessor, + 'datasets': datasets, + }), getattr(self.cfg, 'task', None)) + return task_dataset + except Exception: + if isinstance(datasets, (List, Tuple)) or preprocessor is not None: + return TorchTaskDataset( + datasets, + mode=mode, + preprocessor=preprocessor, + **(self.cfg.model if hasattr(self.cfg, 'model') else {})) + else: + return datasets + + def build_preprocessor(self) -> Preprocessor: + """Build the preprocessor. + + User can override this method to implement custom logits. + + Returns: The preprocessor instance. + + """ + # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor + # when they are different ones in training and evaluation + cfg = ConfigDict({ + **getattr(self.cfg, 'preprocessor'), 'model_dir': + self.model_dir + }) + return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + + def get_metrics(self) -> List[str]: + """Get the metric class types. + + The first choice will be the metrics configured in the config file, if not found, the default metrics will be + used. + If no metrics is found and the eval dataset exists, the method will raise an error. + + Returns: The metric types. + + """ + metrics = self.cfg.evaluation.metrics if hasattr( + self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, + 'metrics') else None + metrics = metrics if metrics is not None else task_default_metrics.get( + self.cfg.task) + if metrics is None and self.eval_dataset is not None: + raise ValueError( + f'Metrics are needed in evaluation, please try to either ' + f'add metrics in configuration.json or add the default metric for {self.cfg.task}.' + ) + if isinstance(metrics, str): + metrics = [metrics] + return metrics + + def train(self, *args, **kwargs): + self.model.train() + self._mode = 'train' + + if self.train_dataset is None: + self.train_dataloader = self.get_train_dataloader() + else: + self.train_dataloader = self._build_dataloader_with_dataset( + self.train_dataset, **self.cfg.train.get('dataloader', {})) + self.data_loader = self.train_dataloader + + self.register_optimizers_hook() + self.register_hook_from_cfg(self.cfg.train.hooks) + + self.train_loop(self.train_dataloader) + + def evaluate(self, checkpoint_path=None): + self.model.eval() + self._mode = 'val' + + if self.eval_dataset is None: + self.eval_dataloader = self.get_eval_data_loader() + else: + self.eval_dataloader = self._build_dataloader_with_dataset( + self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) + self.data_loader = self.eval_dataloader + metric_classes = [build_metric(metric) for metric in self.metrics] + self.evaluation_loop(self.eval_dataloader, checkpoint_path, + metric_classes) + + metric_values = {} + for metric_cls in metric_classes: + metric_values.update(metric_cls.evaluate()) + return metric_values + + def build_model(self) -> Union[nn.Module, TorchModel]: + """ Instantiate a pytorch model and return. + + By default, we will create a model using config from configuration file. You can + subclass and override this method in a subclass. + + """ + # TODO temp implementation, waiting for @zhangzhicheng + model = Model.from_pretrained(self.model_dir) + if not isinstance(model, nn.Module) and hasattr(model, 'model'): + return model.model + + def collate_fn(self, data): + """Prepare the input just before the forward function. 
+ This method will move the tensors to the right device. + Usually this method does not need to be overridden. + + Args: + data: The data out of the dataloader. + + Returns: The processed data. + + """ + if isinstance(data, dict): + return type(data)({k: self.collate_fn(v) for k, v in data.items()}) + elif isinstance(data, (tuple, np.ndarray, list)): + return type(data)(self.collate_fn(v) for v in data) + elif isinstance(data, torch.Tensor) and self.device is not None: + kwargs = dict(device=self.device) + return data.to(**kwargs) + return data + + def train_step(self, model, inputs): + """ Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`TorchModel`): The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `torch.Tensor`: The tensor with training loss on this batch. + """ + # EvaluationHook will do evaluate and change mode to val, return to train mode + # TODO: find more pretty way to change mode + model.train() + self._mode = 'train' + inputs = self.collate_fn(inputs) + if isinstance(inputs, dict): + train_outputs = model.forward(**inputs) + else: + train_outputs = model.forward(inputs) + + if not isinstance(train_outputs, dict): + raise TypeError( + '"model.train_step()" and "model.val_step()" must return a dict' + ) + + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + + self.train_outputs = train_outputs + + def prediction_step(self, model, inputs): + """ Perform forward step by `model` using `inputs`. + + Args: + model (`TorchModel`): The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + + Return: + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + raise NotImplementedError + + def get_train_dataloader(self): + """ Builder torch dataloader for training. + + We provide a reasonable default that works well. If you want to use something else, you can change + the config for data.train in configuration file, or subclass and override this method + (or `get_train_dataloader` in a subclass. 
+ """ + train_data = self.cfg.dataset.train + if self.train_dataset is None: + self.train_dataset = self.build_dataset(train_data, mode='train') + + data_loader = self._build_dataloader_with_dataset( + self.train_dataset, **self.cfg.train.get('dataloader', {})) + return data_loader + + def get_eval_data_loader(self): + """ Builder torch dataloader for evaluation. + + We provide a reasonable default that works well. If you want to use something else, you can change + the config for dataset.eval in configuration file, or subclass and override this method in a subclass. + pass + """ + val_data = self.cfg.dataset.val + if self.eval_dataset is None: + self.eval_dataset = self.build_dataset(val_data, mode='eval') + + batch_size = self.cfg.evaluation.batch_size + workers = self.cfg.evaluation.workers + shuffle = self.cfg.evaluation.get('shuffle', False) + data_loader = self._build_dataloader_with_dataset( + self.eval_dataset, + batch_size_per_gpu=batch_size, + workers_per_gpu=workers, + shuffle=shuffle, + dist=self._dist, + seed=self._seed, + persistent_workers=True, + ) + return data_loader + + def build_dataset(self, data_cfg, mode): + """ Build torch dataset object using data config + """ + dataset = MsDataset.load( + dataset_name=data_cfg.name, + split=data_cfg.split, + subset_name=data_cfg.subset_name if hasattr( + data_cfg, 'subset_name') else None, + hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, + ) + torch_dataset = dataset.to_torch_dataset( + preprocessors=self.preprocessor, ) + dataset = self.to_task_dataset( + torch_dataset, mode, preprocessor=self.preprocessor) + return dataset + + def create_optimizer_and_scheduler(self): + """ Create optimizer and lr scheduler + + We provide a default implementation, if you want to customize your own optimizer + and lr scheduler, you can either pass a tuple through trainer init function or + subclass this class and override this method. + + + """ + optimizer, lr_scheduler = self.optimizers + if optimizer is None: + optimizer_cfg = self.cfg.train.get('optimizer', None) + else: + optimizer_cfg = None + + optim_options = {} + if optimizer_cfg is not None: + optim_options = optimizer_cfg.pop('options', {}) + optimizer = build_optimizer(self.model, cfg=optimizer_cfg) + + if lr_scheduler is None: + lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) + else: + lr_scheduler_cfg = None + + lr_options = {} + if lr_scheduler_cfg is not None: + assert optimizer is not None + lr_options = lr_scheduler_cfg.pop('options', {}) + lr_scheduler = build_lr_scheduler( + cfg=lr_scheduler_cfg, default_args={'optimizer': optimizer}) + + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + return self.optimizer, self.lr_scheduler, optim_options, lr_options + + def register_optimizers_hook(self): + """ Register optimizer hook and lr scheduler hook. 
+ """ + optimizer, lr_scheduler = self.optimizers + opti_error_msg = 'optimizers should be a tuple of `torch.optim.Optimizer`'\ + ' and `torch.optim.lr_scheduler._LRScheduler`' + if optimizer is not None: + assert isinstance(optimizer, torch.optim.Optimizer), opti_error_msg + if lr_scheduler is not None: + assert isinstance( + lr_scheduler, + torch.optim.lr_scheduler._LRScheduler), opti_error_msg + + _, _, optim_options, lr_options = self.create_optimizer_and_scheduler() + lr_hook = dict(type='LrSchedulerHook', **lr_options) + optim_hook = dict(type='OptimizerHook', **optim_options) + + self.register_hook_from_cfg([lr_hook, optim_hook]) + + def _build_dataloader_with_dataset(self, + dataset: Dataset, + batch_size_per_gpu: int, + workers_per_gpu: int, + dist: bool = False, + shuffle: bool = True, + seed: int = 0, + persistent_workers=False, + **kwargs) -> DataLoader: + """Build dataloader using input dataset and cfg. Used by `EpochBasedTrainer.train()` + and `EpochBasedTrainer.evaluate()`. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + + Args: + dataset (Dataset): A PyTorch dataset. + batch_size_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + seed (int, Optional): Seed to be used. Default: 0. + runner_type (str): Type of runner. Default: `EpochBasedRunner` + persistent_workers (bool): If True, the data loader will not shutdown + the worker processes after a dataset has been consumed once. + This allows to maintain the workers `Dataset` instances alive. + This argument is only valid when PyTorch>=1.7.0. Default: False. + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + + if dist: + # When model is :obj:`DistributedDataParallel`, + # `batch_size` of :obj:`dataloader` is the + # number of training samples on each GPU. 
+ batch_size = batch_size_per_gpu + num_workers = workers_per_gpu + else: + batch_size = batch_size_per_gpu + num_workers = workers_per_gpu + + if dist: + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=shuffle, seed=seed) + else: + sampler = None + + batch_sampler = None + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + if LooseVersion(torch.__version__) >= LooseVersion('1.7.0'): + kwargs['persistent_workers'] = persistent_workers + elif persistent_workers is True: + self.logger.warning( + 'persistent_workers is invalid because your pytorch ' + 'version is lower than 1.7.0') + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=self.data_collator, + pin_memory=kwargs.pop('pin_memory', False), + worker_init_fn=init_fn, + **kwargs) + + return data_loader + + def train_loop(self, data_loader): + """ Training loop used by `EpochBasedTrainer.train()` + """ + self.invoke_hook('before_run') + self._epoch = 0 + kwargs = {} + for _ in range(self._epoch, self._max_epochs): + self.invoke_hook('before_train_epoch') + time.sleep(2) # Prevent possible deadlock during epoch transition + for i, data_batch in enumerate(data_loader): + self.data_batch = data_batch + self._inner_iter = i + self.invoke_hook('before_train_iter') + self.train_step(self.model, data_batch, **kwargs) + self.invoke_hook('after_train_iter') + del self.data_batch + self._iter += 1 + + self.invoke_hook('after_train_epoch') + self._epoch += 1 + + time.sleep(1) # wait for some hooks like loggers to finish + self.invoke_hook('after_run') + + def evaluation_loop(self, data_loader, checkpoint_path, metric_classes): + """ Evaluation loop used by `EpochBasedTrainer.evaluate()`. + + """ + if self._dist: + from modelscope.trainers.utils.inference import multi_gpu_test + multi_gpu_test( + self.model, + data_loader, + tmpdir=None, + gpu_collect=False, + data_collate_fn=self.collate_fn, + metric_classes=metric_classes) + else: + from modelscope.trainers.utils.inference import single_gpu_test + single_gpu_test( + self.model, + data_loader, + data_collate_fn=self.collate_fn, + metric_classes=metric_classes) + + def register_hook(self, hook: Hook) -> None: + """Register a hook into the hook list. + + The hook will be inserted into a priority queue, with the specified + priority (See :class:`Priority` for details of priorities). + For hooks with the same priority, they will be triggered in the same + order as they are registered. + + Args: + hook (:obj:`Hook`): The hook to be registered. + """ + assert isinstance(hook, Hook) + # insert the hook to a sorted list + inserted = False + for i in range(len(self._hooks) - 1, -1, -1): + if get_priority(hook.PRIORITY) > get_priority( + self._hooks[i].PRIORITY): + self._hooks.insert(i + 1, hook) + inserted = True + break + if not inserted: + self._hooks.insert(0, hook) + + def register_hook_from_cfg(self, hook_cfg: Dict) -> None: + """Register a hook from its cfg. + + Args: + hook_cfg (dict): Hook config. It should have at least keys 'type' + and 'priority' indicating its type and priority. + + Note: + The specific hook class to register should not use 'type' and + 'priority' arguments during initialization. 
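        Example (editor's illustrative sketch; the hook types come from this
        patch, the interval value is hypothetical)::

            >>> trainer.register_hook_from_cfg([
            ...     dict(type='CheckpointHook', interval=1),
            ...     dict(type='EvaluationHook', interval=1),
            ... ])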
+ """ + hook_cfg = hook_cfg.copy() + assert isinstance(hook_cfg, list) + for cfg_i in hook_cfg: + hook = build_from_cfg(cfg_i, HOOKS) + self.register_hook(hook) + + def invoke_hook(self, fn_name: str) -> None: + """Call all hooks. + + Args: + fn_name (str): The function name in each hook to be called, such as + "before_train_epoch". + """ + for hook in self._hooks: + getattr(hook, fn_name)(self) + + def get_hook_info(self) -> str: + # Get hooks info in each stage + stage_hook_map: Dict[str, list] = {stage: [] for stage in Hook.stages} + for hook in self.hooks: + try: + priority = Priority(hook.priority).name # type: ignore + except ValueError: + priority = hook.priority # type: ignore + classname = hook.__class__.__name__ + hook_info = f'({priority:<12}) {classname:<35}' + for trigger_stage in hook.get_triggered_stages(): + stage_hook_map[trigger_stage].append(hook_info) + + stage_hook_infos = [] + for stage in Hook.stages: + hook_infos = stage_hook_map[stage] + if len(hook_infos) > 0: + info = f'{stage}:\n' + info += '\n'.join(hook_infos) + info += '\n -------------------- ' + stage_hook_infos.append(info) + return '\n'.join(stage_hook_infos) + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + torch.manual_seed(worker_seed) diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py new file mode 100644 index 00000000..4b2096c5 --- /dev/null +++ b/modelscope/trainers/utils/inference.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import pickle +import shutil +import tempfile +import time + +import torch +from torch import distributed as dist +from tqdm import tqdm + +from modelscope.utils.torch_utils import get_dist_info + + +def single_gpu_test(model, + data_loader, + data_collate_fn=None, + metric_classes=None): + """Test model with a single gpu. + + Args: + data_collate_fn: An optional data_collate_fn before fed into the model + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + metric_classes(List): List of Metric class that uses to collect metrics + + Returns: + list: The prediction results. + """ + model.eval() + dataset = data_loader.dataset + with tqdm(total=len(dataset), desc='test samples') as pbar: + for data in data_loader: + if data_collate_fn is not None: + data = data_collate_fn(data) + with torch.no_grad(): + result = model(**data) + if metric_classes is not None: + for metric_cls in metric_classes: + metric_cls.add(result, data) + + batch_size = len(result) + for _ in range(batch_size): + pbar.update() + + +def multi_gpu_test(model, + data_loader, + tmpdir=None, + gpu_collect=False, + data_collate_fn=None, + metric_classes=None): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting + ``gpu_collect=True``, it encodes results to gpu tensors and use gpu + communication for results collection. On cpu mode it saves the results on + different gpus to ``tmpdir`` and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. 
+ data_collate_fn: An optional data_collate_fn before fed into the model + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + metric_classes(List): List of Metric class that uses to collect metrics + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + + time.sleep(2) # This line can prevent deadlock problem in some cases. + + count = 0 + with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: + for _, data in enumerate(data_loader): + if data_collate_fn is not None: + data = data_collate_fn(data) + with torch.no_grad(): + result = model(**data) + results.extend(result) + + rank, world_size = get_dist_info() + if rank == 0: + batch_size = len(result) + batch_size_all = batch_size * world_size + count += batch_size_all + if count > len(dataset): + batch_size_all = len(dataset) - (count - batch_size_all) + for _ in range(batch_size_all): + pbar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + ground_truths = [dataset[i] for i in range(len(dataset))] + if metric_classes is not None: + for metric_cls in metric_classes: + metric_cls.add(results, ground_truths) + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results under cpu mode. + + On cpu mode, this function will save the results on different gpus to + ``tmpdir`` and collect them by the rank 0 worker. + + Args: + result_part (list): Result list containing result parts + to be collected. + size (int): Size of the results, commonly equal to length of + the results. + tmpdir (str | None): temporal directory for collected results to + store. If set to None, it will create a random temporal directory + for it. + + Returns: + list: The collected results. + """ + rank, world_size = get_dist_info() + # TODO create a random tmp dir if it is not specified + if tmpdir is None: + tmpdir = tempfile.gettempdir() + if not os.path.exists(tmpdir): + os.makedirs(tmpdir) + # dump the part result to the dir + pickle.dump(result_part, os.path.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = os.path.join(tmpdir, f'part_{i}.pkl') + part_result = pickle.load(part_file) + # When data is severely insufficient, an empty part_result + # on a certain gpu could makes the overall outputs empty. + if part_result: + part_list.append(part_result) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results under gpu mode. + + On gpu mode, this function will encode results to gpu tensors and use gpu + communication for results collection. + + Args: + result_part (list): Result list containing result parts + to be collected. + size (int): Size of the results, commonly equal to length of + the results. + + Returns: + list: The collected results. 
+ """ + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) + # When data is severely insufficient, an empty part_result + # on a certain gpu could makes the overall outputs empty. + if part_result: + part_list.append(part_result) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results diff --git a/modelscope/trainers/utils/log_buffer.py b/modelscope/trainers/utils/log_buffer.py new file mode 100644 index 00000000..edcc273e --- /dev/null +++ b/modelscope/trainers/utils/log_buffer.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. +from collections import OrderedDict + +import numpy as np + + +class LogBuffer: + + def __init__(self): + self.val_history = OrderedDict() + self.n_history = OrderedDict() + self.output = OrderedDict() + self.ready = False + + def clear(self) -> None: + self.val_history.clear() + self.n_history.clear() + self.clear_output() + + def clear_output(self) -> None: + self.output.clear() + self.ready = False + + def update(self, vars: dict, count: int = 1) -> None: + assert isinstance(vars, dict) + for key, var in vars.items(): + if key not in self.val_history: + self.val_history[key] = [] + self.n_history[key] = [] + self.val_history[key].append(var) + self.n_history[key].append(count) + + def average(self, n: int = 0) -> None: + """Average latest n values or all values.""" + assert n >= 0 + for key in self.val_history: + values = np.array(self.val_history[key][-n:]) + nums = np.array(self.n_history[key][-n:]) + avg = np.sum(values * nums) / np.sum(nums) + self.output[key] = avg + self.ready = True diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py new file mode 100644 index 00000000..76fb2a19 --- /dev/null +++ b/modelscope/utils/checkpoint.py @@ -0,0 +1,74 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import time +from collections import OrderedDict +from typing import Optional + +import torch +from torch.optim import Optimizer + +from modelscope import __version__ +from modelscope.fileio import File + + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + + Args: + state_dict (OrderedDict): Model weights on GPU. + + Returns: + OrderedDict: Model weights on GPU. 
+ """ + state_dict_cpu = OrderedDict() + for key, val in state_dict.items(): + state_dict_cpu[key] = val.cpu() + # Keep metadata in state_dict + state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) + return state_dict_cpu + + +def save_checkpoint(model: torch.nn.Module, + filename: str, + optimizer: Optional[Optimizer] = None, + meta: Optional[dict] = None) -> None: + """Save checkpoint to file. + + The checkpoint will have 3 fields: ``meta``, ``state_dict`` and + ``optimizer``. By default ``meta`` will contain version and time info. + + Args: + model (Module): Module whose params are to be saved. + filename (str): Checkpoint filename. + optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + meta (dict, optional): Metadata to be saved in checkpoint. + """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError(f'meta must be a dict or None, but got {type(meta)}') + meta.update(modescope=__version__, time=time.asctime()) + + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model = model.module + + if hasattr(model, 'CLASSES') and model.CLASSES is not None: + # save class name to the meta + meta.update(CLASSES=model.CLASSES) + + checkpoint = { + 'meta': meta, + 'state_dict': weights_to_cpu(model.state_dict()) + } + # save optimizer state dict in the checkpoint + if isinstance(optimizer, Optimizer): + checkpoint['optimizer'] = optimizer.state_dict() + elif isinstance(optimizer, dict): + checkpoint['optimizer'] = {} + for name, optim in optimizer.items(): + checkpoint['optimizer'][name] = optim.state_dict() + + with io.BytesIO() as f: + torch.save(checkpoint, f) + File.write(f.getvalue(), filename) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index cdd346b3..0b8da090 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -13,12 +13,7 @@ class Fields(object): multi_modal = 'multi-modal' -class Tasks(object): - """ Names for tasks supported by modelscope. - - Holds the standard task name to use for identifying different tasks. - This should be used to register models, pipelines, trainers. - """ +class CVTasks(object): # vision tasks image_to_text = 'image-to-text' pose_estimation = 'pose-estimation' @@ -33,6 +28,8 @@ class Tasks(object): action_recognition = 'action-recognition' video_embedding = 'video-embedding' + +class NLPTasks(object): # nlp tasks word_segmentation = 'word-segmentation' nli = 'nli' @@ -56,11 +53,15 @@ class Tasks(object): question_answering = 'question-answering' zero_shot_classification = 'zero-shot-classification' + +class AudioTasks(object): # audio tasks auto_speech_recognition = 'auto-speech-recognition' text_to_speech = 'text-to-speech' speech_signal_process = 'speech-signal-process' + +class MultiModalTasks(object): # multi-modal tasks image_captioning = 'image-captioning' visual_grounding = 'visual-grounding' @@ -69,6 +70,47 @@ class Tasks(object): visual_question_answering = 'visual-question-answering' +class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): + """ Names for tasks supported by modelscope. + + Holds the standard task name to use for identifying different tasks. + This should be used to register models, pipelines, trainers. 
+ """ + + reverse_field_index = {} + + @staticmethod + def find_field_by_task(task_name): + if len(Tasks.reverse_field_index) == 0: + # Lazy init, not thread safe + field_dict = { + Fields.cv: [ + getattr(Tasks, attr) for attr in dir(CVTasks) + if not attr.startswith('__') + ], + Fields.nlp: [ + getattr(Tasks, attr) for attr in dir(NLPTasks) + if not attr.startswith('__') + ], + Fields.audio: [ + getattr(Tasks, attr) for attr in dir(AudioTasks) + if not attr.startswith('__') + ], + Fields.multi_modal: [ + getattr(Tasks, attr) for attr in dir(MultiModalTasks) + if not attr.startswith('__') + ], + } + + for field, tasks in field_dict.items(): + for task in tasks: + if task in Tasks.reverse_field_index: + raise ValueError(f'Duplicate task: {task}') + Tasks.reverse_field_index[task] = field + + return Tasks.reverse_field_index.get(task_name) + + class InputFields(object): """ Names for input data fields in the input data for pipelines """ @@ -100,6 +142,7 @@ class ModelFile(object): TF_CKPT_PREFIX = 'ckpt-' TORCH_MODEL_FILE = 'pytorch_model.pt' TORCH_MODEL_BIN_FILE = 'pytorch_model.bin' + LABEL_MAPPING = 'label_mapping.json' class Requirements(object): diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 24b3246c..dd00fd5b 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -86,3 +86,16 @@ def get_model_type(model_dir): return cfg.model_type if hasattr(cfg, 'model_type') else None except Exception as e: logger.error(f'parse config file failed with error: {e}') + + +def parse_label_mapping(model_dir): + import os + import json + label2id = None + label_path = os.path.join(model_dir, ModelFile.LABEL_MAPPING) + if os.path.exists(label_path): + with open(label_path) as f: + label_mapping = json.load(f) + label2id = {name: idx for name, idx in label_mapping.items()} + + return label2id diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index 43c6869a..c04d3234 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -54,6 +54,38 @@ def import_modules_from_file(py_file: str): return module_name, mod +def is_method_overridden(method, base_class, derived_class): + """Check if a method of base class is overridden in derived class. + + Args: + method (str): the method name to check. + base_class (type): the class of the base class. + derived_class (type | Any): the class or instance of the derived class. + """ + assert isinstance(base_class, type), \ + "base_class doesn't accept instance, Please pass class instead." + + if not isinstance(derived_class, type): + derived_class = derived_class.__class__ + + base_method = getattr(base_class, method) + derived_method = getattr(derived_class, method) + return derived_method != base_method + + +def has_method(obj: object, method: str) -> bool: + """Check whether the object has a method. + + Args: + method (str): The method name to check. + obj (object): The object to check. + + Returns: + bool: True if the object has the method else False. + """ + return hasattr(obj, method) and callable(getattr(obj, method)) + + def import_modules(imports, allow_failed_imports=False): """Import modules from the given list of strings. diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py new file mode 100644 index 00000000..dc38f9bf --- /dev/null +++ b/modelscope/utils/tensor_utils.py @@ -0,0 +1,77 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed from huggingface/transformers. 
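Before the tensor utilities below, a minimal usage sketch of the checkpoint, task-field and label-mapping helpers introduced above; the directory path and the contents of label_mapping.json are illustrative assumptions, not values defined by this patch:

    import os
    import tempfile

    import torch

    from modelscope.utils.checkpoint import save_checkpoint
    from modelscope.utils.constant import Tasks
    from modelscope.utils.hub import parse_label_mapping

    # map a task name back to its field ('cv', 'nlp', 'audio' or 'multi-modal')
    field = Tasks.find_field_by_task(Tasks.nli)  # -> 'nlp'

    # hypothetical model directory; parse_label_mapping returns None
    # when label_mapping.json is absent
    model_dir = os.path.join(tempfile.gettempdir(), 'my_model_dir')
    os.makedirs(model_dir, exist_ok=True)
    label2id = parse_label_mapping(model_dir)

    # persist a model together with its optimizer state
    model = torch.nn.Linear(5, 4)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    save_checkpoint(model, os.path.join(model_dir, 'epoch_1.pth'), optimizer)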
+
+
+def torch_nested_numpify(tensors):
+    """Numpify `tensors` (even if it's a nested list/tuple of tensors)."""
+    import torch
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(torch_nested_numpify(t) for t in tensors)
+    if isinstance(tensors, torch.Tensor):
+        t = tensors.cpu()
+        return t.numpy()
+    return tensors
+
+
+def torch_nested_detach(tensors):
+    """Detach `tensors` (even if it's a nested list/tuple of tensors)."""
+    import torch
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(torch_nested_detach(t) for t in tensors)
+    if isinstance(tensors, torch.Tensor):
+        return tensors.detach()
+    return tensors
+
+
+def torch_default_data_collator(features):
+    # TODO @jiangnana.jnn refine this default data collator
+    import torch
+
+    # if not isinstance(features[0], (dict, BatchEncoding)):
+    #     features = [vars(f) for f in features]
+    first = features[0]
+
+    if isinstance(first, dict):
+        batch = {}
+        # Special handling for labels.
+        # Ensure that tensor is created with the correct type
+        # (it should be automatically the case, but let's make sure of it.)
+        if 'label' in first and first['label'] is not None:
+            label = first['label'].item() if isinstance(
+                first['label'], torch.Tensor) else first['label']
+            dtype = torch.long if isinstance(label, int) else torch.float
+            batch['labels'] = torch.tensor([f['label'] for f in features],
+                                           dtype=dtype)
+        elif 'label_ids' in first and first['label_ids'] is not None:
+            if isinstance(first['label_ids'], torch.Tensor):
+                batch['labels'] = torch.stack(
+                    [f['label_ids'] for f in features])
+            else:
+                dtype = torch.long if type(
+                    first['label_ids'][0]) is int else torch.float
+                batch['labels'] = torch.tensor(
+                    [f['label_ids'] for f in features], dtype=dtype)
+
+        # Handling of all other possible keys.
+        # Again, we will use the first element to figure out which key/values are not None for this model.
+        for k, v in first.items():
+            if k not in ('label', 'label_ids'
+                         ) and v is not None and not isinstance(v, str):
+                if isinstance(v, torch.Tensor):
+                    batch[k] = torch.stack([f[k] for f in features])
+                else:
+                    batch[k] = torch.tensor([f[k] for f in features])
+    elif isinstance(first, tuple):
+        batch = []
+        for idx in range(len(first)):
+            if isinstance(first[idx], torch.Tensor):
+                batch.append(torch.stack([f[idx] for f in features]))
+            else:
+                batch.append(torch.tensor([f[idx] for f in features]))
+    else:
+        if isinstance(first, torch.Tensor):
+            batch = torch.stack(features)
+        else:
+            batch = torch.tensor(features)
+
+    return batch
diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py
new file mode 100644
index 00000000..01b122a7
--- /dev/null
+++ b/modelscope/utils/torch_utils.py
@@ -0,0 +1,127 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
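Before the distributed helpers below, a short sketch of how the default collator and the nested tensor helpers above fit together; the feature dicts are made-up toy inputs, not real preprocessor output:

    import torch

    from modelscope.utils.tensor_utils import (torch_default_data_collator,
                                               torch_nested_detach,
                                               torch_nested_numpify)

    # toy features as a preprocessor might produce them: one dict per sample
    features = [
        {'input_ids': torch.tensor([1, 2, 3]), 'label': 0},
        {'input_ids': torch.tensor([4, 5, 6]), 'label': 1},
    ]

    batch = torch_default_data_collator(features)
    # 'label' is renamed to 'labels' and stacked into a single LongTensor
    assert batch['labels'].dtype == torch.long
    assert batch['input_ids'].shape == (2, 3)

    # detach + numpify nested outputs, e.g. before handing them to a Metric
    numpified = torch_nested_numpify(
        torch_nested_detach((batch['labels'], batch['input_ids'])))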
+# Following code is partially borrowed from openmmlab/mmcv
+
+import functools
+import os
+import socket
+import subprocess
+from collections import OrderedDict
+from typing import Callable, List, Optional, Tuple
+
+import torch
+import torch.multiprocessing as mp
+from torch import distributed as dist
+from torch._utils import (_flatten_dense_tensors, _take_tensors,
+                          _unflatten_dense_tensors)
+
+
+def _find_free_port() -> int:
+    # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    # Binding to port 0 will cause the OS to find an available port for us
+    sock.bind(('', 0))
+    port = sock.getsockname()[1]
+    sock.close()
+    # NOTE: there is still a chance the port could be taken by other processes.
+    return port
+
+
+def _is_free_port(port: int) -> bool:
+    ips = socket.gethostbyname_ex(socket.gethostname())[-1]
+    ips.append('localhost')
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return all(s.connect_ex((ip, port)) != 0 for ip in ips)
+
+
+def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None:
+    if mp.get_start_method(allow_none=True) is None:
+        mp.set_start_method('spawn')
+    if launcher == 'pytorch':
+        _init_dist_pytorch(backend, **kwargs)
+    elif launcher == 'mpi':
+        _init_dist_mpi(backend, **kwargs)
+    elif launcher == 'slurm':
+        _init_dist_slurm(backend, **kwargs)
+    else:
+        raise ValueError(f'Invalid launcher type: {launcher}')
+
+
+def _init_dist_pytorch(backend: str, **kwargs) -> None:
+    # rank = int(os.environ['RANK'])
+    local_rank = int(os.environ['LOCAL_RANK'])
+
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend=backend, **kwargs)
+
+
+def _init_dist_mpi(backend: str, **kwargs) -> None:
+    local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
+    torch.cuda.set_device(local_rank)
+    if 'MASTER_PORT' not in os.environ:
+        # 29500 is torch.distributed default port
+        os.environ['MASTER_PORT'] = '29500'
+    if 'MASTER_ADDR' not in os.environ:
+        raise KeyError('The environment variable MASTER_ADDR is not set')
+    os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
+    os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
+    dist.init_process_group(backend=backend, **kwargs)
+
+
+def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None:
+    """Initialize slurm distributed training environment.
+
+    If argument ``port`` is not specified, then the master port will be system
+    environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
+    environment variable, then a default port ``29500`` will be used.
+
+    Args:
+        backend (str): Backend of torch.distributed.
+        port (int, optional): Master port. Defaults to None.
+ """ + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(proc_id % num_gpus) + addr = subprocess.getoutput( + f'scontrol show hostname {node_list} | head -n1') + # specify master port + if port is not None: + os.environ['MASTER_PORT'] = str(port) + elif 'MASTER_PORT' in os.environ: + pass # use MASTER_PORT in the environment variable + else: + # if torch.distributed default port(29500) is available + # then use it, else find a free port + if _is_free_port(29500): + os.environ['MASTER_PORT'] = '29500' + else: + os.environ['MASTER_PORT'] = str(_find_free_port()) + # use MASTER_ADDR in the environment variable if it already exists + if 'MASTER_ADDR' not in os.environ: + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) + os.environ['RANK'] = str(proc_id) + dist.init_process_group(backend=backend) + + +def get_dist_info() -> Tuple[int, int]: + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + world_size = dist.get_world_size() + else: + rank = 0 + world_size = 1 + return rank, world_size + + +def master_only(func: Callable) -> Callable: + + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + + return wrapper diff --git a/output.wav b/output.wav deleted file mode 100644 index fcbc39ad..00000000 --- a/output.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4153d9ffc0b72eeaf162b5c9f4426f95dcea2bb0da9e7b5e1b72fd2643b1915 -size 50444 diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/models/test_base_torch.py b/tests/models/test_base_torch.py new file mode 100644 index 00000000..2da9874b --- /dev/null +++ b/tests/models/test_base_torch.py @@ -0,0 +1,60 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
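The distributed utilities above follow the usual rank-zero pattern; a minimal sketch before the unit tests (single process here, so get_dist_info falls back to rank 0 and world size 1):

    from modelscope.utils.torch_utils import get_dist_info, master_only

    rank, world_size = get_dist_info()  # (0, 1) when torch.distributed is not initialized


    @master_only
    def log_progress(epoch):
        # executed only on rank 0, silently skipped on other workers
        print(f'[rank {rank}/{world_size}] finished epoch {epoch}')


    log_progress(1)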
+ +import unittest + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.base_torch import TorchModel + + +class TorchBaseTest(unittest.TestCase): + + def test_custom_model(self): + + class MyTorchModel(TorchModel): + + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 20, 5) + self.conv2 = nn.Conv2d(20, 20, 5) + + def forward(self, x): + x = F.relu(self.conv1(x)) + return F.relu(self.conv2(x)) + + model = MyTorchModel() + model.train() + model.eval() + out = model.forward(torch.rand(1, 1, 10, 10)) + self.assertEqual((1, 20, 2, 2), out.shape) + + def test_custom_model_with_postprocess(self): + add_bias = 200 + + class MyTorchModel(TorchModel): + + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 20, 5) + self.conv2 = nn.Conv2d(20, 20, 5) + + def forward(self, x): + x = F.relu(self.conv1(x)) + return F.relu(self.conv2(x)) + + def postprocess(self, x): + return x + add_bias + + model = MyTorchModel() + model.train() + model.eval() + out = model(torch.rand(1, 1, 10, 10)) + self.assertEqual((1, 20, 2, 2), out.shape) + self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10))) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_base.py b/tests/pipelines/test_base.py index 5897382e..60f955c4 100644 --- a/tests/pipelines/test_base.py +++ b/tests/pipelines/test_base.py @@ -6,9 +6,9 @@ from typing import Any, Dict, List, Tuple, Union import numpy as np import PIL +from modelscope.outputs import OutputKeys from modelscope.pipelines import Pipeline, pipeline from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from modelscope.utils.registry import default_group diff --git a/tests/pipelines/test_image_captioning.py b/tests/pipelines/test_image_captioning.py index c185d774..6bede92c 100644 --- a/tests/pipelines/test_image_captioning.py +++ b/tests/pipelines/test_image_captioning.py @@ -2,8 +2,8 @@ import unittest +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py index 584d6d91..538041a5 100644 --- a/tests/pipelines/test_image_matting.py +++ b/tests/pipelines/test_image_matting.py @@ -7,8 +7,8 @@ import cv2 from modelscope.fileio import File from modelscope.msdatasets import MsDataset +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py index d94bcf51..d6ef1894 100644 --- a/tests/pipelines/test_person_image_cartoon.py +++ b/tests/pipelines/test_person_image_cartoon.py @@ -5,9 +5,9 @@ import unittest import cv2 +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level diff --git 
a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py index 1e12548a..6a2edb57 100644 --- a/tests/pipelines/test_text_to_image_synthesis.py +++ b/tests/pipelines/test_text_to_image_synthesis.py @@ -5,8 +5,8 @@ import unittest import numpy as np from modelscope.models import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index 37fe07e5..d28d85d9 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -9,8 +9,8 @@ from scipy.io.wavfile import write from modelscope.metainfo import Pipelines from modelscope.models import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline -from modelscope.pipelines.outputs import OutputKeys from modelscope.utils.constant import Fields, Tasks from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level diff --git a/tests/trainers/hooks/__init__.py b/tests/trainers/hooks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py new file mode 100644 index 00000000..4e839f0c --- /dev/null +++ b/tests/trainers/hooks/test_checkpoint_hook.py @@ -0,0 +1,108 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +import tempfile +import unittest +from abc import ABCMeta + +import json +import torch +from torch import nn +from torch.utils.data import Dataset + +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile + + +class DummyDataset(Dataset, metaclass=ABCMeta): + + def __len__(self): + return 20 + + def __getitem__(self, idx): + return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 4) + self.bn = nn.BatchNorm1d(4) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class CheckpointHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_checkpoint_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'SGD', + 'lr': 0.01, + 'options': { + 'grad_clip': { + 'max_norm': 2.0 + } + } + }, + 'lr_scheduler': { + 'type': 'StepLR', + 'step_size': 2, + 'options': { + 'warmup': { + 'type': 'LinearWarmup', + 'warmup_iters': 2 + } + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'interval': 1 + }] + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=DummyModel(), + data_collator=None, + train_dataset=DummyDataset(), + max_epochs=2) + + trainer = 
build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn('epoch_1.pth', results_files) + self.assertIn('epoch_2.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py new file mode 100644 index 00000000..27ee000f --- /dev/null +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -0,0 +1,188 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +import tempfile +import unittest +from abc import ABCMeta + +import json +import torch +from torch import nn +from torch.optim import SGD +from torch.optim.lr_scheduler import MultiStepLR +from torch.utils.data import Dataset + +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile + + +class DummyDataset(Dataset, metaclass=ABCMeta): + """Base Dataset + """ + + def __len__(self): + return 10 + + def __getitem__(self, idx): + return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 4) + self.bn = nn.BatchNorm1d(4) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class LrSchedulerHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_lr_scheduler_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + } + } + } + + config_path = os.path.join(self.tmp_dir, 'config.json') + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=DummyDataset(), + optimizers=(optimizer, lr_scheduler), + max_epochs=5) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + + trainer.invoke_hook('before_run') + log_lrs = [] + optim_lrs = [] + for _ in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook('before_train_epoch') + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook('before_train_iter') + + log_lrs.append(trainer.log_buffer.output['lr']) + optim_lrs.append(optimizer.param_groups[0]['lr']) + + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook('after_train_iter') + + trainer.invoke_hook('after_train_epoch') + trainer._epoch += 1 + trainer.invoke_hook('after_run') + + iters = 5 + target_lrs = [0.01] * iters * 1 + [0.001] * iters * 2 + [0.0001 + ] * iters * 2 + + self.assertListEqual(log_lrs, target_lrs) + self.assertListEqual(optim_lrs, target_lrs) + + def test_warmup_lr_scheduler_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 
'type': 'SGD', + 'lr': 0.01 + }, + 'lr_scheduler': { + 'type': 'MultiStepLR', + 'milestones': [4, 6], + 'options': { + 'warmup': { + 'type': 'LinearWarmup', + 'warmup_iters': 3 + } + } + } + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + # optimmizer = SGD(model.parameters(), lr=0.01) + # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4]) + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=DummyDataset(), + # optimizers=(optimmizer, lr_scheduler), + max_epochs=7) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + + trainer.invoke_hook('before_run') + log_lrs = [] + optim_lrs = [] + for _ in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook('before_train_epoch') + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook('before_train_iter') + + log_lrs.append(round(trainer.log_buffer.output['lr'], 5)) + optim_lrs.append( + round(trainer.optimizer.param_groups[0]['lr'], 5)) + + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook('after_train_iter') + + trainer.invoke_hook('after_train_epoch') + trainer.invoke_hook('after_run') + + iters = 5 + target_lrs = [0.004] * iters * 1 + [0.007] * iters * 1 + [ + 0.01 + ] * iters * 1 + [0.001] * iters * 2 + [0.0001] * iters * 2 + + self.assertListEqual(log_lrs, target_lrs) + self.assertListEqual(optim_lrs, target_lrs) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py new file mode 100644 index 00000000..b8864aef --- /dev/null +++ b/tests/trainers/hooks/test_timer_hook.py @@ -0,0 +1,128 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest +from abc import ABCMeta + +import json +import torch +from torch import nn +from torch.optim import SGD +from torch.optim.lr_scheduler import MultiStepLR +from torch.utils.data import Dataset + +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile + + +class DummyDataset(Dataset, metaclass=ABCMeta): + """Base Dataset + """ + + def __len__(self): + return 10 + + def __getitem__(self, idx): + return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 4) + self.bn = nn.BatchNorm1d(4) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class IterTimerHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_iter_time_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'hooks': [{ + 'type': 'IterTimerHook', + }] + } + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=DummyDataset(), + optimizers=(optimizer, lr_scheduler), + max_epochs=5) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + trainer.register_hook_from_cfg(trainer.cfg.train.hooks) + + trainer.invoke_hook('before_run') + for i in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook('before_train_epoch') + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook('before_train_iter') + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook('after_train_iter') + + self.assertIn('data_load_time', trainer.log_buffer.val_history) + self.assertIn('time', trainer.log_buffer.val_history) + self.assertIn('loss', trainer.log_buffer.val_history) + + trainer.invoke_hook('after_train_epoch') + + target_len = 5 * (i + 1) + self.assertEqual( + len(trainer.log_buffer.val_history['data_load_time']), + target_len) + self.assertEqual( + len(trainer.log_buffer.val_history['time']), target_len) + self.assertEqual( + len(trainer.log_buffer.val_history['loss']), target_len) + + self.assertEqual( + len(trainer.log_buffer.n_history['data_load_time']), + target_len) + self.assertEqual( + len(trainer.log_buffer.n_history['time']), target_len) + self.assertEqual( + len(trainer.log_buffer.n_history['loss']), target_len) + + trainer.invoke_hook('after_run') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/lrscheduler/__init__.py b/tests/trainers/lrscheduler/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/lrscheduler/warmup/__init__.py b/tests/trainers/lrscheduler/warmup/__init__.py 
new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/lrscheduler/warmup/test_warmup_base.py b/tests/trainers/lrscheduler/warmup/test_warmup_base.py new file mode 100644 index 00000000..45c9fe2c --- /dev/null +++ b/tests/trainers/lrscheduler/warmup/test_warmup_base.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import torch +from torch import nn +from torch.optim.lr_scheduler import MultiStepLR + + +class WarmupTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + def test_constant_warmup(self): + from modelscope.trainers.lrscheduler.warmup import ConstantWarmup + + net = nn.Linear(2, 2) + base_lr = 0.02 + warmup_iters = 3 + warmup_ratio = 0.2 + optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) + lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) + lr_scheduler_with_warmup = ConstantWarmup( + lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) + + res = [] + for _ in range(10): + lr_scheduler_with_warmup.step() + for _, group in enumerate(optimizer.param_groups): + res.append(group['lr']) + + base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] + self.assertListEqual(res, [0.004, 0.004, 0.02] + base_lrs) + + def test_linear_warmup(self): + from modelscope.trainers.lrscheduler.warmup import LinearWarmup + + net = nn.Linear(2, 2) + base_lr = 0.02 + warmup_iters = 3 + warmup_ratio = 0.1 + optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) + lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) + lr_scheduler_with_warmup = LinearWarmup( + lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) + + res = [] + for _ in range(10): + lr_scheduler_with_warmup.step() + for _, group in enumerate(optimizer.param_groups): + res.append(round(group['lr'], 5)) + + base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] + self.assertListEqual(res, [0.0080, 0.0140, 0.02] + base_lrs) + + def test_exp_warmup(self): + from modelscope.trainers.lrscheduler.warmup import ExponentialWarmup + + net = nn.Linear(2, 2) + base_lr = 0.02 + warmup_iters = 3 + warmup_ratio = 0.1 + optimizer = torch.optim.SGD(net.parameters(), lr=base_lr, momentum=0.9) + lr_scheduler = MultiStepLR(optimizer, milestones=[7, 9]) + lr_scheduler_with_warmup = ExponentialWarmup( + lr_scheduler, warmup_iters=warmup_iters, warmup_ratio=warmup_ratio) + + res = [] + for _ in range(10): + lr_scheduler_with_warmup.step() + for _, group in enumerate(optimizer.param_groups): + res.append(round(group['lr'], 5)) + + base_lrs = [0.02, 0.02, 0.02, 0.002, 0.002, 0.0002, 0.0002] + self.assertListEqual(res, [0.00431, 0.00928, 0.02] + base_lrs) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py new file mode 100644 index 00000000..22874262 --- /dev/null +++ b/tests/trainers/test_trainer.py @@ -0,0 +1,209 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest +from abc import ABCMeta + +import json +import torch +from torch import nn +from torch.optim import SGD +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import Dataset + +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import test_level + + +class DummyMetric: + + def __call__(self, ground_truth, predict_results): + return {'accuracy': 0.5} + + +class DummyDataset(Dataset, metaclass=ABCMeta): + """Base Dataset + """ + + def __len__(self): + return 20 + + def __getitem__(self, idx): + return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, ))) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 4) + self.bn = nn.BatchNorm1d(4) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class TrainerTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_train_0(self): + json_cfg = { + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'SGD', + 'lr': 0.01, + 'options': { + 'grad_clip': { + 'max_norm': 2.0 + } + } + }, + 'lr_scheduler': { + 'type': 'StepLR', + 'step_size': 2, + 'options': { + 'warmup': { + 'type': 'LinearWarmup', + 'warmup_iters': 2 + } + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'interval': 1 + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': False + }, + 'metrics': ['seq_cls_metric'] + } + } + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=DummyModel(), + data_collator=None, + train_dataset=DummyDataset(), + eval_dataset=DummyDataset(), + max_epochs=3) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + self.assertIn('epoch_1.pth', results_files) + self.assertIn('epoch_2.pth', results_files) + self.assertIn('epoch_3.pth', results_files) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_train_1(self): + json_cfg = { + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'interval': 1 + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': False + }, + 'metrics': ['seq_cls_metric'] + } + } + + config_path = os.path.join(self.tmp_dir, 'config.json') + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = 
DummyModel() + optimmizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = StepLR(optimmizer, 2) + trainer_name = 'EpochBasedTrainer' + kwargs = dict( + cfg_file=config_path, + model=model, + data_collator=None, + train_dataset=DummyDataset(), + eval_dataset=DummyDataset(), + optimizers=(optimmizer, lr_scheduler), + max_epochs=3) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + self.assertIn('epoch_1.pth', results_files) + self.assertIn('epoch_2.pth', results_files) + self.assertIn('epoch_3.pth', results_files) + + +class DummyTrainerTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_dummy(self): + default_args = dict(cfg_file='configs/examples/train.json') + trainer = build_trainer('dummy', default_args) + + trainer.train() + trainer.evaluate() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_trainer_base.py b/tests/trainers/test_trainer_base.py deleted file mode 100644 index c5fc1303..00000000 --- a/tests/trainers/test_trainer_base.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import unittest - -from modelscope.trainers import build_trainer - - -class DummyTrainerTest(unittest.TestCase): - - def test_dummy(self): - default_args = dict(cfg_file='configs/examples/train.json') - trainer = build_trainer('dummy', default_args) - - trainer.train() - trainer.evaluate() - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py new file mode 100644 index 00000000..a20bf97f --- /dev/null +++ b/tests/trainers/test_trainer_with_nlp.py @@ -0,0 +1,91 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models.nlp.sbert_for_sequence_classification import \ + SbertTextClassfier +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import test_level + + +class TestTrainerWithNlp(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + from datasets import Dataset + dataset_dict = { + 'sentence1': [ + 'This is test sentence1-1', 'This is test sentence2-1', + 'This is test sentence3-1' + ], + 'sentence2': [ + 'This is test sentence1-2', 'This is test sentence2-2', + 'This is test sentence3-2' + ], + 'label': [0, 1, 1] + } + dataset = Dataset.from_dict(dataset_dict) + + class MsDatasetDummy(MsDataset): + + def __len__(self): + return len(self._hf_ds) + + self.dataset = MsDatasetDummy(dataset) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer(self): + model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + kwargs = dict( + model=model_id, + train_dataset=self.dataset, + eval_dataset=self.dataset, + work_dir=self.tmp_dir) + + trainer = build_trainer(default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(10): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_with_model_and_args(self): + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + cache_path = snapshot_download(model_id) + model = SbertTextClassfier.from_pretrained(cache_path) + kwargs = dict( + cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), + model=model, + train_dataset=self.dataset, + eval_dataset=self.dataset, + max_epochs=2, + work_dir=self.tmp_dir) + + trainer = build_trainer(default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(2): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main()
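Taken together, the tests above all drive the same finetuning entry point. A condensed, self-contained sketch of that flow; the task name, hyper-parameters and the toy model/dataset are placeholders copied from the tests, not a recommended configuration:

    import json
    import os
    import tempfile

    import torch
    from torch import nn
    from torch.utils.data import Dataset

    from modelscope.trainers import build_trainer


    class ToyDataset(Dataset):

        def __len__(self):
            return 20

        def __getitem__(self, idx):
            return dict(feat=torch.rand((5, )), label=torch.randint(0, 4, (1, )))


    class ToyModel(nn.Module):

        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(5, 4)

        def forward(self, feat, labels):
            logits = self.linear(feat)
            return dict(logits=logits, loss=torch.sum(logits))


    work_dir = tempfile.mkdtemp()
    cfg = {
        'task': 'image_classification',
        'train': {
            'work_dir': work_dir,
            'dataloader': {'batch_size_per_gpu': 2, 'workers_per_gpu': 1},
            'optimizer': {'type': 'SGD', 'lr': 0.01},
            'lr_scheduler': {'type': 'StepLR', 'step_size': 2},
            'hooks': [{'type': 'CheckpointHook', 'interval': 1}]
        }
    }
    cfg_file = os.path.join(work_dir, 'configuration.json')
    with open(cfg_file, 'w') as f:
        json.dump(cfg, f)

    trainer = build_trainer(
        'EpochBasedTrainer',
        dict(cfg_file=cfg_file, model=ToyModel(),
             train_dataset=ToyDataset(), max_epochs=2))
    trainer.train()  # epoch_1.pth and epoch_2.pth are written to work_dir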