From 7b84adc914219afb5eb4173ff80068c31c5b4cdd Mon Sep 17 00:00:00 2001
From: "jiaqi.sjq"
Date: Wed, 26 Oct 2022 19:15:43 +0800
Subject: [PATCH 1/6] [to #42322933] Fix files removed from a local model dir
 not being removed from the remote repo after push_model

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10533214
---
 modelscope/hub/api.py        |  8 ++++++++
 tests/hub/test_hub_upload.py | 15 +++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 00254f16..eacde64a 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -266,6 +266,14 @@ class HubApi:
             logger.info('Create new branch %s' % revision)
             git_wrapper.new_branch(tmp_dir, revision)
             git_wrapper.checkout(tmp_dir, revision)
+        files_in_repo = os.listdir(tmp_dir)
+        for f in files_in_repo:
+            if f[0] != '.':
+                src = os.path.join(tmp_dir, f)
+                if os.path.isfile(src):
+                    os.remove(src)
+                else:
+                    shutil.rmtree(src, ignore_errors=True)
         for f in files_to_save:
             if f[0] != '.':
                 src = os.path.join(model_dir, f)
diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py
index e1f61467..835aa62b 100644
--- a/tests/hub/test_hub_upload.py
+++ b/tests/hub/test_hub_upload.py
@@ -7,7 +7,7 @@ import uuid

 from modelscope.hub.api import HubApi
 from modelscope.hub.constants import Licenses, ModelVisibility
-from modelscope.hub.errors import HTTPError, NotLoginException
+from modelscope.hub.errors import GitError, HTTPError, NotLoginException
 from modelscope.hub.repository import Repository
 from modelscope.utils.constant import ModelFile
 from modelscope.utils.logger import get_logger
@@ -97,6 +97,17 @@ class HubUploadTest(unittest.TestCase):
             revision='new_revision/version1')
         assert os.path.exists(os.path.join(add4_path, 'add4.py'))
         shutil.rmtree(self.repo_path, ignore_errors=True)
+        assert os.path.exists(os.path.join(self.finetune_path, 'add3.py'))
+        os.remove(os.path.join(self.finetune_path, 'add3.py'))
+        self.api.push_model(
+            model_id=self.create_model_name,
+            model_dir=self.finetune_path,
+            revision='new_revision/version1')
+        Repository(
+            model_dir=self.repo_path,
+            clone_from=self.create_model_name,
+            revision='new_revision/version1')
+        assert not os.path.exists(os.path.join(self.repo_path, 'add3.py'))

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_upload_non_exists_repo(self):
@@ -133,7 +144,7 @@ def test_upload_invalid_repo(self):
         logger.info('test upload to invalid repo!')
         self.api.login(TEST_ACCESS_TOKEN1)
-        with self.assertRaises(HTTPError):
+        with self.assertRaises((HTTPError, GitError)):
             self.api.push_model(
                 model_id='%s/%s' % ('speech_tts', 'invalid_model_test'),
                 model_dir=self.finetune_path,

From d0f8547e7ebbcd8108ee1fe83aa85230459b12de Mon Sep 17 00:00:00 2001
From: "hemu.zp"
Date: Wed, 26 Oct 2022 20:58:00 +0800
Subject: [PATCH 2/6] [to #42322933] Fix gpt3 checkpoint loading after
 finetuning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Fix the GPT-3 model failing to load checkpoints saved by finetuning.
2. Add a unit test for the GPT-3 poetry generation model.
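To make the fix concrete, a minimal standalone sketch of the loading problem
(the checkpoint path is hypothetical): a finetuning run saves the weights
nested under a 'state_dict' key, so the loader must unwrap that key before
remapping the module prefixes:

    import torch

    # Hypothetical checkpoint written by a finetune run; the trainer stores
    # the weights under 'state_dict' instead of at the top level.
    state_dict = torch.load('pytorch_model.bin', map_location='cpu')
    if 'state_dict' in state_dict:
        # The fix in gpt3/backbone.py: unwrap before remapping key names.
        state_dict = state_dict['state_dict']
    state_dict = {
        k.replace('model.language_model', 'language_model'): v
        for k, v in state_dict.items()
    }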
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10537209
---
 modelscope/models/nlp/gpt3/backbone.py  |  2 ++
 tests/pipelines/test_text_generation.py | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/modelscope/models/nlp/gpt3/backbone.py b/modelscope/models/nlp/gpt3/backbone.py
index 587c7a9d..4647428e 100644
--- a/modelscope/models/nlp/gpt3/backbone.py
+++ b/modelscope/models/nlp/gpt3/backbone.py
@@ -342,6 +342,8 @@ class GPT3Model(PreTrainedModel):
         state_dict_file = os.path.join(pretrained_model_name_or_path,
                                        ModelFile.TORCH_MODEL_BIN_FILE)
         state_dict = torch.load(state_dict_file)
+        if 'state_dict' in state_dict:
+            state_dict = state_dict['state_dict']
         state_dict = {
             k.replace('model.language_model', 'language_model'): v
             for k, v in state_dict.items()
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index ffb30090..c97f347d 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -38,7 +38,9 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
         self.gpt3_base_model_id = 'damo/nlp_gpt3_text-generation_chinese-base'
         self.gpt3_large_model_id = 'damo/nlp_gpt3_text-generation_chinese-large'
+        self.gpt3_poetry_large_model_id = 'damo/nlp_gpt3_poetry-generation_chinese-large'
         self.gpt3_input = '《故乡》。深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地,'
+        self.gpt3_poetry_input = '天生我材必有用,'

     def run_pipeline_with_model_instance(self, model_id, input):
         model = Model.from_pretrained(model_id)
@@ -115,6 +117,11 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
         self.run_pipeline_with_model_instance(self.palm_model_id_en,
                                               self.palm_input_en)

+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_gpt_poetry_large_with_model_name(self):
+        self.run_pipeline_with_model_id(self.gpt3_poetry_large_model_id,
+                                        self.gpt3_poetry_input)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_gpt_base_with_model_instance(self):
         self.run_pipeline_with_model_instance(self.gpt3_base_model_id,
@@ -125,6 +132,11 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
         self.run_pipeline_with_model_instance(self.gpt3_large_model_id,
                                               self.gpt3_input)

+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_gpt_poetry_large_with_model_instance(self):
+        self.run_pipeline_with_model_instance(self.gpt3_poetry_large_model_id,
+                                              self.gpt3_poetry_input)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_palm(self):
         for model_id, input in ((self.palm_model_id_zh_base,

From 06053761352718d78676cc7b908d6170721e67d9 Mon Sep 17 00:00:00 2001
From: "liugao.lg"
Date: Thu, 27 Oct 2022 09:29:06 +0800
Subject: [PATCH 3/6] [to #42322933] Add OFA finetune support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add finetuning support for OFA.

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10540701
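For reference, a self-contained sketch of the normalized edit distance that
the new NedMetric below computes (an illustrative reimplementation, not part
of the patch itself):

    def ned(pred: str, ref: str) -> float:
        # Levenshtein distance normalized by the longer string's length.
        if pred == ref:
            return 0.0
        v0 = list(range(len(ref) + 1))
        for i, p in enumerate(pred):
            v1 = [i + 1]
            for j, r in enumerate(ref):
                v1.append(min(v1[j] + 1, v0[j + 1] + 1, v0[j] + (p != r)))
            v0 = v1
        return v0[len(ref)] / max(len(pred), len(ref))

    # The metric reports 1 - NED, so an exact match scores 1.0:
    print(1.0 - ned('深度学习', '深度学习'))  # 1.0
    print(1.0 - ned('深度学习', '深度字习'))  # 0.75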
---
 modelscope/metrics/accuracy_metric.py               | 22 +++--
 modelscope/metrics/ned_metric.py                    | 87 +++++++++++++++++++
 .../preprocessors/ofa/ocr_recognition.py            | 22 ++++-
 .../trainers/multi_modal/ofa/ofa_trainer.py         |  4 +-
 .../multi_modal/ofa/ofa_trainer_utils.py            | 26 +++---
 tests/trainers/test_ofa_trainer.py                  | 52 +++++------
 6 files changed, 164 insertions(+), 49 deletions(-)
 create mode 100644 modelscope/metrics/ned_metric.py

diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py
index 1761786e..953ece4c 100644
--- a/modelscope/metrics/accuracy_metric.py
+++ b/modelscope/metrics/accuracy_metric.py
@@ -27,15 +27,21 @@ class AccuracyMetric(Metric):
         label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS
         ground_truths = inputs[label_name]
         eval_results = outputs[label_name]
+        for key in [
+                OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES,
+                OutputKeys.LABELS, OutputKeys.SCORES
+        ]:
+            if key in outputs and outputs[key] is not None:
+                eval_results = outputs[key]
+                break
         assert type(ground_truths) == type(eval_results)
-        if isinstance(ground_truths, list):
-            self.preds.extend(eval_results)
-            self.labels.extend(ground_truths)
-        elif isinstance(ground_truths, np.ndarray):
-            self.preds.extend(eval_results.tolist())
-            self.labels.extend(ground_truths.tolist())
-        else:
-            raise 'only support list or np.ndarray'
+        for truth in ground_truths:
+            self.labels.append(truth)
+        for result in eval_results:
+            if isinstance(result, str):
+                self.preds.append(result.strip().replace(' ', ''))
+            else:
+                self.preds.append(result)

     def evaluate(self):
         assert len(self.preds) == len(self.labels)
diff --git a/modelscope/metrics/ned_metric.py b/modelscope/metrics/ned_metric.py
new file mode 100644
index 00000000..e87bb2c4
--- /dev/null
+++ b/modelscope/metrics/ned_metric.py
@@ -0,0 +1,87 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Dict
+
+import numpy as np
+
+from modelscope.metainfo import Metrics
+from modelscope.outputs import OutputKeys
+from modelscope.utils.registry import default_group
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(group_key=default_group, module_name=Metrics.NED)
+class NedMetric(Metric):
+    """The NED (normalized edit distance) metric computation class.
+
+    This metric class calculates the normalized Levenshtein distance
+    between predicted and reference sentences over whole input batches.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.preds = []
+        self.labels = []
+
+    def add(self, outputs: Dict, inputs: Dict):
+        label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS
+        ground_truths = inputs[label_name]
+        eval_results = outputs[label_name]
+        for key in [
+                OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES,
+                OutputKeys.LABELS, OutputKeys.SCORES
+        ]:
+            if key in outputs and outputs[key] is not None:
+                eval_results = outputs[key]
+                break
+        assert type(ground_truths) == type(eval_results)
+        if isinstance(ground_truths, list):
+            self.preds.extend(eval_results)
+            self.labels.extend(ground_truths)
+        elif isinstance(ground_truths, np.ndarray):
+            self.preds.extend(eval_results.tolist())
+            self.labels.extend(ground_truths.tolist())
+        else:
+            raise Exception('only support list or np.ndarray')
+
+    def evaluate(self):
+        assert len(self.preds) == len(self.labels)
+        return {
+            MetricKeys.NED: (np.asarray([
+                1.0 - NedMetric._distance(pred, ref)
+                for pred, ref in zip(self.preds, self.labels)
+            ])).mean().item()
+        }
+
+    @staticmethod
+    def _distance(pred, ref):
+        if pred is None or ref is None:
+            raise TypeError('Argument (pred or ref) is NoneType.')
+        if pred == ref:
+            return 0.0
+        if len(pred) == 0:
+            return 1.0
+        if len(ref) == 0:
+            return 1.0
+        m_len = max(len(pred), len(ref))
+        if m_len == 0:
+            return 0.0
+
+        def levenshtein(s0, s1):
+            v0 = [0] * (len(s1) + 1)
+            v1 = [0] * (len(s1) + 1)
+
+            for i in range(len(v0)):
+                v0[i] = i
+
+            for i in range(len(s0)):
+                v1[0] = i + 1
+                for j in range(len(s1)):
+                    cost = 1
+                    if s0[i] == s1[j]:
+                        cost = 0
+                    v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
+                v0, v1 = v1, v0
+            return v0[len(s1)]
+
+        return levenshtein(pred, ref) / m_len
diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py
index 1761dbd4..26fff9d2 100644
--- a/modelscope/preprocessors/ofa/ocr_recognition.py
+++ b/modelscope/preprocessors/ofa/ocr_recognition.py
@@ -91,8 +91,24 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):
         ])

     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        image = data['image'] if isinstance(
-            data['image'], Image.Image) else load_image(data['image'])
+        if self.mode == ModeKeys.TRAIN:
+            return self._build_train_sample(data)
+        else:
+            return self._build_infer_sample(data)
+
+    def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        sample = self._build_infer_sample(data)
+        target = data[self.column_map['text']]
+        target = target.translate(self.transtab).strip()
+        target_token_list = target.strip().split()
+        target = ' '.join(target_token_list[:self.max_tgt_length])
+        sample['target'] = self.tokenize_text(target, add_bos=False)
+        sample['prev_output_tokens'] = torch.cat(
+            [self.bos_item, sample['target'][:-1]])
+        return sample
+
+    def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image = self.get_img_pil(data[self.column_map['image']])
         patch_image = self.patch_resize_transform(image)
         prompt = self.cfg.model.get('prompt', '图片上的文字是什么?')
         inputs = self.tokenize_text(prompt)
@@ -102,4 +118,6 @@
             'patch_image': patch_image,
             'patch_mask': torch.tensor([True])
         }
+        if 'text' in self.column_map and self.column_map['text'] in data:
+            sample['label'] = data[self.column_map['text']]
         return sample
diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py
index 02853925..f8028c6c 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -129,9 +129,7 @@ class OFATrainer(EpochBasedTrainer): def train_step(self, model, inputs): model.train() - model_outputs = model.forward(inputs) - loss, sample_size, logging_output = self.criterion( - model_outputs, inputs) + loss, sample_size, logging_output = self.criterion(model, inputs) train_outputs = {'loss': loss} # add model output info to log if 'log_vars' not in train_outputs: diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index 2189a5db..3c38884c 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -123,7 +123,7 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): self.padding_idx = args.tokenizer.pad_token_id self.args = args - def forward(self, output, sample, update_num=0, reduce=True): + def forward(self, model, sample, update_num=0, reduce=True): """Compute the loss for the given sample. Returns a tuple with three elements: @@ -131,11 +131,16 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ + if 'labels' in sample: + del sample['labels'] + if 'samples' in sample: + del sample['samples'] + if self.use_rdrop: construct_rdrop_sample(sample) - + output = model.model(**sample['net_input']) loss, nll_loss, ntokens = self.compute_loss( - output, sample, update_num, reduce=reduce) + output.logits, sample, update_num, reduce=reduce) sample_size = ( sample['target'].size(0) if self.sentence_avg else ntokens) logging_output = { @@ -147,19 +152,18 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): } return loss, sample_size, logging_output - def get_lprobs_and_target(self, net_output, sample): + def get_lprobs_and_target(self, logits, sample): conf = sample['conf'][:, None, None] if 'conf' in sample and sample[ 'conf'] is not None else 1 constraint_masks = None if 'constraint_masks' in sample and sample[ 'constraint_masks'] is not None: constraint_masks = sample['constraint_masks'] - net_output[0].masked_fill_(~constraint_masks, -math.inf) + logits.masked_fill_(~constraint_masks, -math.inf) if self.constraint_start is not None and self.constraint_end is not None: - net_output[0][:, :, 4:self.constraint_start] = -math.inf - net_output[0][:, :, self.constraint_end:] = -math.inf - lprobs = F.log_softmax( - net_output[0], dim=-1, dtype=torch.float32) * conf + logits[:, :, 4:self.constraint_start] = -math.inf + logits[:, :, self.constraint_end:] = -math.inf + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) * conf target = sample['target'] if self.ignore_prefix_size > 0: lprobs = lprobs[:, self.ignore_prefix_size:, :].contiguous() @@ -180,9 +184,9 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): return lprobs.view(-1, lprobs.size(-1)), target.view(-1), constraint_masks - def compute_loss(self, net_output, sample, update_num, reduce=True): + def compute_loss(self, logits, sample, update_num, reduce=True): lprobs, target, constraint_masks = self.get_lprobs_and_target( - net_output, sample) + logits, sample) if constraint_masks is not None: constraint_masks = constraint_masks[target != self.padding_idx] lprobs = lprobs[target != self.padding_idx] diff --git a/tests/trainers/test_ofa_trainer.py 
b/tests/trainers/test_ofa_trainer.py index 06003625..3f68a9fb 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -5,10 +5,10 @@ import unittest import json -from modelscope.metainfo import Metrics, Trainers +from modelscope.metainfo import Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import DownloadMode, ModelFile from modelscope.utils.test_utils import test_level @@ -17,26 +17,27 @@ class TestOfaTrainer(unittest.TestCase): def setUp(self) -> None: self.finetune_cfg = \ {'framework': 'pytorch', - 'task': 'image-captioning', + 'task': 'ocr-recognition', 'model': {'type': 'ofa', 'beam_search': {'beam_size': 5, - 'max_len_b': 16, + 'max_len_b': 64, 'min_len': 1, 'no_repeat_ngram_size': 0}, 'seed': 7, - 'max_src_length': 256, - 'language': 'en', + 'max_src_length': 128, + 'language': 'zh', 'gen_type': 'generation', 'patch_image_size': 480, + 'is_document': False, 'max_image_size': 480, 'imagenet_default_mean_and_std': False}, - 'pipeline': {'type': 'image-captioning'}, - 'dataset': {'column_map': {'text': 'caption'}}, - 'train': {'work_dir': 'work/ckpts/caption', + 'pipeline': {'type': 'ofa-ocr-recognition'}, + 'dataset': {'column_map': {'text': 'label'}}, + 'train': {'work_dir': 'work/ckpts/recognition', # 'launcher': 'pytorch', 'max_epochs': 1, 'use_fp16': True, - 'dataloader': {'batch_size_per_gpu': 1, 'workers_per_gpu': 0}, + 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, 'lr_scheduler': {'name': 'polynomial_decay', 'warmup_proportion': 0.01, 'lr_end': 1e-07}, @@ -57,47 +58,48 @@ class TestOfaTrainer(unittest.TestCase): 'report_accuracy': False, 'sample_patch_num': 196, 'sentence_avg': False, - 'use_rdrop': False}, + 'use_rdrop': True}, 'hooks': [{'type': 'BestCkptSaverHook', - 'metric_key': 'bleu-4', + 'metric_key': 'accuracy', 'interval': 100}, {'type': 'TextLoggerHook', 'interval': 1}, {'type': 'IterTimerHook'}, {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, - 'metrics': [{'type': 'bleu', - 'eval_tokenized_bleu': False, - 'ref_name': 'labels', - 'hyp_name': 'caption'}]}, + 'metrics': [{'type': 'accuracy'}]}, 'preprocessor': []} @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer_std(self): - WORKSPACE = './workspace/ckpts/caption' + WORKSPACE = './workspace/ckpts/recognition' os.makedirs(WORKSPACE, exist_ok=True) config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) with open(config_file, 'w') as writer: json.dump(self.finetune_cfg, writer) - pretrained_model = 'damo/ofa_image-caption_coco_distilled_en' + pretrained_model = 'damo/ofa_ocr-recognition_scene_base_zh' args = dict( model=pretrained_model, work_dir=WORKSPACE, train_dataset=MsDataset.load( - 'coco_2014_caption', + 'ocr_fudanvi_zh', + subset_name='scene', namespace='modelscope', - split='train[:20]'), + split='train[:200]', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), eval_dataset=MsDataset.load( - 'coco_2014_caption', + 'ocr_fudanvi_zh', + subset_name='scene', namespace='modelscope', - split='validation[:10]'), - metrics=[Metrics.BLEU], + split='test[:20]', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), cfg_file=config_file) trainer = build_trainer(name=Trainers.ofa, default_args=args) trainer.train() - self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, - 
os.listdir(os.path.join(WORKSPACE, 'output')))
+        self.assertIn(
+            ModelFile.TORCH_MODEL_BIN_FILE,
+            os.listdir(os.path.join(WORKSPACE, ModelFile.TRAIN_OUTPUT_DIR)))
         shutil.rmtree(WORKSPACE)

From f9e12669baaa78e32ea1d552e6f68010f41fb56e Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Thu, 27 Oct 2022 09:33:19 +0800
Subject: [PATCH 4/6] [to #42322933] Add default mapping for preprocessors

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10536603
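For illustration, a trimmed sketch of the fallback this patch introduces
(the map below keeps a single entry and the model directory is hypothetical):
when configuration.json carries no usable preprocessor 'type', the
(model type, task) pair from the config now selects a default preprocessor
instead of returning None:

    from modelscope.metainfo import Models, Preprocessors
    from modelscope.utils.config import ConfigDict
    from modelscope.utils.constant import Tasks

    # One entry of the new PREPROCESSOR_MAP in preprocessors/base.py.
    PREPROCESSOR_MAP = {
        (Models.structbert, Tasks.sentence_similarity):
        Preprocessors.sen_cls_tokenizer,
    }

    model_type, task = Models.structbert, Tasks.sentence_similarity
    if (model_type, task) in PREPROCESSOR_MAP:
        sub_cfg = ConfigDict({
            'type': PREPROCESSOR_MAP[(model_type, task)],
            'model_dir': '/path/to/model',  # hypothetical local dir
        })
        # build_preprocessor(sub_cfg, field_name) then constructs it.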
---
 .../nlp/faq_question_answering_pipeline.py |   4 -
 modelscope/preprocessors/base.py           | 157 ++++++++++++++++--
 2 files changed, 143 insertions(+), 18 deletions(-)

diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
index fd614e91..3917f20c 100644
--- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
+++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
@@ -26,10 +26,6 @@ class FaqQuestionAnsweringPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = Preprocessor.from_pretrained(
                 model.model_dir, **kwargs)
-        if preprocessor is None:
-            from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
-            preprocessor = FaqQuestionAnsweringPreprocessor(
-                model.model_dir, **kwargs)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)

     def _sanitize_parameters(self, **pipeline_parameters):
diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py
index c2716a13..db14ba47 100644
--- a/modelscope/preprocessors/base.py
+++ b/modelscope/preprocessors/base.py
@@ -4,7 +4,8 @@ from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Any, Dict, Optional, Sequence

-from modelscope.utils.config import Config
+from modelscope.metainfo import Models, Preprocessors
+from modelscope.utils.config import Config, ConfigDict
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys, Tasks
 from modelscope.utils.hub import read_config, snapshot_download
 from modelscope.utils.logger import get_logger
@@ -12,6 +13,112 @@ from .builder import build_preprocessor

 logger = get_logger(__name__)

+PREPROCESSOR_MAP = {
+    # nlp
+    # bart
+    (Models.bart, Tasks.text_error_correction):
+    Preprocessors.text_error_correction,
+
+    # bert
+    (Models.bert, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.document_segmentation):
+    Preprocessors.document_segmentation,
+    (Models.bert, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+    (Models.bert, Tasks.sentence_embedding):
+    Preprocessors.sentence_embedding,
+    (Models.bert, Tasks.text_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.nli):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.sentiment_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.sentence_similarity):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.zero_shot_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.text_ranking):
+    Preprocessors.text_ranking,
+    (Models.bert, Tasks.part_of_speech):
+    Preprocessors.token_cls_tokenizer,
+    (Models.bert, Tasks.token_classification):
+    Preprocessors.token_cls_tokenizer,
+    (Models.bert, Tasks.word_segmentation):
+    Preprocessors.token_cls_tokenizer,
+
+    # bloom
+    (Models.bloom, Tasks.backbone):
+    Preprocessors.text_gen_tokenizer,
+
+    # gpt_neo
+    # gpt_neo may have different preprocessors, but for now there is only one
+    (Models.gpt_neo, Tasks.backbone):
+    Preprocessors.sentence_piece,
+
+    # gpt3 has different preprocessors for different model sizes, so it is
+    # not listed here.
+
+    # palm_v2
+    (Models.palm, Tasks.backbone):
+    Preprocessors.text_gen_tokenizer,
+
+    # T5
+    (Models.T5, Tasks.backbone):
+    Preprocessors.text2text_gen_preprocessor,
+    (Models.T5, Tasks.text2text_generation):
+    Preprocessors.text2text_gen_preprocessor,
+
+    # deberta_v2
+    (Models.deberta_v2, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.deberta_v2, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+
+    # ponet
+    (Models.ponet, Tasks.fill_mask):
+    Preprocessors.fill_mask_ponet,
+
+    # structbert
+    (Models.structbert, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+    (Models.structbert, Tasks.faq_question_answering):
+    Preprocessors.faq_question_answering_preprocessor,
+    (Models.structbert, Tasks.text_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.nli):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.sentiment_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.sentence_similarity):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.zero_shot_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.part_of_speech):
+    Preprocessors.token_cls_tokenizer,
+    (Models.structbert, Tasks.token_classification):
+    Preprocessors.token_cls_tokenizer,
+    (Models.structbert, Tasks.word_segmentation):
+    Preprocessors.token_cls_tokenizer,
+
+    # veco
+    (Models.veco, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+    (Models.veco, Tasks.text_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.nli):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.sentiment_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.sentence_similarity):
+    Preprocessors.sen_cls_tokenizer,
+
+    # space
+}
+

 class Preprocessor(ABC):

@@ -56,37 +163,59 @@
         if 'task' in kwargs:
             task = kwargs.pop('task')
         field_name = Tasks.find_field_by_task(task)
+        sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val'
+
         if not hasattr(cfg, 'preprocessor'):
             logger.error('No preprocessor field found in cfg.')
-            return None
-
-        sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val'
+            preprocessor_cfg = ConfigDict()
+        else:
+            preprocessor_cfg = cfg.preprocessor

-        if 'type' not in cfg.preprocessor:
-            if sub_key in cfg.preprocessor:
-                sub_cfg = getattr(cfg.preprocessor, sub_key)
+        if 'type' not in preprocessor_cfg:
+            if sub_key in preprocessor_cfg:
+                sub_cfg = getattr(preprocessor_cfg, sub_key)
             else:
                 logger.error(
                     f'No {sub_key} key and type key found in '
                     f'preprocessor domain of configuration.json file.')
-                return None
+                sub_cfg = preprocessor_cfg
         else:
-            sub_cfg = cfg.preprocessor
+            sub_cfg = preprocessor_cfg

-        if len(sub_cfg):
+        sub_cfg.update({'model_dir': model_dir})
+        sub_cfg.update(kwargs)
+        if 'type' in sub_cfg:
             if isinstance(sub_cfg, Sequence):
                 # TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
                 # and add mode for Compose or other plans
                 raise NotImplementedError('Not supported yet!')
             sub_cfg = deepcopy(sub_cfg)
-            sub_cfg.update({'model_dir': model_dir})
-            sub_cfg.update(kwargs)
+
             preprocessor = build_preprocessor(sub_cfg, field_name)
         else:
             logger.error(
                 f'Cannot find available config to build preprocessor at mode {preprocessor_mode}, '
-                f'please check the preprocessor field in the configuration.json file.'
+                f'current config: {sub_cfg}. Trying to build by task and model information.'
            )
-            return None
+            model_cfg = getattr(cfg, 'model', ConfigDict())
+            model_type = model_cfg.type if hasattr(
+                model_cfg, 'type') else getattr(model_cfg, 'model_type', None)
+            if task is None or model_type is None:
+                logger.error(
+                    f'Find task: {task}, model type: {model_type}. '
+                    f'Insufficient information to build preprocessor, skip building preprocessor'
+                )
+                return None
+            if (model_type, task) not in PREPROCESSOR_MAP:
+                logger.error(
+                    f'No preprocessor key {(model_type, task)} found in PREPROCESSOR_MAP, '
+                    f'skip building preprocessor.')
+                return None
+
+            sub_cfg = ConfigDict({
+                'type': PREPROCESSOR_MAP[(model_type, task)],
+                **sub_cfg
+            })
+            preprocessor = build_preprocessor(sub_cfg, field_name)
         preprocessor.mode = preprocessor_mode
         return preprocessor

From 69104c0f8ae88a242b4669d152377cac04d2c274 Mon Sep 17 00:00:00 2001
From: "hemu.zp"
Date: Thu, 27 Oct 2022 09:52:05 +0800
Subject: [PATCH 5/6] [to #42322933] Refactor text generation model outputs
 and fix some bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Split the model.forward call in single_gpu_test and multi_gpu_test out
   into EpochBasedTrainer.evaluation_step, giving models that do not call
   forward during evaluation more flexibility.
2. Refactor the text generation models so that Model-level inputs and
   outputs are uniformly Tensors; decoding from Tensor to str now happens
   in the pipeline.
3. Add postprocessing in the pipeline for spaces in mixed Chinese/English
   text (including Chinese punctuation), so that mixed-language output is
   decoded correctly.
4. Add TextGenerationTrainer, fixing metrics being computed on the single
   token that forward outputs during evaluation for some models.
5. Fix rouge failing to accept empty strings.

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10473768
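Item 3 is easiest to see in isolation; the sketch below reimplements the new
pipeline helper for illustration only:

    def remove_space_between_chinese_chars(decoded: str) -> str:
        # Collapse the spaces the tokenizer inserts between consecutive
        # Chinese characters while keeping spaces between English words.
        def is_chinese_char(word: str) -> bool:
            return len(word) == 1 and (
                '\u4e00' <= word <= '\u9fa5'
                or word in (',', '。', ';', ':', '!', '?', '《', '》'))

        words, out, start = decoded.split(' '), [], -1
        for i, word in enumerate(words):
            if is_chinese_char(word):
                if start == -1:
                    start = i
            else:
                if start != -1:
                    out.append(''.join(words[start:i]))
                    start = -1
                out.append(word)
        if start != -1:
            out.append(''.join(words[start:]))
        return ' '.join(out)

    print(remove_space_between_chinese_chars('深 度 学 习 与 deep learning'))
    # -> '深度学习与 deep learning'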
---
 modelscope/metainfo.py                         |  1 +
 modelscope/metrics/text_generation_metric.py   | 17 +++++--
 modelscope/models/nlp/__init__.py              |  3 +-
 modelscope/models/nlp/bloom/__init__.py        | 19 ++++++++
 modelscope/models/nlp/bloom/backbone.py        |  4 +-
 modelscope/models/nlp/gpt3/text_generation.py  |  8 +---
 modelscope/models/nlp/palm_v2/backbone.py      |  4 +-
 .../models/nlp/palm_v2/text_generation.py      | 41 ++---------------
 .../pipelines/nlp/text_generation_pipeline.py  | 45 ++++++++++++++++---
 modelscope/trainers/nlp/__init__.py            |  4 +-
 .../trainers/nlp/text_generation_trainer.py    | 36 +++++++++++++++
 modelscope/trainers/trainer.py                 | 26 ++++++++++-
 modelscope/trainers/utils/inference.py         | 36 ++++-----------
 .../trainers/test_finetune_text_generation.py  |  4 +-
 tests/trainers/utils/test_inference.py         | 11 ++++-
 15 files changed, 166 insertions(+), 93 deletions(-)
 create mode 100644 modelscope/models/nlp/bloom/__init__.py
 create mode 100644 modelscope/trainers/nlp/text_generation_trainer.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 7944d1ed..419ec919 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -313,6 +313,7 @@ class Trainers(object):
     nlp_base_trainer = 'nlp-base-trainer'
     nlp_veco_trainer = 'nlp-veco-trainer'
     nlp_text_ranking_trainer = 'nlp-text-ranking-trainer'
+    text_generation_trainer = 'text-generation-trainer'

     # audio trainers
     speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py
index 90b80425..9bca7cf3 100644
--- a/modelscope/metrics/text_generation_metric.py
+++ b/modelscope/metrics/text_generation_metric.py
@@ -36,20 +36,31 @@ class TextGenerationMetric(Metric):
             for char in string
         ]).split())

-    def add(self, outputs: Dict[str, List[str]], inputs: Dict = None):
-        ground_truths = outputs['tgts']
+    def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]):
+        ground_truths = inputs['tgts']
         eval_results = outputs['preds']
         for truth in ground_truths:
             self.tgts.append(self.rebuild_str(truth))
         for result in eval_results:
             self.preds.append(self.rebuild_str(result))

+    def _check(self, pred: str, tgt: str) -> bool:
+
+        def remove_useless(string: str) -> str:
+            return string.replace(' ', '').replace('.', '')
+
+        return bool(remove_useless(pred) and remove_useless(tgt))
+
     def evaluate(self):
+        assert self.preds, 'preds in TextGenerationMetric must not be empty!'
+        tmp = [(pred, tgt) for pred, tgt in zip(self.preds, self.tgts)
+               if self._check(pred, tgt)]
+        preds, tgts = zip(*tmp)

         def mean(iter: Iterable) -> float:
             return sum(iter) / len(self.preds)

-        rouge_scores = self.rouge.get_scores(hyps=self.preds, refs=self.tgts)
+        rouge_scores = self.rouge.get_scores(hyps=preds, refs=tgts)
         rouge_1 = mean(map(lambda score: score['rouge-1']['f'], rouge_scores))
         rouge_l = mean(map(lambda score: score['rouge-l']['f'], rouge_scores))
         pred_split = tuple(pred.split(' ') for pred in self.preds)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index d4562f10..ccb2d382 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -49,7 +49,7 @@ if TYPE_CHECKING:
                        VecoForSequenceClassification,
                        VecoForTokenClassification, VecoModel, VecoTokenizer,
                        VecoTokenizerFast)
-
+    from .bloom import BloomModel
 else:
     _import_structure = {
         'backbones': ['SbertModel'],
@@ -107,6 +107,7 @@ else:
         'sentence_embedding': ['SentenceEmbedding'],
         'T5': ['T5ForConditionalGeneration'],
         'gpt_neo': ['GPTNeoModel'],
+        'bloom': ['BloomModel'],
     }

     import sys
diff --git a/modelscope/models/nlp/bloom/__init__.py b/modelscope/models/nlp/bloom/__init__.py
new file mode 100644
index 00000000..ad93252f
--- /dev/null
+++ b/modelscope/models/nlp/bloom/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .backbone import BloomModel
+else:
+    _import_structure = {
+        'backbone': ['BloomModel'],
+    }
+    import sys
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/bloom/backbone.py b/modelscope/models/nlp/bloom/backbone.py
index b6bd315e..f8ea7b2f 100644
--- a/modelscope/models/nlp/bloom/backbone.py
+++ b/modelscope/models/nlp/bloom/backbone.py
@@ -4,10 +4,10 @@ from transformers import BloomModel as BloomModelTransform

 from modelscope.metainfo import Models
 from modelscope.models.builder import BACKBONES
-from modelscope.utils.constant import Fields
+from modelscope.utils.constant import Tasks


-@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.bloom)
+@BACKBONES.register_module(group_key=Tasks.backbone, module_name=Models.bloom)
 class BloomModel(BloomModelTransform):

     def __init__(self, **kwargs):
diff --git a/modelscope/models/nlp/gpt3/text_generation.py b/modelscope/models/nlp/gpt3/text_generation.py
index d686ea30..b8b705a5 100644
--- a/modelscope/models/nlp/gpt3/text_generation.py
+++ b/modelscope/models/nlp/gpt3/text_generation.py
@@ -42,7 +42,7 @@ class GPT3ForTextGeneration(TorchModel):
         """
         return self.model(**input)

-    def generate(self, input: Dict[str, Tensor]) -> Dict[str, str]:
+    def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         assert 'input_ids' in input, "generate function must accept 'input_ids' key"
         input_ids = input['input_ids']
         if 'attention_mask' in input:
@@ -59,8 +59,4 @@ class GPT3ForTextGeneration(TorchModel):
         gen_params['top_k'] = input.pop('top_k', 10)
         gen_params['top_p'] = input.pop('top_p', None)
         sample_output = self.model.generate(**gen_params)
-        return {
-            OutputKeys.TEXT:
-            self.tokenizer.decode(sample_output[0],
-                                  skip_special_tokens=True).replace(' ', '')
-        }
+        return {'sequences': sample_output[0]}
diff --git a/modelscope/models/nlp/palm_v2/backbone.py b/modelscope/models/nlp/palm_v2/backbone.py
index 3e0ff805..afee2e3f 100644
--- a/modelscope/models/nlp/palm_v2/backbone.py
+++ b/modelscope/models/nlp/palm_v2/backbone.py
@@ -1314,8 +1314,8 @@ class Translator(object):

         return results

-    def __call__(self, input_ids: torch.Tensor,
-                 attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
+    def __call__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
+                 **kwargs) -> Dict[str, torch.Tensor]:
         batch = self.Batch(
             batch_size=input_ids.size()[0],
             src=input_ids,
diff --git a/modelscope/models/nlp/palm_v2/text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py
index 2c37afd6..d83860db 100644
--- a/modelscope/models/nlp/palm_v2/text_generation.py
+++ b/modelscope/models/nlp/palm_v2/text_generation.py
@@ -29,22 +29,6 @@ class PalmForTextGeneration(TorchModel):
         self.tokenizer = self.model.tokenizer
         self.generator = Translator(self.model)

-    def _evaluate_postprocess(self, ids_list: List[List[int]]) -> List[str]:
-        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
-                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
-                               ('[unused2]', ''), ('[CLS]', ''),
-                               ('[UNK]', ''), (' ', ''))
-        replace_tokens_roberta = ((r' +', ' '), ('<mask>', '. '),
-                                  ('<pad>', ''), ('<s>', ''), ('</s>', ''),
-                                  ('<unk>', ' '), ('<q>', '. '))
-
-        replace_tokens = replace_tokens_roberta \
-            if self.model.config.encoder == 'roberta' else replace_tokens_bert
-        strings = [self.tokenizer.decode(pred_ids) for pred_ids in ids_list]
-        for _old, _new in replace_tokens:
-            strings = [s.replace(_old, _new) for s in strings]
-        return strings
-
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """return the result by the model
@@ -57,29 +41,10 @@ class PalmForTextGeneration(TorchModel):
             {
                 'loss': Tensor([12.34]), # loss for backward
             }
-        or
-        {
-            'preds': List["hello word"...] # the predicted strings
-            'tgts': List["hello world"...] # target strings
-        }
         """
-        if self.training:
-            return self.model(**input)
-        else:
-            outputs = self.generator(input['input_ids'],
-                                     input['attention_mask'])
-            preds = outputs['predictions']
-            pred_ids_list = [
-                pred_batch[0].cpu().numpy().tolist() for pred_batch in preds
-            ]
-            tgt_ids_list = input['labels'].cpu().numpy().tolist()
-            return {
-                'preds': self._evaluate_postprocess(pred_ids_list),
-                'tgts': self._evaluate_postprocess(tgt_ids_list)
-            }
+        return self.model(**input)

-    def generate(self, input: Dict[str, Tensor]) -> Dict[str, str]:
+    def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         outputs = self.generator(**input)
         preds = outputs['predictions']
-        pred_ids_list = [preds[0][0].cpu().numpy().tolist()]
-        return {OutputKeys.TEXT: self._evaluate_postprocess(pred_ids_list)[0]}
+        return {'sequences': [pred[0] for pred in preds]}
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 28acebb4..2d5b664f 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -53,7 +53,7 @@ class TextGenerationPipeline(Pipeline):
         model = model if isinstance(model,
                                     Model) else Model.from_pretrained(model)
         cfg = read_config(model.model_dir)
-        self.postprocessor = cfg.pop('postprocessor', None)
+        self.postprocessor = cfg.pop('postprocessor', 'decode')
         if preprocessor is None:
             preprocessor_cfg = cfg.preprocessor
             preprocessor_cfg.update({
@@ -78,8 +78,37 @@ class TextGenerationPipeline(Pipeline):
         with torch.no_grad():
             return self.model.generate(inputs, **forward_params)

-    def sentence_piece(self, inputs) -> Dict[str, Tensor]:
-        return self.preprocessor.tokenizer.decode(inputs.tolist()[0])
+    def _is_chinese_char(self, word: str):
+        chinese_punctuations = (',', '。', ';', ':', '!', '?', '《', '》')
+        return len(word) == 1 \
+            and ('\u4e00' <= word <= '\u9fa5' or word in chinese_punctuations)
+
+    def _remove_space_between_chinese_chars(self, decoded: str):
+        old_word_list = decoded.split(' ')
+        new_word_list = []
+        start = -1
+        for i, word in enumerate(old_word_list):
+            if self._is_chinese_char(word):
+                if start == -1:
+                    start = i
+            else:
+                if start != -1:
+                    new_word_list.append(''.join(old_word_list[start:i]))
+                    start = -1
+                new_word_list.append(word)
+        if start != -1:
+            new_word_list.append(''.join(old_word_list[start:]))
+        return ' '.join(new_word_list)
+
+    def decode(self, inputs) -> str:
+        tokenizer = self.preprocessor.tokenizer
+        return tokenizer.decode(inputs.tolist(), skip_special_tokens=True)
+
+    def roberta(self, inputs) -> str:
+        tokenizer = self.preprocessor.tokenizer
+        decoded = tokenizer.decode(inputs.tolist())
+        return decoded.replace('<mask>', '. ').replace(
+            '<q>', '. ').replace('<pad>', '')

     def postprocess(self, inputs: Dict[str, Tensor],
                     **postprocess_params) -> Dict[str, str]:
@@ -91,7 +120,9 @@ class TextGenerationPipeline(Pipeline):
         Returns:
             Dict[str, str]: the prediction results
         """
-        return inputs if self.postprocessor is None else {
-            OutputKeys.TEXT:
-            getattr(self, self.postprocessor.replace('-', '_'))(inputs)
-        }
+        inputs = inputs['sequences']
+        if isinstance(inputs, list):
+            inputs = inputs[0]
+        decoded = getattr(self, self.postprocessor)(inputs)
+        text = self._remove_space_between_chinese_chars(decoded)
+        return {OutputKeys.TEXT: text}
diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py
index 22f2cfe6..e3c39cf2 100644
--- a/modelscope/trainers/nlp/__init__.py
+++ b/modelscope/trainers/nlp/__init__.py
@@ -7,11 +7,13 @@ if TYPE_CHECKING:
     from .sequence_classification_trainer import SequenceClassificationTrainer
     from .csanmt_translation_trainer import CsanmtTranslationTrainer
     from .text_ranking_trainer import TextRankingTrainer
+    from .text_generation_trainer import TextGenerationTrainer
 else:
     _import_structure = {
         'sequence_classification_trainer': ['SequenceClassificationTrainer'],
         'csanmt_translation_trainer': ['CsanmtTranslationTrainer'],
-        'text_ranking_trainer': ['TextRankingTrainer']
+        'text_ranking_trainer': ['TextRankingTrainer'],
+        'text_generation_trainer': ['TextGenerationTrainer'],
     }

     import sys
diff --git a/modelscope/trainers/nlp/text_generation_trainer.py b/modelscope/trainers/nlp/text_generation_trainer.py
new file mode 100644
index 00000000..0e26f153
--- /dev/null
+++ b/modelscope/trainers/nlp/text_generation_trainer.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from collections.abc import Mapping
+
+import torch
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import NlpEpochBasedTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.utils.file_utils import func_receive_dict_inputs
+
+
+@TRAINERS.register_module(module_name=Trainers.text_generation_trainer)
+class TextGenerationTrainer(NlpEpochBasedTrainer):
+
+    def _decode(self, tokens):
+        tokenizer = self.eval_preprocessor.tokenizer
+        return tokenizer.decode(tokens.tolist(), skip_special_tokens=True)
+
+    def evaluation_step(self, data):
+        model = self.model
+        model.eval()
+
+        with torch.no_grad():
+            if isinstance(
+                    data,
+                    Mapping) and not func_receive_dict_inputs(model.generate):
+                result = model.generate(**data)
+            else:
+                result = model.generate(data)
+
+        result['preds'] = [self._decode(seq) for seq in result['sequences']]
+        data['tgts'] = [self._decode(seq) for seq in data['labels']]
+        assert len(result['preds']) == len(data['tgts'])
+
+        return result
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index 605136e5..f660a55a 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -855,6 +855,28 @@ class EpochBasedTrainer(BaseTrainer):

         self.invoke_hook(TrainerStages.after_run)

+    def evaluation_step(self, data):
+        """Perform an evaluation step on a batch of inputs.
+
+        Subclass and override to inject custom behavior.
+ + """ + model = self.model + model.eval() + + if is_parallel(model): + receive_dict_inputs = func_receive_dict_inputs( + model.module.forward) + else: + receive_dict_inputs = func_receive_dict_inputs(model.forward) + + with torch.no_grad(): + if isinstance(data, Mapping) and not receive_dict_inputs: + result = model.forward(**data) + else: + result = model.forward(data) + return result + def evaluation_loop(self, data_loader, metric_classes): """ Evaluation loop used by `EpochBasedTrainer.evaluate()`. @@ -862,7 +884,7 @@ class EpochBasedTrainer(BaseTrainer): if self._dist: from modelscope.trainers.utils.inference import multi_gpu_test metric_values = multi_gpu_test( - self.model, + self, data_loader, device=self.device, tmpdir=None, @@ -872,7 +894,7 @@ class EpochBasedTrainer(BaseTrainer): else: from modelscope.trainers.utils.inference import single_gpu_test metric_values = single_gpu_test( - self.model, + self, data_loader, device=self.device, metric_classes=metric_classes, diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index 1f8f8ed0..d6187b5f 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -4,29 +4,25 @@ import logging import os import pickle import shutil -import time -from collections.abc import Mapping import torch from torch import distributed as dist from tqdm import tqdm -from modelscope.trainers.parallel.utils import is_parallel from modelscope.utils.data_utils import to_device -from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) -def single_gpu_test(model, +def single_gpu_test(trainer, data_loader, device, metric_classes=None, data_loader_iters=None): - """Test model with a single gpu. + """Test model in EpochBasedTrainer with a single gpu. Args: - model (nn.Module): Model to be tested. + trainer (modelscope.trainers.EpochBasedTrainer): Trainer to be tested. data_loader (nn.Dataloader): Pytorch data loader. device (str | torch.device): The target device for the data. metric_classes (List): List of Metric class that uses to collect metrics @@ -35,7 +31,6 @@ def single_gpu_test(model, Returns: list: The prediction results. """ - model.eval() dataset = data_loader.dataset progress_with_iters = False if data_loader_iters is None: @@ -55,12 +50,7 @@ def single_gpu_test(model, with tqdm(total=data_len, desc=desc) as pbar: for i, data in enumerate(data_loader): data = to_device(data, device) - with torch.no_grad(): - if isinstance(data, Mapping) and not func_receive_dict_inputs( - model.forward): - result = model.forward(**data) - else: - result = model.forward(data) + result = trainer.evaluation_step(data) if metric_classes is not None: for metric_cls in metric_classes: metric_cls.add(result, data) @@ -88,14 +78,14 @@ def single_gpu_test(model, return metric_values -def multi_gpu_test(model, +def multi_gpu_test(trainer, data_loader, device, tmpdir=None, gpu_collect=False, metric_classes=None, data_loader_iters_per_gpu=None): - """Test model with multiple gpus. + """Test model in EpochBasedTrainer with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting @@ -104,7 +94,7 @@ def multi_gpu_test(model, different gpus to ``tmpdir`` and collects them by the rank 0 worker. Args: - model (nn.Module): Model to be tested. + trainer (modelscope.trainers.EpochBasedTrainer): Trainer to be tested. 
        data_loader (nn.Dataloader): Pytorch data loader.
        device: (str | torch.device): The target device for the data.
        tmpdir (str): Path of directory to save the temporary results from
@@ -115,7 +105,6 @@
     Returns:
         list: The prediction results.
     """
-    model.eval()
     results = []
     data_list = []
     dataset = data_loader.dataset
@@ -138,21 +127,12 @@
         data_len = data_loader_iters_per_gpu * world_size
         desc = 'Total test iterations with multi gpus'

-    if is_parallel(model):
-        receive_dict_inputs = func_receive_dict_inputs(model.module.forward)
-    else:
-        receive_dict_inputs = func_receive_dict_inputs(model.forward)
-
     count = 0
     with tqdm(total=data_len, desc=desc) as pbar:
         for i, data in enumerate(data_loader):
             data = to_device(data, device)
             data_list.append(data)
-            with torch.no_grad():
-                if isinstance(data, Mapping) and not receive_dict_inputs:
-                    result = model.forward(**data)
-                else:
-                    result = model.forward(data)
+            result = trainer.evaluation_step(data)
             results.append(result)

             if isinstance(data, dict):
diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py
index 6aefa969..63d4577b 100644
--- a/tests/trainers/test_finetune_text_generation.py
+++ b/tests/trainers/test_finetune_text_generation.py
@@ -59,7 +59,7 @@ class TestFinetuneTextGeneration(unittest.TestCase):
             work_dir=self.tmp_dir)

         trainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.text_generation_trainer, default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@@ -98,7 +98,7 @@ class TestFinetuneTextGeneration(unittest.TestCase):
             work_dir=self.tmp_dir)

         trainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.text_generation_trainer, default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
diff --git a/tests/trainers/utils/test_inference.py b/tests/trainers/utils/test_inference.py
index 23561734..37e202e3 100644
--- a/tests/trainers/utils/test_inference.py
+++ b/tests/trainers/utils/test_inference.py
@@ -12,6 +12,7 @@ from modelscope.metrics.builder import MetricKeys
 from modelscope.metrics.sequence_classification_metric import \
     SequenceClassificationMetric
 from modelscope.models.base import Model
+from modelscope.trainers import EpochBasedTrainer
 from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
 from modelscope.utils.test_utils import (DistributedTestCase,
                                          create_dummy_test_dataset, test_level)
@@ -36,6 +37,12 @@ class DummyModel(nn.Module, Model):
         return dict(logits=x, loss=loss)


+class DummyTrainer(EpochBasedTrainer):
+
+    def __init__(self, model):
+        self.model = model
+
+
 def test_func(dist=False):
     dummy_model = DummyModel()
     dataset = dummy_dataset.to_torch_dataset()
@@ -62,8 +69,10 @@ def test_func(dist=False):
     else:
         test_func = single_gpu_test

+    dummy_trainer = DummyTrainer(dummy_model)
+
     metric_results = test_func(
-        dummy_model,
+        dummy_trainer,
         dummy_loader,
         device=device,
         metric_classes=[metric_class])

From de708dd5183e4325a90ec756312f7b014df298f5 Mon Sep 17 00:00:00 2001
From: "yingda.chen"
Date: Thu, 27 Oct 2022 10:12:05 +0800
Subject: [PATCH 6/6] Add basic remap column wrapper

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10539917

* add basic remap column wrapper
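A usage sketch of the new wrapper (dataset name and columns match the tests
below):

    from modelscope.msdatasets import MsDataset

    ds = MsDataset.load('DuReader_robust-QG')
    # remap_columns resets any active format and hands back the underlying
    # hf dataset with the given columns renamed.
    train_dataset = ds['train'].remap_columns({
        'text1': 'src_txt',
        'text2': 'tgt_txt'
    })
    print('src_txt' in train_dataset.column_names)  # True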
---
 modelscope/msdatasets/ms_dataset.py            | 12 +++++++++++
 tests/trainers/test_finetune_mplug.py          | 21 +++++++++----------
 .../trainers/test_finetune_text_generation.py  | 14 +++++++++----
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index ad900bab..e90f397b 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -563,6 +563,18 @@ class MsDataset:
             self._hf_ds.reset_format()
         return self._hf_ds

+    def remap_columns(self, column_mapping: Dict[str, str]) -> Dataset:
+        """
+        Rename columns and return the underlying hf dataset directly.
+        TODO: support native MsDataset column rename.
+
+        Args:
+            column_mapping: the mapping of the original and new column names
+
+        Returns:
+            The underlying hf dataset
+        """
+        self._hf_ds.reset_format()
+        return self._hf_ds.rename_columns(column_mapping)
+
     @staticmethod
     def upload(object_name: str,
                local_file_path: str,
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 72196fba..4972a731 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -24,17 +24,16 @@ class TestFinetuneMPlug(unittest.TestCase):
         datadict = MsDataset.load(
             'coco_captions_small_slice',
             download_mode=DownloadMode.FORCE_REDOWNLOAD)
-        self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
-            lambda _: {
-                'question': 'what the picture describes?'
-            }).rename_column('image:FILE',
-                             'image').rename_column('answer:Value', 'answer'))
-        self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map(
-            lambda _: {
-                'question': 'what the picture describes?'
-            }).rename_column('image:FILE',
-                             'image').rename_column('answer:Value', 'answer'))
-
+        self.train_dataset = MsDataset(
+            datadict['train'].remap_columns({
+                'image:FILE': 'image',
+                'answer:Value': 'answer'
+            }).map(lambda _: {'question': 'what the picture describes?'}))
+        self.test_dataset = MsDataset(
+            datadict['test'].remap_columns({
+                'image:FILE': 'image',
+                'answer:Value': 'answer'
+            }).map(lambda _: {'question': 'what the picture describes?'}))
         self.max_epochs = 2

     def tearDown(self):
diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py
index 63d4577b..59bef51c 100644
--- a/tests/trainers/test_finetune_text_generation.py
+++ b/tests/trainers/test_finetune_text_generation.py
@@ -130,10 +130,16 @@ class TestFinetuneTextGeneration(unittest.TestCase):
     def test_finetune_cnndm(self):
         from modelscope.msdatasets import MsDataset
         dataset_dict = MsDataset.load('DuReader_robust-QG')
-        train_dataset = dataset_dict['train'].to_hf_dataset() \
-            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
-        eval_dataset = dataset_dict['validation'].to_hf_dataset() \
-            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
+        train_dataset = dataset_dict['train'].remap_columns({
+            'text1': 'src_txt',
+            'text2': 'tgt_txt'
+        })
+        eval_dataset = dataset_dict['validation'].remap_columns({
+            'text1': 'src_txt',
+            'text2': 'tgt_txt'
+        })
         num_warmup_steps = 200
         os.environ['LOCAL_RANK'] = '0'