From 2c05a349240aa891fc9b6fbe3eb463cdb1443172 Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc"
Date: Mon, 19 Sep 2022 21:30:31 +0800
Subject: [PATCH] [to #42322933] bug fix for fairseq

---
 modelscope/preprocessors/__init__.py          |  4 +-
 modelscope/preprocessors/nlp/__init__.py      | 46 ++++++++++++++++
 .../preprocessors/{nlp.py => nlp/nlp_base.py} | 52 ++-----------------
 .../nlp/text_error_correction.py              | 50 ++++++++++++++++++
 4 files changed, 104 insertions(+), 48 deletions(-)
 create mode 100644 modelscope/preprocessors/nlp/__init__.py
 rename modelscope/preprocessors/{nlp.py => nlp/nlp_base.py} (96%)
 create mode 100644 modelscope/preprocessors/nlp/text_error_correction.py

diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 04901dc5..ba03a35e 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -24,7 +24,8 @@ if TYPE_CHECKING:
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
         SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
         DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
-        PassageRankingPreprocessor)
+        PassageRankingPreprocessor,
+        WordSegmentationBlankSetToLabelPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
@@ -56,6 +57,7 @@ else:
             'TextErrorCorrectionPreprocessor',
             'FaqQuestionAnsweringPreprocessor',
             'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor',
+            'WordSegmentationBlankSetToLabelPreprocessor',
             'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
         ],
         'space': [
diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py
new file mode 100644
index 00000000..eee5e80f
--- /dev/null
+++ b/modelscope/preprocessors/nlp/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .text_error_correction import TextErrorCorrectionPreprocessor
+    from .nlp_base import (
+        Tokenize, SequenceClassificationPreprocessor,
+        TextGenerationPreprocessor, TokenClassificationPreprocessor,
+        SingleSentenceClassificationPreprocessor,
+        PairSentenceClassificationPreprocessor, FillMaskPreprocessor,
+        ZeroShotClassificationPreprocessor, NERPreprocessor,
+        FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor,
+        RelationExtractionPreprocessor, DocumentSegmentationPreprocessor,
+        FillMaskPoNetPreprocessor, PassageRankingPreprocessor,
+        WordSegmentationBlankSetToLabelPreprocessor)
+
+else:
+    _import_structure = {
+        'nlp_base': [
+            'Tokenize', 'SequenceClassificationPreprocessor',
+            'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
+            'SingleSentenceClassificationPreprocessor',
+            'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
+            'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
+            'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
+            'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
+            'RelationExtractionPreprocessor',
+            'WordSegmentationBlankSetToLabelPreprocessor',
+            'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
+        ],
+        'text_error_correction': [
+            'TextErrorCorrectionPreprocessor',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp/nlp_base.py
similarity index 96%
rename from modelscope/preprocessors/nlp.py
rename to modelscope/preprocessors/nlp/nlp_base.py
index e20adaa6..0a2495af 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp/nlp_base.py
@@ -6,20 +6,19 @@ import uuid
 from typing import Any, Dict, Iterable, Optional, Tuple, Union
 
 import numpy as np
-import torch
 from transformers import AutoTokenizer, BertTokenizerFast
 
 from modelscope.metainfo import Models, Preprocessors
 from modelscope.models.nlp.structbert import SbertTokenizerFast
 from modelscope.outputs import OutputKeys
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
 from modelscope.utils.config import Config, ConfigFields
 from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
 from modelscope.utils.hub import get_model_type, parse_label_mapping
 from modelscope.utils.logger import get_logger
 from modelscope.utils.nlp import import_external_nltk_data
 from modelscope.utils.type_assert import type_assert
-from .base import Preprocessor
-from .builder import PREPROCESSORS
 
 logger = get_logger()
 
@@ -30,9 +29,9 @@ __all__ = [
     'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
     'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
     'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor',
-    'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
-    'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor',
-    'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
+    'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor',
+    'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor',
+    'FillMaskPoNetPreprocessor'
 ]
 
 
@@ -889,47 +888,6 @@ class RelationExtractionPreprocessor(Preprocessor):
         }
 
 
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.text_error_correction)
-class TextErrorCorrectionPreprocessor(Preprocessor):
-    """The preprocessor used in text correction task.
-    """
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        from fairseq.data import Dictionary
-        """preprocess the data via the vocab file from the `model_dir` path
-
-        Args:
-            model_dir (str): model path
-        """
-        super().__init__(*args, **kwargs)
-        self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt'))
-
-    def __call__(self, data: str) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    '随着中国经济突飞猛近,建造工业与日俱增'
-        Returns:
-            Dict[str, Any]: the preprocessed data
-            Example:
-            {'net_input':
-                {'src_tokens':tensor([1,2,3,4]),
-                'src_lengths': tensor([4])}
-            }
-        """
-
-        text = ' '.join([x for x in data])
-        inputs = self.vocab.encode_line(
-            text, append_eos=True, add_if_not_exist=False)
-        lengths = inputs.size()
-        sample = dict()
-        sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths}
-        return sample
-
-
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor)
 class FaqQuestionAnsweringPreprocessor(Preprocessor):
diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py
new file mode 100644
index 00000000..357a946f
--- /dev/null
+++ b/modelscope/preprocessors/nlp/text_error_correction.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Any, Dict
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.text_error_correction)
+class TextErrorCorrectionPreprocessor(Preprocessor):
+    """The preprocessor used in the text error correction task.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load the source vocabulary from `dict.src.txt` under `model_dir`.
+
+        Args:
+            model_dir (str): model directory path
+        """
+        from fairseq.data import Dictionary
+        super().__init__(*args, **kwargs)
+        self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt'))
+
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """Encode a raw input sentence into fairseq model inputs.
+
+        Args:
+            data (str): a sentence
+                Example:
+                    '随着中国经济突飞猛近,建造工业与日俱增'
+        Returns:
+            Dict[str, Any]: the preprocessed data
+            Example:
+            {'net_input':
+                {'src_tokens': tensor([1, 2, 3, 4]),
+                'src_lengths': torch.Size([4])}
+            }
+        """
+        # split the sentence into characters, then encode with the vocab
+        text = ' '.join([x for x in data])
+        inputs = self.vocab.encode_line(
+            text, append_eos=True, add_if_not_exist=False)
+        lengths = inputs.size()
+        sample = dict()
+        sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths}
+        return sample
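
Note: the patch splits modelscope/preprocessors/nlp.py into a package. The
shared preprocessors move to nlp/nlp_base.py, while the fairseq-dependent
TextErrorCorrectionPreprocessor moves to its own submodule,
nlp/text_error_correction.py, behind a LazyImportModule, so that submodule is
only imported when TextErrorCorrectionPreprocessor is actually accessed. The
sketch below illustrates the general deferred-import pattern; the LazyModule
class and its behavior are an assumption modeled on the common
`_import_structure` idiom (as popularized by transformers), not modelscope's
actual LazyImportModule source.

    # minimal sketch of a lazy module (illustrative only)
    import importlib
    import types


    class LazyModule(types.ModuleType):
        """Resolve exported names to submodule imports on first access."""

        def __init__(self, name, import_structure):
            super().__init__(name)
            # map exported attribute -> submodule that defines it
            self._attr_to_submodule = {
                attr: submodule
                for submodule, attrs in import_structure.items()
                for attr in attrs
            }
            # advertise the exports without importing anything yet
            self.__all__ = list(self._attr_to_submodule)

        def __getattr__(self, attr):
            if attr not in self._attr_to_submodule:
                raise AttributeError(
                    f'module {self.__name__!r} has no attribute {attr!r}')
            # heavy dependencies (e.g. fairseq) are pulled in only here,
            # when the attribute is first requested
            submodule = importlib.import_module(
                '.' + self._attr_to_submodule[attr], self.__name__)
            return getattr(submodule, attr)

    # typical use, at the bottom of a package's __init__.py
    # (mirrors what the patched nlp/__init__.py does):
    #   import sys
    #   sys.modules[__name__] = LazyModule(__name__, _import_structure)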
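
Public entry points are unchanged: TextErrorCorrectionPreprocessor is still
importable from modelscope.preprocessors, and now also from
modelscope.preprocessors.nlp. A minimal usage sketch, assuming fairseq is
installed and `model_dir` points to a local model directory containing
dict.src.txt; the path below is a hypothetical placeholder:

    from modelscope.preprocessors.nlp import TextErrorCorrectionPreprocessor

    # hypothetical placeholder; any directory with a fairseq dict.src.txt works
    model_dir = '/path/to/text-error-correction-model'

    preprocessor = TextErrorCorrectionPreprocessor(model_dir)
    # the example sentence intentionally contains errors to be corrected
    sample = preprocessor('随着中国经济突飞猛近,建造工业与日俱增')

    # src_tokens: 1-D tensor of vocab ids (one per character, plus EOS)
    # src_lengths: the size of that tensor, as returned by Tensor.size()
    print(sample['net_input']['src_tokens'])
    print(sample['net_input']['src_lengths'])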