From 348e87e697649d7c3a233a57697b981f43240497 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 2 Dec 2022 16:57:09 +0800
Subject: [PATCH] change sequence_length to max_length

To be consistent with the other tokenizer args, change sequence_length
to max_length, while keeping the input args compatible with the old
'sequence_length' arg.
---
 .../nlp/feature_extraction_preprocessor.py    |  9 ++++++---
 .../nlp/fill_mask_preprocessor.py             | 18 ++++++++++++------
 .../nlp/sentence_embedding_preprocessor.py    |  9 ++++++---
 .../nlp/text_classification_preprocessor.py   |  9 ++++++---
 .../nlp/text_generation_preprocessor.py       | 13 ++++++-------
 .../nlp/text_ranking_preprocessor.py          |  8 +++++---
 .../nlp/token_classification_preprocessor.py  |  9 ++++++---
 .../zero_shot_classification_preprocessor.py  |  7 +++++--
 8 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py
index 249aa24c..2f7f5d14 100644
--- a/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py
+++ b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py
@@ -22,7 +22,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for feature extraction task, based on transformers' tokenizer.
@@ -30,7 +30,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length supported by the model, which
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -38,7 +38,10 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         self.second_sequence = second_sequence
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(mode)
diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
index 80ac441f..0b9597d4 100644
--- a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
+++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
@@ -111,7 +111,7 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for fill mask task, based on transformers' tokenizer.
@@ -119,13 +119,16 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length supported by the model, which
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
""" kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) super().__init__(first_sequence, second_sequence, mode) @@ -183,7 +186,7 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): first_sequence: str = None, second_sequence: str = None, mode: str = ModeKeys.INFERENCE, - sequence_length: int = 512, + max_length: int = None, use_fast: bool = None, **kwargs): """The tokenizer preprocessor used in PoNet model's MLM task. @@ -191,13 +194,16 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): Args: model_dir: The model dir used to initialize the tokenizer. use_fast: Use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 512) + kwargs.pop('sequence_length', None) kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) super().__init__(first_sequence, second_sequence, mode) diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py index ccbf3ef2..77d65dec 100644 --- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -22,7 +22,7 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): second_sequence='sentences_to_compare', mode=ModeKeys.INFERENCE, use_fast: bool = None, - sequence_length: int = 128, + max_length: int = None, **kwargs): """The preprocessor for sentence embedding task, based on transformers' tokenizer. @@ -32,13 +32,16 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): second_sequence: The key of the second sequence. mode: The mode for the preprocessor. use_fast: Use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. 
""" self.first_sequence = first_sequence self.second_sequence = second_sequence - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) model_type = None if model_dir is not None: model_type = get_model_type(model_dir) diff --git a/modelscope/preprocessors/nlp/text_classification_preprocessor.py b/modelscope/preprocessors/nlp/text_classification_preprocessor.py index 06820e6c..ef38594f 100644 --- a/modelscope/preprocessors/nlp/text_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_classification_preprocessor.py @@ -129,20 +129,23 @@ class TextClassificationTransformersPreprocessor( label: Union[str, List] = 'label', label2id: Dict = None, mode: str = ModeKeys.INFERENCE, - sequence_length: int = 128, + max_length: int = None, use_fast: bool = None, **kwargs): """The tokenizer preprocessor used in sequence classification. Args: use_fast: Whether to use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) model_type = None if model_dir is not None: model_type = get_model_type(model_dir) diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py index 2823748b..e0f8d943 100644 --- a/modelscope/preprocessors/nlp/text_generation_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -99,7 +99,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): mode: str = ModeKeys.INFERENCE, src_txt='src_txt', tgt_txt='tgt_txt', - sequence_length: int = 128, + max_length: int = None, use_fast: bool = None, **kwargs): """The tokenizer preprocessor used in text generation. @@ -109,7 +109,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): mode: The mode for the preprocessor. src_txt: The key of the source sentence. tgt_txt: The key of the generated sentence. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. use_fast: Whether to use the fast tokenizer or not. **kwargs: Extra args input into the tokenizer's __call__ method. 
@@ -121,7 +121,10 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
         kwargs['padding'] = kwargs.get('padding', 'max_length')
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      False)
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         self.src_length = kwargs['max_length']
         self.tgt_length = kwargs.pop('target_max_length', kwargs['max_length'])
         model_type = None
@@ -237,7 +240,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
                  src_txt='src_txt',
                  tgt_txt='tgt_txt',
                  use_fast: bool = None,
-                 sequence_length: int = 128,
                  **kwargs):
         """The preprocessor for text to text generation task, based on transformers' tokenizer.
 
@@ -245,8 +247,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             model_dir: The model dir used to initialize the tokenizer.
             src_txt: The key of the first sequence.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
-                will be passed into tokenizer as the 'max_length' param.
             mode: The mode for the preprocessor.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -255,7 +255,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             mode=mode,
             src_txt=src_txt,
             tgt_txt=tgt_txt,
-            sequence_length=sequence_length,
             use_fast=use_fast,
             truncation=kwargs.pop('truncation', True),
             padding=kwargs.pop('padding', 'max_length'),
diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
index 574b94ae..86d42a3e 100644
--- a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
+++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
@@ -22,7 +22,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
                  second_sequence='sentences_to_compare',
                  label='labels',
                  qid='qid',
-                 sequence_length=128,
+                 max_length=None,
                  **kwargs):
         """The tokenizer preprocessor class for the text ranking preprocessor.
 
@@ -33,7 +33,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
             label(str, `optional`): The keys of the label columns, default `labels`.
             qid(str, `optional`): The qid info.
             mode: The mode for the preprocessor.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length supported by the model, which
                 will be passed into tokenizer as the 'max_length' param.
""" super().__init__(mode) @@ -42,7 +42,9 @@ class TextRankingTransformersPreprocessor(Preprocessor): self.second_sequence = second_sequence self.label = label self.qid = qid - self.sequence_length = sequence_length + self.sequence_length = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) @type_assert(object, dict) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 1d42324d..eb94e85b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -198,14 +198,14 @@ class TokenClassificationTransformersPreprocessor( label2id: Dict = None, label_all_tokens: bool = False, mode: str = ModeKeys.INFERENCE, - sequence_length=128, + max_length=None, use_fast=None, **kwargs): """ Args: use_fast: Whether to use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. """ @@ -219,7 +219,10 @@ class TokenClassificationTransformersPreprocessor( model_type = get_model_type(model_dir) kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) kwargs['add_special_tokens'] = model_type != 'lstm' self.nlp_tokenizer = NLPTokenizerForLSTM( model_dir=model_dir, diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py index a7d87674..34b87e10 100644 --- a/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py @@ -20,7 +20,7 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor): model_dir: str, first_sequence=None, mode=ModeKeys.INFERENCE, - sequence_length=512, + max_length=None, use_fast=None, **kwargs): """preprocess the data @@ -28,7 +28,10 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor): Args: model_dir (str): model path """ - self.sequence_length = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 512) + kwargs.pop('sequence_length', None) model_type = None if model_dir is not None: model_type = get_model_type(model_dir)