From 348e87e697649d7c3a233a57697b981f43240497 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 2 Dec 2022 16:57:09 +0800
Subject: [PATCH] change sequence_length to max_length

To be consistent with the other tokenizer args, change sequence_length
to max_length, while keeping the input args compatible with the old
'sequence_length' arg.
---
 .../nlp/feature_extraction_preprocessor.py    |  9 ++++++---
 .../nlp/fill_mask_preprocessor.py             | 18 ++++++++++++------
 .../nlp/sentence_embedding_preprocessor.py    |  9 ++++++---
 .../nlp/text_classification_preprocessor.py   |  9 ++++++---
 .../nlp/text_generation_preprocessor.py       | 13 ++++++-------
 .../nlp/text_ranking_preprocessor.py          |  8 +++++---
 .../nlp/token_classification_preprocessor.py  |  9 ++++++---
 .../zero_shot_classification_preprocessor.py  |  7 +++++--
 8 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py
index 249aa24c..2f7f5d14 100644
--- a/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py
+++ b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py
@@ -22,7 +22,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for feature extraction task, based on transformers' tokenizer.
@@ -30,7 +30,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length supported by the model, which
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -38,7 +38,10 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         self.second_sequence = second_sequence
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(mode)
diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
index 80ac441f..0b9597d4 100644
--- a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
+++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
@@ -111,7 +111,7 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for fill mask task, based on transformers' tokenizer.
@@ -119,13 +119,16 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length supported by the model, which
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
""" kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) super().__init__(first_sequence, second_sequence, mode) @@ -183,7 +186,7 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): first_sequence: str = None, second_sequence: str = None, mode: str = ModeKeys.INFERENCE, - sequence_length: int = 512, + max_length: int = None, use_fast: bool = None, **kwargs): """The tokenizer preprocessor used in PoNet model's MLM task. @@ -191,13 +194,16 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): Args: model_dir: The model dir used to initialize the tokenizer. use_fast: Use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 512) + kwargs.pop('sequence_length', None) kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) super().__init__(first_sequence, second_sequence, mode) diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py index ccbf3ef2..77d65dec 100644 --- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -22,7 +22,7 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): second_sequence='sentences_to_compare', mode=ModeKeys.INFERENCE, use_fast: bool = None, - sequence_length: int = 128, + max_length: int = None, **kwargs): """The preprocessor for sentence embedding task, based on transformers' tokenizer. @@ -32,13 +32,16 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): second_sequence: The key of the second sequence. mode: The mode for the preprocessor. use_fast: Use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. 
""" self.first_sequence = first_sequence self.second_sequence = second_sequence - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) model_type = None if model_dir is not None: model_type = get_model_type(model_dir) diff --git a/modelscope/preprocessors/nlp/text_classification_preprocessor.py b/modelscope/preprocessors/nlp/text_classification_preprocessor.py index 06820e6c..ef38594f 100644 --- a/modelscope/preprocessors/nlp/text_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_classification_preprocessor.py @@ -129,20 +129,23 @@ class TextClassificationTransformersPreprocessor( label: Union[str, List] = 'label', label2id: Dict = None, mode: str = ModeKeys.INFERENCE, - sequence_length: int = 128, + max_length: int = None, use_fast: bool = None, **kwargs): """The tokenizer preprocessor used in sequence classification. Args: use_fast: Whether to use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) model_type = None if model_dir is not None: model_type = get_model_type(model_dir) diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py index 2823748b..e0f8d943 100644 --- a/modelscope/preprocessors/nlp/text_generation_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -99,7 +99,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): mode: str = ModeKeys.INFERENCE, src_txt='src_txt', tgt_txt='tgt_txt', - sequence_length: int = 128, + max_length: int = None, use_fast: bool = None, **kwargs): """The tokenizer preprocessor used in text generation. @@ -109,7 +109,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): mode: The mode for the preprocessor. src_txt: The key of the source sentence. tgt_txt: The key of the generated sentence. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. use_fast: Whether to use the fast tokenizer or not. **kwargs: Extra args input into the tokenizer's __call__ method. 
@@ -121,7 +121,10 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
         kwargs['padding'] = kwargs.get('padding', 'max_length')
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      False)
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         self.src_length = kwargs['max_length']
         self.tgt_length = kwargs.pop('target_max_length', kwargs['max_length'])
         model_type = None
@@ -237,7 +240,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
                  src_txt='src_txt',
                  tgt_txt='tgt_txt',
                  use_fast: bool = None,
-                 sequence_length: int = 128,
                  **kwargs):
         """The preprocessor for text to text generation task, based on transformers' tokenizer.
 
@@ -245,8 +247,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             model_dir: The model dir used to initialize the tokenizer.
             src_txt: The key of the first sequence.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
-                will be passed into tokenizer as the 'max_length' param.
             mode: The mode for the preprocessor.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -255,7 +255,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             mode=mode,
             src_txt=src_txt,
             tgt_txt=tgt_txt,
-            sequence_length=sequence_length,
             use_fast=use_fast,
             truncation=kwargs.pop('truncation', True),
             padding=kwargs.pop('padding', 'max_length'),
diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
index 574b94ae..86d42a3e 100644
--- a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
+++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
@@ -22,7 +22,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
                  second_sequence='sentences_to_compare',
                  label='labels',
                  qid='qid',
-                 sequence_length=128,
+                 max_length=None,
                  **kwargs):
         """The tokenizer preprocessor class for the text ranking preprocessor.
 
@@ -33,7 +33,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
             label(str, `optional`): The keys of the label columns, default `labels`.
             qid(str, `optional`): The qid info.
             mode: The mode for the preprocessor.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length supported by the model, which
                 will be passed into tokenizer as the 'max_length' param.
""" super().__init__(mode) @@ -42,7 +42,9 @@ class TextRankingTransformersPreprocessor(Preprocessor): self.second_sequence = second_sequence self.label = label self.qid = qid - self.sequence_length = sequence_length + self.sequence_length = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) @type_assert(object, dict) diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 1d42324d..eb94e85b 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -198,14 +198,14 @@ class TokenClassificationTransformersPreprocessor( label2id: Dict = None, label_all_tokens: bool = False, mode: str = ModeKeys.INFERENCE, - sequence_length=128, + max_length=None, use_fast=None, **kwargs): """ Args: use_fast: Whether to use the fast tokenizer or not. - sequence_length: The max sequence length which the model supported, + max_length: The max sequence length which the model supported, will be passed into tokenizer as the 'max_length' param. **kwargs: Extra args input into the tokenizer's __call__ method. """ @@ -219,7 +219,10 @@ class TokenClassificationTransformersPreprocessor( model_type = get_model_type(model_dir) kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) kwargs['add_special_tokens'] = model_type != 'lstm' self.nlp_tokenizer = NLPTokenizerForLSTM( model_dir=model_dir, diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py index a7d87674..34b87e10 100644 --- a/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py @@ -20,7 +20,7 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor): model_dir: str, first_sequence=None, mode=ModeKeys.INFERENCE, - sequence_length=512, + max_length=None, use_fast=None, **kwargs): """preprocess the data @@ -28,7 +28,10 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor): Args: model_dir (str): model path """ - self.sequence_length = sequence_length + kwargs[ + 'max_length'] = max_length if max_length is not None else kwargs.get( + 'sequence_length', 512) + kwargs.pop('sequence_length', None) model_type = None if model_dir is not None: model_type = get_model_type(model_dir)