
change sequence_length to max_length

To align with the other tokenizer arguments, rename sequence_length to max_length, while keeping the inputs backward compatible with the old 'sequence_length' argument.
yuze.zyz · 3 years ago · parent: master^2 · commit: 348e87e697
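
Each of the eight files applies the same resolution pattern: prefer the new max_length argument, fall back to a legacy 'sequence_length' key that older callers may still pass through kwargs, and then pop that key so it is not forwarded to the tokenizer's __call__ method. A minimal standalone sketch of the pattern (resolve_max_length is an illustrative helper name, not part of this commit):

    def resolve_max_length(max_length=None, default=128, **kwargs):
        # Prefer the new argument; otherwise honor the legacy
        # 'sequence_length' kwarg; otherwise fall back to the default.
        resolved = max_length if max_length is not None else kwargs.get(
            'sequence_length', default)
        # Drop the legacy key so it never reaches the tokenizer, whose
        # __call__ method does not accept a 'sequence_length' argument.
        kwargs.pop('sequence_length', None)
        return resolved, kwargs

    # Old and new call styles resolve to the same value:
    assert resolve_max_length(sequence_length=256)[0] == 256  # legacy caller
    assert resolve_max_length(max_length=256)[0] == 256       # new caller
    assert resolve_max_length()[0] == 128                     # default

The per-file defaults differ: most preprocessors keep 128, while the PoNet fill-mask and zero-shot classification preprocessors keep 512.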
8 changed files with 52 additions and 30 deletions
  1. +6  -3  modelscope/preprocessors/nlp/feature_extraction_preprocessor.py
  2. +12 -6  modelscope/preprocessors/nlp/fill_mask_preprocessor.py
  3. +6  -3  modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py
  4. +6  -3  modelscope/preprocessors/nlp/text_classification_preprocessor.py
  5. +6  -7  modelscope/preprocessors/nlp/text_generation_preprocessor.py
  6. +5  -3  modelscope/preprocessors/nlp/text_ranking_preprocessor.py
  7. +6  -3  modelscope/preprocessors/nlp/token_classification_preprocessor.py
  8. +5  -2  modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py

modelscope/preprocessors/nlp/feature_extraction_preprocessor.py  (+6 -3)

@@ -22,7 +22,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for feature extraction task, based on transformers' tokenizer.
@@ -30,7 +30,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -38,7 +38,10 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         self.second_sequence = second_sequence
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(mode)


modelscope/preprocessors/nlp/fill_mask_preprocessor.py  (+12 -6)

@@ -111,7 +111,7 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for fill mask task, based on transformers' tokenizer.
@@ -119,13 +119,16 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(first_sequence, second_sequence, mode)
@@ -183,7 +186,7 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 512,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The tokenizer preprocessor used in PoNet model's MLM task.
@@ -191,13 +194,16 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 512)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(first_sequence, second_sequence, mode)


modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py  (+6 -3)

@@ -22,7 +22,7 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor):
                  second_sequence='sentences_to_compare',
                  mode=ModeKeys.INFERENCE,
                  use_fast: bool = None,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  **kwargs):
         """The preprocessor for sentence embedding task, based on transformers' tokenizer.

@@ -32,13 +32,16 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor):
             second_sequence: The key of the second sequence.
             mode: The mode for the preprocessor.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         self.first_sequence = first_sequence
         self.second_sequence = second_sequence
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         model_type = None
         if model_dir is not None:
             model_type = get_model_type(model_dir)


modelscope/preprocessors/nlp/text_classification_preprocessor.py  (+6 -3)

@@ -129,20 +129,23 @@ class TextClassificationTransformersPreprocessor(
                  label: Union[str, List] = 'label',
                  label2id: Dict = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The tokenizer preprocessor used in sequence classification.

         Args:
             use_fast: Whether to use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         model_type = None
         if model_dir is not None:
             model_type = get_model_type(model_dir)


modelscope/preprocessors/nlp/text_generation_preprocessor.py  (+6 -7)

@@ -99,7 +99,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
                  mode: str = ModeKeys.INFERENCE,
                  src_txt='src_txt',
                  tgt_txt='tgt_txt',
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The tokenizer preprocessor used in text generation.
@@ -109,7 +109,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
             mode: The mode for the preprocessor.
             src_txt: The key of the source sentence.
             tgt_txt: The key of the generated sentence.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             use_fast: Whether to use the fast tokenizer or not.
             **kwargs: Extra args input into the tokenizer's __call__ method.
@@ -121,7 +121,10 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
         kwargs['padding'] = kwargs.get('padding', 'max_length')
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      False)
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         self.src_length = kwargs['max_length']
         self.tgt_length = kwargs.pop('target_max_length', kwargs['max_length'])
         model_type = None
@@ -237,7 +240,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
                  src_txt='src_txt',
                  tgt_txt='tgt_txt',
                  use_fast: bool = None,
-                 sequence_length: int = 128,
                  **kwargs):
         """The preprocessor for text to text generation task, based on transformers' tokenizer.

@@ -245,8 +247,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             model_dir: The model dir used to initialize the tokenizer.
             src_txt: The key of the first sequence.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
-                will be passed into tokenizer as the 'max_length' param.
             mode: The mode for the preprocessor.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -255,7 +255,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             mode=mode,
             src_txt=src_txt,
             tgt_txt=tgt_txt,
-            sequence_length=sequence_length,
             use_fast=use_fast,
             truncation=kwargs.pop('truncation', True),
             padding=kwargs.pop('padding', 'max_length'),
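
In the text generation preprocessor the resolved value also feeds the derived src_length and tgt_length attributes, with tgt_length separately overridable via a target_max_length kwarg. A hedged usage sketch (the model directory is a placeholder requiring a real checkpoint on disk, and model_dir is assumed to be the first positional argument, as in the sibling preprocessors):

    from modelscope.preprocessors.nlp.text_generation_preprocessor import \
        TextGenerationTransformersPreprocessor

    # Both call styles should resolve to max_length == 256 after this
    # commit; the legacy kwarg keeps working for existing callers.
    p_new = TextGenerationTransformersPreprocessor(
        '/path/to/model', max_length=256)
    p_old = TextGenerationTransformersPreprocessor(
        '/path/to/model', sequence_length=256)
    assert p_new.src_length == p_old.src_length == 256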


modelscope/preprocessors/nlp/text_ranking_preprocessor.py  (+5 -3)

@@ -22,7 +22,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
                  second_sequence='sentences_to_compare',
                  label='labels',
                  qid='qid',
-                 sequence_length=128,
+                 max_length=None,
                  **kwargs):
         """The tokenizer preprocessor class for the text ranking preprocessor.

@@ -33,7 +33,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
             label(str, `optional`): The keys of the label columns, default `labels`.
             qid(str, `optional`): The qid info.
             mode: The mode for the preprocessor.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
         """
         super().__init__(mode)
@@ -42,7 +42,9 @@ class TextRankingTransformersPreprocessor(Preprocessor):
         self.second_sequence = second_sequence
         self.label = label
         self.qid = qid
-        self.sequence_length = sequence_length
+        self.sequence_length = max_length if max_length is not None else kwargs.get(
+            'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)

     @type_assert(object, dict)


modelscope/preprocessors/nlp/token_classification_preprocessor.py  (+6 -3)

@@ -198,14 +198,14 @@ class TokenClassificationTransformersPreprocessor(
                  label2id: Dict = None,
                  label_all_tokens: bool = False,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length=128,
+                 max_length=None,
                  use_fast=None,
                  **kwargs):
         """

         Args:
             use_fast: Whether to use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -219,7 +219,10 @@ class TokenClassificationTransformersPreprocessor(
             model_type = get_model_type(model_dir)
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['add_special_tokens'] = model_type != 'lstm'
         self.nlp_tokenizer = NLPTokenizerForLSTM(
             model_dir=model_dir,


modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py  (+5 -2)

@@ -20,7 +20,7 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor):
                  model_dir: str,
                  first_sequence=None,
                  mode=ModeKeys.INFERENCE,
-                 sequence_length=512,
+                 max_length=None,
                  use_fast=None,
                  **kwargs):
         """preprocess the data
@@ -28,7 +28,10 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor):
         Args:
             model_dir (str): model path
         """
-        self.sequence_length = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 512)
+        kwargs.pop('sequence_length', None)
         model_type = None
         if model_dir is not None:
             model_type = get_model_type(model_dir)

