To cooperate with other tokenizing args, change 'sequence_length' to 'max_length', while keeping the input args compatible with the old 'sequence_length' arg.
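Every hunk below applies the same compatibility idiom: prefer the new 'max_length' argument, fall back to a legacy 'sequence_length' kwarg if one was supplied, then pop 'sequence_length' so the stale key never reaches the tokenizer's __call__. A minimal standalone sketch of that idiom, assuming only the per-class default differs (the helper name is illustrative, not part of the patch):

def resolve_max_length(max_length, kwargs, default=128):
    # Illustrative helper: the new-style 'max_length' wins; otherwise fall
    # back to the legacy 'sequence_length' kwarg, else the class default.
    kwargs['max_length'] = (max_length if max_length is not None
                            else kwargs.get('sequence_length', default))
    # Drop the legacy key so it is not forwarded to the tokenizer, whose
    # __call__ expects 'max_length', not 'sequence_length'.
    kwargs.pop('sequence_length', None)
    return kwargs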
@@ -22,7 +22,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for feature extraction task, based on transformers' tokenizer.
@@ -30,7 +30,7 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -38,7 +38,10 @@ class FeatureExtractionTransformersPreprocessor(Preprocessor):
         self.second_sequence = second_sequence
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(mode)
@@ -111,7 +111,7 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The preprocessor for fill mask task, based on transformers' tokenizer.
@@ -119,13 +119,16 @@ class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(first_sequence, second_sequence, mode)
@@ -183,7 +186,7 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase):
                  first_sequence: str = None,
                  second_sequence: str = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 512,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The tokenizer preprocessor used in PoNet model's MLM task.
@@ -191,13 +194,16 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase):
         Args:
             model_dir: The model dir used to initialize the tokenizer.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 512)
+        kwargs.pop('sequence_length', None)
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      True)
         super().__init__(first_sequence, second_sequence, mode)
@@ -22,7 +22,7 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor):
                  second_sequence='sentences_to_compare',
                  mode=ModeKeys.INFERENCE,
                  use_fast: bool = None,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  **kwargs):
         """The preprocessor for sentence embedding task, based on transformers' tokenizer.
@@ -32,13 +32,16 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor):
             second_sequence: The key of the second sequence.
             mode: The mode for the preprocessor.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         self.first_sequence = first_sequence
         self.second_sequence = second_sequence
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         model_type = None
         if model_dir is not None:
             model_type = get_model_type(model_dir)
@@ -129,20 +129,23 @@ class TextClassificationTransformersPreprocessor(
                  label: Union[str, List] = 'label',
                  label2id: Dict = None,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The tokenizer preprocessor used in sequence classification.

         Args:
             use_fast: Whether to use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         model_type = None
         if model_dir is not None:
             model_type = get_model_type(model_dir)
@@ -99,7 +99,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
                  mode: str = ModeKeys.INFERENCE,
                  src_txt='src_txt',
                  tgt_txt='tgt_txt',
-                 sequence_length: int = 128,
+                 max_length: int = None,
                  use_fast: bool = None,
                  **kwargs):
         """The tokenizer preprocessor used in text generation.
@@ -109,7 +109,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
             mode: The mode for the preprocessor.
             src_txt: The key of the source sentence.
             tgt_txt: The key of the generated sentence.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             use_fast: Whether to use the fast tokenizer or not.
             **kwargs: Extra args input into the tokenizer's __call__ method.
@@ -121,7 +121,10 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
         kwargs['padding'] = kwargs.get('padding', 'max_length')
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                      False)
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         self.src_length = kwargs['max_length']
         self.tgt_length = kwargs.pop('target_max_length', kwargs['max_length'])
         model_type = None
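In the text generation hunk just above, the resolved 'max_length' also seeds both generation lengths: src_length takes it directly, while the pre-existing 'target_max_length' kwarg (visible in the context lines) can still override the target side. A short sketch of that behavior, with hypothetical values:

kwargs = {'max_length': 128, 'target_max_length': 64}
src_length = kwargs['max_length']                                    # 128
tgt_length = kwargs.pop('target_max_length', kwargs['max_length'])  # 64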
@@ -237,7 +240,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
                  src_txt='src_txt',
                  tgt_txt='tgt_txt',
                  use_fast: bool = None,
-                 sequence_length: int = 128,
                  **kwargs):
         """The preprocessor for text to text generation task, based on transformers' tokenizer.
@@ -245,8 +247,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             model_dir: The model dir used to initialize the tokenizer.
             src_txt: The key of the first sequence.
             use_fast: Use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
-                will be passed into tokenizer as the 'max_length' param.
             mode: The mode for the preprocessor.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -255,7 +255,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):
             mode=mode,
             src_txt=src_txt,
             tgt_txt=tgt_txt,
-            sequence_length=sequence_length,
             use_fast=use_fast,
             truncation=kwargs.pop('truncation', True),
             padding=kwargs.pop('padding', 'max_length'),
@@ -22,7 +22,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
                  second_sequence='sentences_to_compare',
                  label='labels',
                  qid='qid',
-                 sequence_length=128,
+                 max_length=None,
                  **kwargs):
         """The tokenizer preprocessor class for the text ranking preprocessor.
@@ -33,7 +33,7 @@ class TextRankingTransformersPreprocessor(Preprocessor):
             label(str, `optional`): The keys of the label columns, default `labels`.
             qid(str, `optional`): The qid info.
             mode: The mode for the preprocessor.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
         """
         super().__init__(mode)
@@ -42,7 +42,9 @@ class TextRankingTransformersPreprocessor(Preprocessor):
         self.second_sequence = second_sequence
         self.label = label
         self.qid = qid
-        self.sequence_length = sequence_length
+        self.sequence_length = max_length if max_length is not None else kwargs.get(
+            'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)

     @type_assert(object, dict)
@@ -198,14 +198,14 @@ class TokenClassificationTransformersPreprocessor(
                  label2id: Dict = None,
                  label_all_tokens: bool = False,
                  mode: str = ModeKeys.INFERENCE,
-                 sequence_length=128,
+                 max_length=None,
                  use_fast=None,
                  **kwargs):
         """
         Args:
             use_fast: Whether to use the fast tokenizer or not.
-            sequence_length: The max sequence length which the model supported,
+            max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
@@ -219,7 +219,10 @@ class TokenClassificationTransformersPreprocessor(
             model_type = get_model_type(model_dir)
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 128)
+        kwargs.pop('sequence_length', None)
         kwargs['add_special_tokens'] = model_type != 'lstm'
         self.nlp_tokenizer = NLPTokenizerForLSTM(
             model_dir=model_dir,
@@ -20,7 +20,7 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor):
                  model_dir: str,
                  first_sequence=None,
                  mode=ModeKeys.INFERENCE,
-                 sequence_length=512,
+                 max_length=None,
                  use_fast=None,
                  **kwargs):
         """preprocess the data
@@ -28,7 +28,10 @@ class ZeroShotClassificationTransformersPreprocessor(Preprocessor):
         Args:
             model_dir (str): model path
         """
-        self.sequence_length = sequence_length
+        kwargs[
+            'max_length'] = max_length if max_length is not None else kwargs.get(
+                'sequence_length', 512)
+        kwargs.pop('sequence_length', None)
         model_type = None
         if model_dir is not None:
             model_type = get_model_type(model_dir)
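Taken together, old and new call sites should resolve to the same tokenizer argument. A hedged usage sketch, assuming a valid local checkpoint (the import path and model directory are hypothetical; adjust to your tree):

# Import path assumed; the class is the one patched above.
from modelscope.preprocessors.nlp import FillMaskTransformersPreprocessor

# Hypothetical local checkpoint directory.
model_dir = '/path/to/checkpoint'

# New-style call site.
p_new = FillMaskTransformersPreprocessor(model_dir, max_length=256)

# Legacy call site still works: 'sequence_length' arrives via **kwargs,
# is rewritten to 'max_length', and the stale key is popped before the
# tokenizer's __call__ sees it.
p_old = FillMaskTransformersPreprocessor(model_dir, sequence_length=256)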