1. Rework the code according to an earlier code review (CR)
2. Refactor sbert's model configs
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9643861
master
@@ -53,7 +53,7 @@ from .configuration_sbert import SbertConfig
 logger = get_logger(__name__)
-_CHECKPOINT_FOR_DOC = 'chinese_sbert-large-std-512'
+_CHECKPOINT_FOR_DOC = 'nlp_structbert_backbone_base_std'
 _CONFIG_FOR_DOC = 'SbertConfig'
 _TOKENIZER_FOR_DOC = 'SbertTokenizer'
@@ -32,8 +32,10 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
 PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
 }
 PRETRAINED_INIT_CONFIGURATION = {
@@ -38,8 +38,10 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
 }
 PRETRAINED_INIT_CONFIGURATION = {
@@ -37,7 +37,8 @@ class FillMaskPipeline(Pipeline):
             preprocessor = FillMaskPreprocessor(
                 fill_mask_model.model_dir,
                 first_sequence=first_sequence,
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
         fill_mask_model.eval()
         super().__init__(
             model=fill_mask_model, preprocessor=preprocessor, **kwargs)
@@ -26,7 +26,9 @@ class NamedEntityRecognitionPipeline(Pipeline):
         model = model if isinstance(model,
                                     Model) else Model.from_pretrained(model)
         if preprocessor is None:
-            preprocessor = NERPreprocessor(model.model_dir)
+            preprocessor = NERPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.tokenizer = preprocessor.tokenizer
@@ -33,5 +33,6 @@ class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase):
             preprocessor = PairSentenceClassificationPreprocessor(
                 model.model_dir if isinstance(model, Model) else model,
                 first_sequence=first_sequence,
-                second_sequence=second_sequence)
+                second_sequence=second_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
@@ -37,7 +37,8 @@ class SequenceClassificationPipeline(Pipeline):
             preprocessor = SequenceClassificationPreprocessor(
                 sc_model.model_dir,
                 first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 512))
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
         assert hasattr(self.model, 'id2label'), \
@@ -31,5 +31,6 @@ class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase):
         if preprocessor is None:
             preprocessor = SingleSentenceClassificationPreprocessor(
                 model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence)
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
@@ -32,7 +32,8 @@ class TextGenerationPipeline(Pipeline):
             preprocessor = TextGenerationPreprocessor(
                 model.model_dir,
                 first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
@@ -31,7 +31,9 @@ class WordSegmentationPipeline(Pipeline):
         model = model if isinstance(model,
                                     Model) else Model.from_pretrained(model)
         if preprocessor is None:
-            preprocessor = TokenClassificationPreprocessor(model.model_dir)
+            preprocessor = TokenClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.id2label = kwargs.get('id2label')
@@ -36,7 +36,9 @@ class ZeroShotClassificationPipeline(Pipeline):
         self.entailment_id = 0
         self.contradiction_id = 2
         if preprocessor is None:
-            preprocessor = ZeroShotClassificationPreprocessor(model.model_dir)
+            preprocessor = ZeroShotClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
@@ -216,7 +216,7 @@ class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
     def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
         super().__init__(model_dir, pair=True, mode=mode, **kwargs)
@@ -228,7 +228,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
         super().__init__(model_dir, pair=False, mode=mode, **kwargs)
@@ -309,7 +309,7 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
         return super().build_tokenizer(model_dir)

     def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
-        if self._mode == 'inference':
+        if self._mode == ModeKeys.INFERENCE:
             return super().__call__(data)
         src_txt = data['src_txt']
         tgt_txt = data['tgt_txt']
@@ -420,6 +420,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
         elif isinstance(data, dict):
             text_a = data.get(self.first_sequence)
             labels_list = data.get(self.label)
+        text_a = text_a.replace(' ', '').strip()
         tokenized_inputs = self.tokenizer(
             text_a,
             return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level
 class SentimentClassificationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-tiny'
     sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音'

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')