From c7a19c9c1f38442523251ccab4981b44669d05b5 Mon Sep 17 00:00:00 2001
From: suluyan
Date: Mon, 20 Jun 2022 16:17:31 +0800
Subject: [PATCH] fix comments: rename and refactor AliceMindMLM; adjust pipeline

---
 .../models/nlp/masked_language_model.py       | 24 ++++---
 .../pipelines/nlp/fill_mask_pipeline.py       | 63 ++++++++++++-------
 tests/pipelines/test_fill_mask.py             |  8 +--
 3 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/modelscope/models/nlp/masked_language_model.py b/modelscope/models/nlp/masked_language_model.py
index cb12a4dd..848d7484 100644
--- a/modelscope/models/nlp/masked_language_model.py
+++ b/modelscope/models/nlp/masked_language_model.py
@@ -6,12 +6,12 @@ from ...utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
-__all__ = ['MaskedLanguageModel']
+__all__ = [
+    'StructBertForMaskedLM', 'VecoForMaskedLM', 'AliceMindBaseForMaskedLM'
+]
 
 
-@MODELS.register_module(Tasks.fill_mask, module_name=r'sbert')
-@MODELS.register_module(Tasks.fill_mask, module_name=r'veco')
-class MaskedLanguageModel(Model):
+class AliceMindBaseForMaskedLM(Model):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         from sofa.utils.backend import AutoConfig, AutoModelForMaskedLM
@@ -30,15 +30,19 @@ class MaskedLanguageModel(Model):
 
         Returns:
             Dict[str, np.ndarray]: results
-                Example:
-                    {
-                        'predictions': array([1]), # lable 0-negative 1-positive
-                        'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
-                        'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
-                    }
         """
         rst = self.model(
             input_ids=inputs['input_ids'],
             attention_mask=inputs['attention_mask'],
             token_type_ids=inputs['token_type_ids'])
         return {'logits': rst['logits'], 'input_ids': inputs['input_ids']}
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=r'sbert')
+class StructBertForMaskedLM(AliceMindBaseForMaskedLM):
+    pass
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=r'veco')
+class VecoForMaskedLM(AliceMindBaseForMaskedLM):
+    pass
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index 14b1d317..abe5b5b5 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -1,7 +1,7 @@
 from typing import Dict, Optional
 
 from modelscope.models import Model
-from modelscope.models.nlp import MaskedLanguageModel
+from modelscope.models.nlp import AliceMindBaseForMaskedLM
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
 from ..base import Pipeline, Tensor
@@ -15,20 +15,20 @@ __all__ = ['FillMaskPipeline']
 class FillMaskPipeline(Pipeline):
 
     def __init__(self,
-                 model: MaskedLanguageModel,
+                 model: AliceMindBaseForMaskedLM,
                  preprocessor: Optional[FillMaskPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction
 
         Args:
-            model (MaskedLanguageModel): a model instance
+            model (AliceMindBaseForMaskedLM): a model instance
             preprocessor (FillMaskPreprocessor): a preprocessor instance
         """
-        sc_model = model if isinstance(
-            model, MaskedLanguageModel) else Model.from_pretrained(model)
+        fill_mask_model = model if isinstance(
+            model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
         if preprocessor is None:
             preprocessor = FillMaskPreprocessor(
-                sc_model.model_dir,
+                fill_mask_model.model_dir,
                 first_sequence='sentence',
                 second_sequence=None)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
@@ -36,6 +36,27 @@ class FillMaskPipeline(Pipeline):
         self.tokenizer = preprocessor.tokenizer
         self.mask_id = {'veco': 250001, 'sbert': 103}
 
+        self.rep_map = {
+            'sbert': {
+                '[unused0]': '',
+                '[PAD]': '',
+                '[unused1]': '',
+                r' +': ' ',
+                '[SEP]': '',
+                '[unused2]': '',
+                '[CLS]': '',
+                '[UNK]': ''
+            },
+            'veco': {
+                r' +': ' ',
+                '': '',
+                '': '',
+                '': '',
+                '': '',
+                '': ' '
+            }
+        }
+
     def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """process the prediction results
 
@@ -49,25 +70,23 @@ class FillMaskPipeline(Pipeline):
         logits = inputs['logits'].detach().numpy()
         input_ids = inputs['input_ids'].detach().numpy()
         pred_ids = np.argmax(logits, axis=-1)
-        rst_ids = np.where(
-            input_ids == self.mask_id[self.model.config.model_type], pred_ids,
-            input_ids)
+        model_type = self.model.config.model_type
+        rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids,
+                           input_ids)
+
+        def rep_tokens(string, rep_map):
+            for k, v in rep_map.items():
+                string = string.replace(k, v)
+            return string.strip()
+
         pred_strings = []
-        for ids in rst_ids:
-            if self.model.config.model_type == 'veco':
-                pred_string = self.tokenizer.decode(ids).split(
-                    '')[0].replace('',
-                                   '').replace('',
-                                               '').replace('', '')
-            elif self.model.config.vocab_size == 21128:  # zh bert
+        for ids in rst_ids:  # batch
+            if self.model.config.vocab_size == 21128:  # zh bert
                 pred_string = self.tokenizer.convert_ids_to_tokens(ids)
-                pred_string = ''.join(pred_string).replace('##', '')
-                pred_string = pred_string.split('[SEP]')[0].replace(
-                    '[CLS]', '').replace('[SEP]', '').replace('[UNK]', '')
-            else:  # en bert
+                pred_string = ''.join(pred_string)
+            else:
                 pred_string = self.tokenizer.decode(ids)
-                pred_string = pred_string.split('[SEP]')[0].replace(
-                    '[CLS]', '').replace('[SEP]', '').replace('[UNK]', '')
+            pred_string = rep_tokens(pred_string, self.rep_map[model_type])
             pred_strings.append(pred_string)
 
         return {'text': pred_strings}
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 293608e0..a4d53403 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -6,7 +6,7 @@ import unittest
 from maas_hub.snapshot_download import snapshot_download
 
 from modelscope.models import Model
-from modelscope.models.nlp import MaskedLanguageModel
+from modelscope.models.nlp import StructBertForMaskedLM, VecoForMaskedLM
 from modelscope.pipelines import FillMaskPipeline, pipeline
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
@@ -39,14 +39,14 @@ class FillMaskTest(unittest.TestCase):
         '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.'
     }
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
         # sbert
         for language in ['zh', 'en']:
             model_dir = snapshot_download(self.model_id_sbert[language])
             preprocessor = FillMaskPreprocessor(
                 model_dir, first_sequence='sentence', second_sequence=None)
-            model = MaskedLanguageModel(model_dir)
+            model = StructBertForMaskedLM(model_dir)
             pipeline1 = FillMaskPipeline(model, preprocessor)
             pipeline2 = pipeline(
                 Tasks.fill_mask, model=model, preprocessor=preprocessor)
@@ -61,7 +61,7 @@ class FillMaskTest(unittest.TestCase):
         model_dir = snapshot_download(self.model_id_veco)
         preprocessor = FillMaskPreprocessor(
             model_dir, first_sequence='sentence', second_sequence=None)
-        model = MaskedLanguageModel(model_dir)
+        model = VecoForMaskedLM(model_dir)
         pipeline1 = FillMaskPipeline(model, preprocessor)
         pipeline2 = pipeline(
             Tasks.fill_mask, model=model, preprocessor=preprocessor)