diff --git a/modelscope/models/nlp/masked_language_model.py b/modelscope/models/nlp/masked_language_model.py
index fd5f97e6..a760822b 100644
--- a/modelscope/models/nlp/masked_language_model.py
+++ b/modelscope/models/nlp/masked_language_model.py
@@ -2,24 +2,28 @@ from typing import Any, Dict, Optional, Union
 
 import numpy as np
 
-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
-__all__ = ['StructBertForMaskedLM', 'VecoForMaskedLM']
+__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM']
 
 
-class AliceMindBaseForMaskedLM(Model):
+class MaskedLanguageModelBase(Model):
 
     def __init__(self, model_dir: str, *args, **kwargs):
-        from sofa.utils.backend import AutoConfig, AutoModelForMaskedLM
-        self.model_dir = model_dir
         super().__init__(model_dir, *args, **kwargs)
+        self.model = self.build_model()
 
-        self.config = AutoConfig.from_pretrained(model_dir)
-        self.model = AutoModelForMaskedLM.from_pretrained(
-            model_dir, config=self.config)
+    def build_model(self):
+        raise NotImplementedError()
+
+    @property
+    def config(self):
+        if hasattr(self.model, 'config'):
+            return self.model.config
+        return None
 
     def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
         """return the result by the model
@@ -38,14 +42,24 @@ class AliceMindBaseForMaskedLM(Model):
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert)
-class StructBertForMaskedLM(AliceMindBaseForMaskedLM):
-    # The StructBert for MaskedLM uses the same underlying model structure
-    # as the base model class.
-    pass
+class StructBertForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from sofa import SbertForMaskedLM
+        return SbertForMaskedLM.from_pretrained(self.model_dir)
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.veco)
-class VecoForMaskedLM(AliceMindBaseForMaskedLM):
-    # The Veco for MaskedLM uses the same underlying model structure
-    # as the base model class.
-    pass
+class VecoForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from sofa import VecoForMaskedLM
+        return VecoForMaskedLM.from_pretrained(self.model_dir)
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert)
+class BertForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from transformers import BertForMaskedLM
+        return BertForMaskedLM.from_pretrained(self.model_dir)
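For context, this file replaces sofa's AutoModelForMaskedLM reflection with a template-method pattern: the base class owns construction and each registered subclass picks its backbone in build_model(). A minimal self-contained sketch of the same pattern (class names here are illustrative, not modelscope API):

class LanguageModelBase:

    def __init__(self, model_dir: str):
        self.model_dir = model_dir
        # defer the backbone choice to the subclass hook
        self.model = self.build_model()

    def build_model(self):
        raise NotImplementedError()

    @property
    def config(self):
        # mirror the diff: surface the backbone config when one exists
        return getattr(self.model, 'config', None)

class ToyMaskedLM(LanguageModelBase):

    def build_model(self):
        return object()  # stand-in for SbertForMaskedLM.from_pretrained(...)

Each subclass stays a thin factory, so adding a new backbone (as BertForMaskedLM does here) touches no shared code.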
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index 863d9a6d..1567ef9d 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -1,32 +1,34 @@
+import os
 from typing import Dict, Optional, Union
 
-from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp.masked_language_model import \
-    AliceMindBaseForMaskedLM
-from modelscope.preprocessors import FillMaskPreprocessor
-from modelscope.utils.constant import Tasks
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp.masked_language_model import MaskedLanguageModelBase
+from ...preprocessors import FillMaskPreprocessor
+from ...utils.config import Config
+from ...utils.constant import ModelFile, Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
 
 __all__ = ['FillMaskPipeline']
 
+_type_map = {'veco': 'roberta', 'sbert': 'bert'}
 
 
 @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
 class FillMaskPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[AliceMindBaseForMaskedLM, str],
+                 model: Union[MaskedLanguageModelBase, str],
                  preprocessor: Optional[FillMaskPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction
 
         Args:
-            model (AliceMindBaseForMaskedLM): a model instance
+            model (MaskedLanguageModelBase): a model instance
             preprocessor (FillMaskPreprocessor): a preprocessor instance
         """
         fill_mask_model = model if isinstance(
-            model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
+            model, MaskedLanguageModelBase) else Model.from_pretrained(model)
 
         if preprocessor is None:
             preprocessor = FillMaskPreprocessor(
                 fill_mask_model.model_dir,
@@ -34,11 +36,13 @@ class FillMaskPipeline(Pipeline):
                 second_sequence=None)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.preprocessor = preprocessor
+        self.config = Config.from_file(
+            os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
         self.tokenizer = preprocessor.tokenizer
-        self.mask_id = {'veco': 250001, 'sbert': 103}
+        self.mask_id = {'roberta': 250001, 'bert': 103}
 
         self.rep_map = {
-            'sbert': {
+            'bert': {
                 '[unused0]': '',
                 '[PAD]': '',
                 '[unused1]': '',
@@ -48,7 +52,7 @@ class FillMaskPipeline(Pipeline):
                 '[SEP]': '',
                 '[CLS]': '',
                 '[UNK]': ''
             },
-            'veco': {
+            'roberta': {
                 r' +': ' ',
                 '<s>': '',
                 '</s>': '',
@@ -72,7 +76,9 @@ class FillMaskPipeline(Pipeline):
         input_ids = inputs['input_ids'].detach().numpy()
         pred_ids = np.argmax(logits, axis=-1)
         model_type = self.model.config.model_type
-        rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids,
+        process_type = model_type if model_type in self.mask_id else _type_map[
+            model_type]
+        rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids,
                            input_ids)
 
         def rep_tokens(string, rep_map):
@@ -82,12 +88,12 @@ class FillMaskPipeline(Pipeline):
         pred_strings = []
         for ids in rst_ids:  # batch
-            if self.model.config.vocab_size == 21128:  # zh bert
+            if 'language' in self.config.model and \
+                    self.config.model.language == 'zh':
                 pred_string = self.tokenizer.convert_ids_to_tokens(ids)
                 pred_string = ''.join(pred_string)
             else:
                 pred_string = self.tokenizer.decode(ids)
-            pred_string = rep_tokens(pred_string, self.rep_map[model_type])
+            pred_string = rep_tokens(pred_string, self.rep_map[process_type])
             pred_strings.append(pred_string)
 
         return {'text': pred_strings}
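The post-processing above first normalizes the backbone's model_type to a canonical key ('sbert' maps to 'bert', 'veco' to 'roberta') before looking up mask ids and the replacement map. A runnable sketch of that resolution and the np.where substitution, using toy token ids (the real ids come from the tokenizers; 103 is BERT's [MASK]):

import numpy as np

_type_map = {'veco': 'roberta', 'sbert': 'bert'}
mask_id = {'roberta': 250001, 'bert': 103}

def resolve(model_type: str) -> str:
    # pass canonical names through, alias the sofa-specific ones
    return model_type if model_type in mask_id else _type_map[model_type]

input_ids = np.array([[101, 103, 2769, 103, 102]])  # toy batch, 103 = [MASK]
pred_ids = np.array([[101, 872, 2769, 597, 102]])   # toy argmax of the logits
# keep original ids everywhere except the masked positions
rst_ids = np.where(input_ids == mask_id[resolve('sbert')], pred_ids, input_ids)
assert rst_ids.tolist() == [[101, 872, 2769, 597, 102]]

An unrecognized model_type would raise KeyError in _type_map, so the mapping only needs to grow when a new backbone is registered.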
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 3f98a081..4ed63f3c 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -192,14 +192,17 @@ class FillMaskPreprocessor(Preprocessor):
             model_dir (str): model path
         """
         super().__init__(*args, **kwargs)
-        from sofa.utils.backend import AutoTokenizer
         self.model_dir = model_dir
         self.first_sequence: str = kwargs.pop('first_sequence',
                                               'first_sequence')
         self.sequence_length = kwargs.pop('sequence_length', 128)
-
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, use_fast=False)
+        try:
+            from transformers import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+        except KeyError:
+            from sofa.utils.backend import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_dir, use_fast=False)
 
     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
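The preprocessor now tries the stock Hugging Face tokenizer first and falls back to sofa's, on the assumption that transformers raises KeyError when the config names a model_type it does not know (such as sbert or veco). A generic sketch of that prefer-then-fallback idiom:

def load_tokenizer(model_dir: str):
    # prefer the transformers implementation when the model type is known
    try:
        from transformers import AutoTokenizer
        return AutoTokenizer.from_pretrained(model_dir)
    except KeyError:
        # sofa-only model types land here
        from sofa.utils.backend import AutoTokenizer
        return AutoTokenizer.from_pretrained(model_dir, use_fast=False)

Catching only KeyError keeps genuine failures (missing files, malformed configs) loud instead of silently rerouting them to sofa.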
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 49c5dc8a..d44ba4c8 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -3,7 +3,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import StructBertForMaskedLM, VecoForMaskedLM
+from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM,
+                                   VecoForMaskedLM)
 from modelscope.pipelines import FillMaskPipeline, pipeline
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
@@ -16,6 +17,7 @@ class FillMaskTest(unittest.TestCase):
         'en': 'damo/nlp_structbert_fill-mask_english-large'
     }
     model_id_veco = 'damo/nlp_veco_fill-mask-large'
+    model_id_bert = 'damo/nlp_bert_fill-mask_chinese-base'
 
     ori_texts = {
         'zh':
@@ -69,6 +71,20 @@ class FillMaskTest(unittest.TestCase):
             f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
         )
 
+        # zh bert
+        language = 'zh'
+        model_dir = snapshot_download(self.model_id_bert)
+        preprocessor = FillMaskPreprocessor(
+            model_dir, first_sequence='sentence', second_sequence=None)
+        model = BertForMaskedLM(model_dir)
+        pipeline1 = FillMaskPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        ori_text = self.ori_texts[language]
+        test_input = self.test_inputs[language]
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
+              f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         # sbert
@@ -97,6 +113,18 @@ class FillMaskTest(unittest.TestCase):
         print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
               f'{pipeline_ins(test_input)}\n')
 
+        # zh bert
+        model = Model.from_pretrained(self.model_id_bert)
+        preprocessor = FillMaskPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        language = 'zh'
+        ori_text = self.ori_texts[language]
+        test_input = self.test_inputs[language]
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+              f'{pipeline_ins(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         # veco
@@ -115,6 +143,12 @@ class FillMaskTest(unittest.TestCase):
             f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
             f'{pipeline_ins(self.test_inputs[language])}\n')
 
+        # bert
+        pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert)
+        print(
+            f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
+            f'{pipeline_ins(self.test_inputs[language])}\n')
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.fill_mask)
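End to end, the new Chinese BERT path can be exercised the same way the tests do. A minimal usage sketch (the model id comes from the test file above; the input sentence is illustrative, not from the tests):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

fill_mask = pipeline(
    task=Tasks.fill_mask, model='damo/nlp_bert_fill-mask_chinese-base')
# any sentence containing the model's mask token; the zh BERT model uses [MASK]
print(fill_mask('今天天气非常[MASK]。'))  # returns {'text': [...]} per the pipeline above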