
[to #42322933]feat: add nlp-chinese-bert-fill-mask-pipeline to maas_lib

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9155437
Branch: master
Author: suluyan.sly (3 years ago)
Parent commit: a7c1cd0fc9

4 changed files with 94 additions and 37 deletions
  1. modelscope/models/nlp/masked_language_model.py (+31 -17)
  2. modelscope/pipelines/nlp/fill_mask_pipeline.py (+21 -15)
  3. modelscope/preprocessors/nlp.py (+7 -4)
  4. tests/pipelines/test_fill_mask.py (+35 -1)
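
For review context, a minimal usage sketch of what this commit enables. The task and model id are taken from the new test case below; the input sentence is an invented placeholder, not from the diff:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # model id copied from tests/pipelines/test_fill_mask.py in this commit
    pipeline_ins = pipeline(task=Tasks.fill_mask,
                            model='damo/nlp_bert_fill-mask_chinese-base')

    # hypothetical input; Chinese BERT checkpoints use the [MASK] token
    print(pipeline_ins('生活的真谛是[MASK]。'))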

modelscope/models/nlp/masked_language_model.py (+31 -17)

@@ -2,24 +2,28 @@ from typing import Any, Dict, Optional, Union

 import numpy as np

-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS

-__all__ = ['StructBertForMaskedLM', 'VecoForMaskedLM']
+__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM']


-class AliceMindBaseForMaskedLM(Model):
+class MaskedLanguageModelBase(Model):

     def __init__(self, model_dir: str, *args, **kwargs):
-        from sofa.utils.backend import AutoConfig, AutoModelForMaskedLM
+        self.model_dir = model_dir
         super().__init__(model_dir, *args, **kwargs)
+        self.model = self.build_model()

-        self.config = AutoConfig.from_pretrained(model_dir)
-        self.model = AutoModelForMaskedLM.from_pretrained(
-            model_dir, config=self.config)
+    def build_model(self):
+        raise NotImplementedError()
+
+    @property
+    def config(self):
+        if hasattr(self.model, 'config'):
+            return self.model.config
+        return None

     def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
         """return the result by the model
@@ -38,14 +42,24 @@ class AliceMindBaseForMaskedLM(Model):


 @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert)
-class StructBertForMaskedLM(AliceMindBaseForMaskedLM):
-    # The StructBert for MaskedLM uses the same underlying model structure
-    # as the base model class.
-    pass
+class StructBertForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from sofa import SbertForMaskedLM
+        return SbertForMaskedLM.from_pretrained(self.model_dir)


 @MODELS.register_module(Tasks.fill_mask, module_name=Models.veco)
-class VecoForMaskedLM(AliceMindBaseForMaskedLM):
-    # The Veco for MaskedLM uses the same underlying model structure
-    # as the base model class.
-    pass
+class VecoForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from sofa import VecoForMaskedLM
+        return VecoForMaskedLM.from_pretrained(self.model_dir)
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert)
+class BertForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from transformers import BertForMaskedLM
+        return BertForMaskedLM.from_pretrained(self.model_dir)
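
The refactor above replaces sofa-only AutoModel loading with a template pattern: the base class stores model_dir and calls build_model(), which each registered subclass overrides. A hedged sketch of how a further backbone could plug in, assuming the import paths implied by the relative imports in the diff; the 'roberta' registration and the HF class alias are purely illustrative, not part of this commit:

    from transformers import RobertaForMaskedLM as HfRobertaForMaskedLM

    from modelscope.models.builder import MODELS
    from modelscope.models.nlp.masked_language_model import \
        MaskedLanguageModelBase
    from modelscope.utils.constant import Tasks

    # hypothetical module_name; this commit only registers structbert,
    # veco and bert
    @MODELS.register_module(Tasks.fill_mask, module_name='roberta')
    class RobertaForMaskedLM(MaskedLanguageModelBase):

        def build_model(self):
            # base __init__ sets self.model_dir before calling build_model();
            # the returned module also backs the new `config` property
            return HfRobertaForMaskedLM.from_pretrained(self.model_dir)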

modelscope/pipelines/nlp/fill_mask_pipeline.py (+21 -15)

@@ -1,32 +1,34 @@
+import os
 from typing import Dict, Optional, Union

-from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp.masked_language_model import \
-    AliceMindBaseForMaskedLM
-from modelscope.preprocessors import FillMaskPreprocessor
-from modelscope.utils.constant import Tasks
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp.masked_language_model import MaskedLanguageModelBase
+from ...preprocessors import FillMaskPreprocessor
+from ...utils.config import Config
+from ...utils.constant import ModelFile, Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES

 __all__ = ['FillMaskPipeline']
+_type_map = {'veco': 'roberta', 'sbert': 'bert'}


 @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
 class FillMaskPipeline(Pipeline):

     def __init__(self,
-                 model: Union[AliceMindBaseForMaskedLM, str],
+                 model: Union[MaskedLanguageModelBase, str],
                  preprocessor: Optional[FillMaskPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction

         Args:
-            model (AliceMindBaseForMaskedLM): a model instance
+            model (MaskedLanguageModelBase): a model instance
             preprocessor (FillMaskPreprocessor): a preprocessor instance
         """
         fill_mask_model = model if isinstance(
-            model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
+            model, MaskedLanguageModelBase) else Model.from_pretrained(model)
@@ -34,11 +36,13 @@ class FillMaskPipeline(Pipeline):
                 second_sequence=None)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.preprocessor = preprocessor
+        self.config = Config.from_file(
+            os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
         self.tokenizer = preprocessor.tokenizer
-        self.mask_id = {'veco': 250001, 'sbert': 103}
+        self.mask_id = {'roberta': 250001, 'bert': 103}

         self.rep_map = {
-            'sbert': {
+            'bert': {
                 '[unused0]': '',
                 '[PAD]': '',
                 '[unused1]': '',
@@ -48,7 +52,7 @@ class FillMaskPipeline(Pipeline):
                 '[CLS]': '',
                 '[UNK]': ''
             },
-            'veco': {
+            'roberta': {
                 r' +': ' ',
                 '<mask>': '<q>',
                 '<pad>': '',
@@ -72,7 +76,9 @@ class FillMaskPipeline(Pipeline):
         input_ids = inputs['input_ids'].detach().numpy()
         pred_ids = np.argmax(logits, axis=-1)
         model_type = self.model.config.model_type
-        rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids,
+        process_type = model_type if model_type in self.mask_id else _type_map[
+            model_type]
+        rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids,
                            input_ids)

         def rep_tokens(string, rep_map):
@@ -82,12 +88,12 @@ class FillMaskPipeline(Pipeline):

         pred_strings = []
         for ids in rst_ids:  # batch
-            if self.model.config.vocab_size == 21128:  # zh bert
+            if 'language' in self.config.model and self.config.model.language == 'zh':
                 pred_string = self.tokenizer.convert_ids_to_tokens(ids)
                 pred_string = ''.join(pred_string)
             else:
                 pred_string = self.tokenizer.decode(ids)
-            pred_string = rep_tokens(pred_string, self.rep_map[model_type])
+            pred_string = rep_tokens(pred_string, self.rep_map[process_type])
             pred_strings.append(pred_string)

         return {'text': pred_strings}
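
The postprocess change keys the mask id and the token-cleanup table on an HF-style type rather than the ModelScope model name: config.model_type is normalized through _type_map ('veco' to 'roberta', 'sbert' to 'bert'), so the new plain BERT model needs no extra entry. A standalone sketch of just that lookup, with the dictionaries copied from the diff:

    import numpy as np

    # copied from the diff: AliceMind names map onto HF-style model types
    _type_map = {'veco': 'roberta', 'sbert': 'bert'}
    mask_id = {'roberta': 250001, 'bert': 103}

    def resolve_masks(input_ids: np.ndarray, pred_ids: np.ndarray,
                      model_type: str) -> np.ndarray:
        """Keep original ids everywhere except the masked positions,
        which take the model's argmax predictions."""
        process_type = (model_type
                        if model_type in mask_id else _type_map[model_type])
        return np.where(input_ids == mask_id[process_type], pred_ids,
                        input_ids)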

modelscope/preprocessors/nlp.py (+7 -4)

@@ -192,14 +192,17 @@ class FillMaskPreprocessor(Preprocessor):
             model_dir (str): model path
         """
         super().__init__(*args, **kwargs)
-        from sofa.utils.backend import AutoTokenizer
         self.model_dir = model_dir
         self.first_sequence: str = kwargs.pop('first_sequence',
                                               'first_sequence')
         self.sequence_length = kwargs.pop('sequence_length', 128)
-
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, use_fast=False)
+        try:
+            from transformers import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+        except KeyError:
+            from sofa.utils.backend import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_dir, use_fast=False)

     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
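
The likely intent of the new try/except (my inference; the diff itself does not explain it): transformers' AutoTokenizer raises KeyError for checkpoints whose model_type it does not know, such as the AliceMind sbert/veco checkpoints, so those fall back to the sofa tokenizer while standard HF checkpoints like the new Chinese BERT load directly. A standalone sketch of the same shape:

    def load_tokenizer(model_dir: str):
        """Prefer the Hugging Face tokenizer; fall back to sofa for
        model types transformers does not recognize."""
        try:
            from transformers import AutoTokenizer
            return AutoTokenizer.from_pretrained(model_dir)
        except KeyError:  # e.g. model_type 'sbert' or 'veco'
            from sofa.utils.backend import AutoTokenizer
            return AutoTokenizer.from_pretrained(model_dir, use_fast=False)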


tests/pipelines/test_fill_mask.py (+35 -1)

@@ -3,7 +3,8 @@ import unittest

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import StructBertForMaskedLM, VecoForMaskedLM
+from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM,
+                                   VecoForMaskedLM)
 from modelscope.pipelines import FillMaskPipeline, pipeline
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
@@ -16,6 +17,7 @@ class FillMaskTest(unittest.TestCase):
         'en': 'damo/nlp_structbert_fill-mask_english-large'
     }
     model_id_veco = 'damo/nlp_veco_fill-mask-large'
+    model_id_bert = 'damo/nlp_bert_fill-mask_chinese-base'

     ori_texts = {
         'zh':
@@ -69,6 +71,20 @@ class FillMaskTest(unittest.TestCase):
             f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
         )

+        # zh bert
+        language = 'zh'
+        model_dir = snapshot_download(self.model_id_bert)
+        preprocessor = FillMaskPreprocessor(
+            model_dir, first_sequence='sentence', second_sequence=None)
+        model = BertForMaskedLM(model_dir)
+        pipeline1 = FillMaskPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        ori_text = self.ori_texts[language]
+        test_input = self.test_inputs[language]
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
+              f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         # sbert
@@ -97,6 +113,18 @@ class FillMaskTest(unittest.TestCase):
         print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
               f'{pipeline_ins(test_input)}\n')

+        # zh bert
+        model = Model.from_pretrained(self.model_id_bert)
+        preprocessor = FillMaskPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        language = 'zh'
+        ori_text = self.ori_texts[language]
+        test_input = self.test_inputs[language]
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+              f'{pipeline_ins(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         # veco
@@ -115,6 +143,12 @@ class FillMaskTest(unittest.TestCase):
             f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
             f'{pipeline_ins(self.test_inputs[language])}\n')

+        # bert
+        pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert)
+        print(
+            f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
+            f'{pipeline_ins(self.test_inputs[language])}\n')
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.fill_mask)

