Browse Source

fix comments: rename and refactor AliceMindMLM; adjust pipeline

master
suluyan 3 years ago
parent
commit
c7a19c9c1f
3 changed files with 59 additions and 36 deletions
  1. +14
    -10
      modelscope/models/nlp/masked_language_model.py
  2. +41
    -22
      modelscope/pipelines/nlp/fill_mask_pipeline.py
  3. +4
    -4
      tests/pipelines/test_fill_mask.py

+ 14
- 10
modelscope/models/nlp/masked_language_model.py View File

@@ -6,12 +6,12 @@ from ...utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['MaskedLanguageModel']
__all__ = [
'StructBertForMaskedLM', 'VecoForMaskedLM', 'AliceMindBaseForMaskedLM'
]


@MODELS.register_module(Tasks.fill_mask, module_name=r'sbert')
@MODELS.register_module(Tasks.fill_mask, module_name=r'veco')
class MaskedLanguageModel(Model):
class AliceMindBaseForMaskedLM(Model):

def __init__(self, model_dir: str, *args, **kwargs):
from sofa.utils.backend import AutoConfig, AutoModelForMaskedLM
@@ -30,15 +30,19 @@ class MaskedLanguageModel(Model):

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # label 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
}
"""
rst = self.model(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
token_type_ids=inputs['token_type_ids'])
return {'logits': rst['logits'], 'input_ids': inputs['input_ids']}


@MODELS.register_module(Tasks.fill_mask, module_name=r'sbert')
class StructBertForMaskedLM(AliceMindBaseForMaskedLM):
    """Fill-mask model registered under the ``sbert`` module name.

    Adds no behavior of its own: everything is inherited from
    ``AliceMindBaseForMaskedLM``. The subclass exists so the registry
    can map the ``sbert`` name to a dedicated class.
    """


@MODELS.register_module(Tasks.fill_mask, module_name=r'veco')
class VecoForMaskedLM(AliceMindBaseForMaskedLM):
    """Fill-mask model registered under the ``veco`` module name.

    Adds no behavior of its own: everything is inherited from
    ``AliceMindBaseForMaskedLM``. The subclass exists so the registry
    can map the ``veco`` name to a dedicated class.
    """

+ 41
- 22
modelscope/pipelines/nlp/fill_mask_pipeline.py View File

@@ -1,7 +1,7 @@
from typing import Dict, Optional

from modelscope.models import Model
from modelscope.models.nlp import MaskedLanguageModel
from modelscope.models.nlp import AliceMindBaseForMaskedLM
from modelscope.preprocessors import FillMaskPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
@@ -15,20 +15,20 @@ __all__ = ['FillMaskPipeline']
class FillMaskPipeline(Pipeline):

def __init__(self,
model: MaskedLanguageModel,
model: AliceMindBaseForMaskedLM,
preprocessor: Optional[FillMaskPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction

Args:
model (MaskedLanguageModel): a model instance
model (AliceMindBaseForMaskedLM): a model instance
preprocessor (FillMaskPreprocessor): a preprocessor instance
"""
sc_model = model if isinstance(
model, MaskedLanguageModel) else Model.from_pretrained(model)
fill_mask_model = model if isinstance(
model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = FillMaskPreprocessor(
sc_model.model_dir,
fill_mask_model.model_dir,
first_sequence='sentence',
second_sequence=None)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
@@ -36,6 +36,27 @@ class FillMaskPipeline(Pipeline):
self.tokenizer = preprocessor.tokenizer
self.mask_id = {'veco': 250001, 'sbert': 103}

self.rep_map = {
'sbert': {
'[unused0]': '',
'[PAD]': '',
'[unused1]': '',
r' +': ' ',
'[SEP]': '',
'[unused2]': '',
'[CLS]': '',
'[UNK]': ''
},
'veco': {
r' +': ' ',
'<mask>': '<q>',
'<pad>': '',
'<s>': '',
'</s>': '',
'<unk>': ' '
}
}

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""process the prediction results

@@ -49,25 +70,23 @@ class FillMaskPipeline(Pipeline):
logits = inputs['logits'].detach().numpy()
input_ids = inputs['input_ids'].detach().numpy()
pred_ids = np.argmax(logits, axis=-1)
rst_ids = np.where(
input_ids == self.mask_id[self.model.config.model_type], pred_ids,
input_ids)
model_type = self.model.config.model_type
rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids,
input_ids)

def rep_tokens(string, rep_map):
    """Strip special tokens from a decoded string and tidy its spacing.

    Args:
        string (str): decoded model output, possibly containing special
            tokens such as ``[CLS]`` or ``<pad>``.
        rep_map (Dict[str, str]): mapping of token -> replacement. Every
            key is replaced literally, except the key ``r' +'``, which is
            treated as a regular expression that collapses runs of spaces.

    Returns:
        str: the cleaned string with leading/trailing whitespace removed.
    """
    import re  # local import: the enclosing module may not import ``re``

    space_pattern = r' +'

    # Literal replacements first. Special tokens such as ``[unused0]``
    # contain regex metacharacters, so they must not go through ``re``.
    for token, replacement in rep_map.items():
        if token != space_pattern:
            string = string.replace(token, replacement)

    # ``str.replace`` would look for a literal " +" and never collapse
    # repeated spaces, so this key is applied as a regex. It runs last so
    # gaps left behind by removed tokens are collapsed as well.
    if space_pattern in rep_map:
        string = re.sub(space_pattern, rep_map[space_pattern], string)

    return string.strip()

pred_strings = []
for ids in rst_ids:
if self.model.config.model_type == 'veco':
pred_string = self.tokenizer.decode(ids).split(
'</s>')[0].replace('<s>',
'').replace('</s>',
'').replace('<pad>', '')
elif self.model.config.vocab_size == 21128: # zh bert
for ids in rst_ids: # batch
if self.model.config.vocab_size == 21128: # zh bert
pred_string = self.tokenizer.convert_ids_to_tokens(ids)
pred_string = ''.join(pred_string).replace('##', '')
pred_string = pred_string.split('[SEP]')[0].replace(
'[CLS]', '').replace('[SEP]', '').replace('[UNK]', '')
else: # en bert
pred_string = ''.join(pred_string)
else:
pred_string = self.tokenizer.decode(ids)
pred_string = pred_string.split('[SEP]')[0].replace(
'[CLS]', '').replace('[SEP]', '').replace('[UNK]', '')
pred_string = rep_tokens(pred_string, self.rep_map[model_type])
pred_strings.append(pred_string)

return {'text': pred_strings}

+ 4
- 4
tests/pipelines/test_fill_mask.py View File

@@ -6,7 +6,7 @@ import unittest
from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import MaskedLanguageModel
from modelscope.models.nlp import StructBertForMaskedLM, VecoForMaskedLM
from modelscope.pipelines import FillMaskPipeline, pipeline
from modelscope.preprocessors import FillMaskPreprocessor
from modelscope.utils.constant import Tasks
@@ -39,14 +39,14 @@ class FillMaskTest(unittest.TestCase):
'[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.'
}

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_by_direct_model_download(self):
# sbert
for language in ['zh', 'en']:
model_dir = snapshot_download(self.model_id_sbert[language])
preprocessor = FillMaskPreprocessor(
model_dir, first_sequence='sentence', second_sequence=None)
model = MaskedLanguageModel(model_dir)
model = StructBertForMaskedLM(model_dir)
pipeline1 = FillMaskPipeline(model, preprocessor)
pipeline2 = pipeline(
Tasks.fill_mask, model=model, preprocessor=preprocessor)
@@ -61,7 +61,7 @@ class FillMaskTest(unittest.TestCase):
model_dir = snapshot_download(self.model_id_veco)
preprocessor = FillMaskPreprocessor(
model_dir, first_sequence='sentence', second_sequence=None)
model = MaskedLanguageModel(model_dir)
model = VecoForMaskedLM(model_dir)
pipeline1 = FillMaskPipeline(model, preprocessor)
pipeline2 = pipeline(
Tasks.fill_mask, model=model, preprocessor=preprocessor)


Loading…
Cancel
Save