|
|
|
@@ -1,7 +1,7 @@ |
|
|
|
from typing import Dict, Optional |
|
|
|
|
|
|
|
from modelscope.models import Model |
|
|
|
from modelscope.models.nlp import MaskedLanguageModel |
|
|
|
from modelscope.models.nlp import AliceMindBaseForMaskedLM |
|
|
|
from modelscope.preprocessors import FillMaskPreprocessor |
|
|
|
from modelscope.utils.constant import Tasks |
|
|
|
from ..base import Pipeline, Tensor |
|
|
|
@@ -15,20 +15,20 @@ __all__ = ['FillMaskPipeline'] |
|
|
|
class FillMaskPipeline(Pipeline): |
|
|
|
|
|
|
|
def __init__(self, |
|
|
|
model: MaskedLanguageModel, |
|
|
|
model: AliceMindBaseForMaskedLM, |
|
|
|
preprocessor: Optional[FillMaskPreprocessor] = None, |
|
|
|
**kwargs): |
|
|
|
"""use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction |
|
|
|
|
|
|
|
Args: |
|
|
|
model (MaskedLanguageModel): a model instance |
|
|
|
model (AliceMindBaseForMaskedLM): a model instance |
|
|
|
preprocessor (FillMaskPreprocessor): a preprocessor instance |
|
|
|
""" |
|
|
|
sc_model = model if isinstance( |
|
|
|
model, MaskedLanguageModel) else Model.from_pretrained(model) |
|
|
|
fill_mask_model = model if isinstance( |
|
|
|
model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model) |
|
|
|
if preprocessor is None: |
|
|
|
preprocessor = FillMaskPreprocessor( |
|
|
|
sc_model.model_dir, |
|
|
|
fill_mask_model.model_dir, |
|
|
|
first_sequence='sentence', |
|
|
|
second_sequence=None) |
|
|
|
super().__init__(model=model, preprocessor=preprocessor, **kwargs) |
|
|
|
@@ -36,6 +36,27 @@ class FillMaskPipeline(Pipeline): |
|
|
|
self.tokenizer = preprocessor.tokenizer |
|
|
|
self.mask_id = {'veco': 250001, 'sbert': 103} |
|
|
|
|
|
|
|
self.rep_map = { |
|
|
|
'sbert': { |
|
|
|
'[unused0]': '', |
|
|
|
'[PAD]': '', |
|
|
|
'[unused1]': '', |
|
|
|
r' +': ' ', |
|
|
|
'[SEP]': '', |
|
|
|
'[unused2]': '', |
|
|
|
'[CLS]': '', |
|
|
|
'[UNK]': '' |
|
|
|
}, |
|
|
|
'veco': { |
|
|
|
r' +': ' ', |
|
|
|
'<mask>': '<q>', |
|
|
|
'<pad>': '', |
|
|
|
'<s>': '', |
|
|
|
'</s>': '', |
|
|
|
'<unk>': ' ' |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: |
|
|
|
"""process the prediction results |
|
|
|
|
|
|
|
@@ -49,25 +70,23 @@ class FillMaskPipeline(Pipeline): |
|
|
|
logits = inputs['logits'].detach().numpy() |
|
|
|
input_ids = inputs['input_ids'].detach().numpy() |
|
|
|
pred_ids = np.argmax(logits, axis=-1) |
|
|
|
rst_ids = np.where( |
|
|
|
input_ids == self.mask_id[self.model.config.model_type], pred_ids, |
|
|
|
input_ids) |
|
|
|
model_type = self.model.config.model_type |
|
|
|
rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids, |
|
|
|
input_ids) |
|
|
|
|
|
|
|
def rep_tokens(string, rep_map): |
|
|
|
for k, v in rep_map.items(): |
|
|
|
string = string.replace(k, v) |
|
|
|
return string.strip() |
|
|
|
|
|
|
|
pred_strings = [] |
|
|
|
for ids in rst_ids: |
|
|
|
if self.model.config.model_type == 'veco': |
|
|
|
pred_string = self.tokenizer.decode(ids).split( |
|
|
|
'</s>')[0].replace('<s>', |
|
|
|
'').replace('</s>', |
|
|
|
'').replace('<pad>', '') |
|
|
|
elif self.model.config.vocab_size == 21128: # zh bert |
|
|
|
for ids in rst_ids: # batch |
|
|
|
if self.model.config.vocab_size == 21128: # zh bert |
|
|
|
pred_string = self.tokenizer.convert_ids_to_tokens(ids) |
|
|
|
pred_string = ''.join(pred_string).replace('##', '') |
|
|
|
pred_string = pred_string.split('[SEP]')[0].replace( |
|
|
|
'[CLS]', '').replace('[SEP]', '').replace('[UNK]', '') |
|
|
|
else: # en bert |
|
|
|
pred_string = ''.join(pred_string) |
|
|
|
else: |
|
|
|
pred_string = self.tokenizer.decode(ids) |
|
|
|
pred_string = pred_string.split('[SEP]')[0].replace( |
|
|
|
'[CLS]', '').replace('[SEP]', '').replace('[UNK]', '') |
|
|
|
pred_string = rep_tokens(pred_string, self.rep_map[model_type]) |
|
|
|
pred_strings.append(pred_string) |
|
|
|
|
|
|
|
return {'text': pred_strings} |