Browse Source

add tests

master
suluyan 3 years ago
parent
commit
3aa1a70ac8
7 changed files with 116 additions and 79 deletions
  1. +1
    -1
      modelscope/models/nlp/__init__.py
  2. +1
    -1
      modelscope/pipelines/nlp/__init__.py
  3. +18
    -9
      modelscope/pipelines/nlp/fill_mask_pipeline.py
  4. +6
    -0
      modelscope/pipelines/outputs.py
  5. +6
    -12
      modelscope/preprocessors/nlp.py
  6. +1
    -1
      requirements/nlp.txt
  7. +83
    -55
      tests/pipelines/test_fill_mask.py

+ 1
- 1
modelscope/models/nlp/__init__.py View File

@@ -1,4 +1,4 @@
from .masked_language_model import * # noqa F403
from .sentence_similarity_model import * # noqa F403 from .sentence_similarity_model import * # noqa F403
from .sequence_classification_model import * # noqa F403 from .sequence_classification_model import * # noqa F403
from .text_generation_model import * # noqa F403 from .text_generation_model import * # noqa F403
from .masked_language_model import * # noqa F403

+ 1
- 1
modelscope/pipelines/nlp/__init__.py View File

@@ -1,4 +1,4 @@
from .fill_mask_pipeline import * # noqa F403
from .sentence_similarity_pipeline import * # noqa F403 from .sentence_similarity_pipeline import * # noqa F403
from .sequence_classification_pipeline import * # noqa F403 from .sequence_classification_pipeline import * # noqa F403
from .text_generation_pipeline import * # noqa F403 from .text_generation_pipeline import * # noqa F403
from .fill_mask_pipeline import * # noqa F403

+ 18
- 9
modelscope/pipelines/nlp/fill_mask_pipeline.py View File

@@ -1,5 +1,6 @@
from typing import Dict
from typing import Dict, Optional


from modelscope.models import Model
from modelscope.models.nlp import MaskedLanguageModel from modelscope.models.nlp import MaskedLanguageModel
from modelscope.preprocessors import FillMaskPreprocessor from modelscope.preprocessors import FillMaskPreprocessor
from modelscope.utils.constant import Tasks from modelscope.utils.constant import Tasks
@@ -13,15 +14,23 @@ __all__ = ['FillMaskPipeline']
@PIPELINES.register_module(Tasks.fill_mask, module_name=r'veco') @PIPELINES.register_module(Tasks.fill_mask, module_name=r'veco')
class FillMaskPipeline(Pipeline): class FillMaskPipeline(Pipeline):


def __init__(self, model: MaskedLanguageModel,
preprocessor: FillMaskPreprocessor, **kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
def __init__(self,
model: MaskedLanguageModel,
preprocessor: Optional[FillMaskPreprocessor] = None,
**kwargs):
"""Use `model` and `preprocessor` to create an NLP fill-mask pipeline for prediction


Args: Args:
model (SequenceClassificationModel): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
model (MaskedLanguageModel): a model instance
preprocessor (FillMaskPreprocessor): a preprocessor instance
""" """

sc_model = model if isinstance(
model, MaskedLanguageModel) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = FillMaskPreprocessor(
sc_model.model_dir,
first_sequence='sentence',
second_sequence=None)
super().__init__(model=model, preprocessor=preprocessor, **kwargs) super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.preprocessor = preprocessor self.preprocessor = preprocessor
self.tokenizer = preprocessor.tokenizer self.tokenizer = preprocessor.tokenizer
@@ -55,10 +64,10 @@ class FillMaskPipeline(Pipeline):
pred_string = ''.join(pred_string).replace('##', '') pred_string = ''.join(pred_string).replace('##', '')
pred_string = pred_string.split('[SEP]')[0].replace( pred_string = pred_string.split('[SEP]')[0].replace(
'[CLS]', '').replace('[SEP]', '').replace('[UNK]', '') '[CLS]', '').replace('[SEP]', '').replace('[UNK]', '')
else: ## en bert
else: # en bert
pred_string = self.tokenizer.decode(ids) pred_string = self.tokenizer.decode(ids)
pred_string = pred_string.split('[SEP]')[0].replace( pred_string = pred_string.split('[SEP]')[0].replace(
'[CLS]', '').replace('[SEP]', '').replace('[UNK]', '') '[CLS]', '').replace('[SEP]', '').replace('[UNK]', '')
pred_strings.append(pred_string) pred_strings.append(pred_string)


return {'pred_string': pred_strings}
return {'text': pred_strings}

+ 6
- 0
modelscope/pipelines/outputs.py View File

@@ -69,6 +69,12 @@ TASK_OUTPUTS = {
# } # }
Tasks.text_generation: ['text'], Tasks.text_generation: ['text'],


# fill mask result for single sample
# {
# "text": "this is the text with its masks filled in by the model."
# }
Tasks.fill_mask: ['text'],

# ============ audio tasks =================== # ============ audio tasks ===================


# ============ multi-modal tasks =================== # ============ multi-modal tasks ===================


+ 6
- 12
modelscope/preprocessors/nlp.py View File

@@ -12,8 +12,7 @@ from .builder import PREPROCESSORS


__all__ = [ __all__ = [
'Tokenize', 'SequenceClassificationPreprocessor', 'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor',
'FillMaskPreprocessor'
'TextGenerationPreprocessor', 'FillMaskPreprocessor'
] ]




@@ -173,8 +172,7 @@ class TextGenerationPreprocessor(Preprocessor):
return {k: torch.tensor(v) for k, v in rst.items()} return {k: torch.tensor(v) for k, v in rst.items()}




@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'sbert')
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'sbert')
class FillMaskPreprocessor(Preprocessor): class FillMaskPreprocessor(Preprocessor):


def __init__(self, model_dir: str, *args, **kwargs): def __init__(self, model_dir: str, *args, **kwargs):
@@ -190,7 +188,8 @@ class FillMaskPreprocessor(Preprocessor):
'first_sequence') 'first_sequence')
self.sequence_length = kwargs.pop('sequence_length', 128) self.sequence_length = kwargs.pop('sequence_length', 128)


self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
self.tokenizer = AutoTokenizer.from_pretrained(
model_dir, use_fast=False)


@type_assert(object, str) @type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]: def __call__(self, data: str) -> Dict[str, Any]:
@@ -205,15 +204,11 @@ class FillMaskPreprocessor(Preprocessor):
Dict[str, Any]: the preprocessed data Dict[str, Any]: the preprocessed data
""" """
import torch import torch
new_data = {self.first_sequence: data} new_data = {self.first_sequence: data}
# preprocess the data for the model input # preprocess the data for the model input


rst = {
'input_ids': [],
'attention_mask': [],
'token_type_ids': []
}
rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}


max_seq_length = self.sequence_length max_seq_length = self.sequence_length


@@ -230,4 +225,3 @@ class FillMaskPreprocessor(Preprocessor):
rst['token_type_ids'].append(feature['token_type_ids']) rst['token_type_ids'].append(feature['token_type_ids'])


return {k: torch.tensor(v) for k, v in rst.items()} return {k: torch.tensor(v) for k, v in rst.items()}


+ 1
- 1
requirements/nlp.txt View File

@@ -1 +1 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.3-py3-none-any.whl

+ 83
- 55
tests/pipelines/test_fill_mask.py View File

@@ -23,82 +23,110 @@ class FillMaskTest(unittest.TestCase):


ori_texts = { ori_texts = {
'zh': 'zh':
f'段誉轻挥折扇,摇了摇头,说道:“你师父是你的师父,你师父可不是我的师父。'
f'你师父差得动你,你师父可差不动我。',
'段誉轻挥折扇,摇了摇头,说道:“你师父是你的师父,你师父可不是我的师父。'
'你师父差得动你,你师父可差不动我。',
'en': 'en':
f'Everything in what you call reality is really just a r'
f'eflection of your consciousness. Your whole universe is'
f'just a mirror reflection of your story.'
'Everything in what you call reality is really just a reflection of your '
'consciousness. Your whole universe is just a mirror reflection of your story.'
} }


test_inputs = { test_inputs = {
'zh': 'zh':
f'段誉轻[MASK]折扇,摇了摇[MASK],[MASK]道:“你师父是你的[MASK][MASK]'
f',你师父可不是[MASK]的师父。你师父差得动你,你师父可[MASK]不动我。',
'段誉轻[MASK]折扇,摇了摇[MASK],[MASK]道:“你师父是你的[MASK][MASK],你'
'师父可不是[MASK]的师父。你师父差得动你,你师父可[MASK]不动我。',
'en': 'en':
f'Everything in [MASK] you call reality is really [MASK] a '
f'reflection of your [MASK]. Your whole universe is just a '
f'mirror [MASK] of your story.'
'Everything in [MASK] you call reality is really [MASK] a reflection of your '
'[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.'
} }


#def test_run(self):
# # sbert
# for language in ["zh", "en"]:
# model_dir = snapshot_download(self.model_id_sbert[language])
# preprocessor = FillMaskPreprocessor(
# model_dir, first_sequence='sentence', second_sequence=None)
# model = MaskedLanguageModel(model_dir)
# pipeline1 = FillMaskPipeline(model, preprocessor)
# pipeline2 = pipeline(
# Tasks.fill_mask, model=model, preprocessor=preprocessor)
# ori_text = self.ori_texts[language]
# test_input = self.test_inputs[language]
# print(
# f'ori_text: {ori_text}\ninput: {test_input}\npipeline1: '
# f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}'
# )
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_by_direct_model_download(self):
# sbert
for language in ['zh', 'en']:
model_dir = snapshot_download(self.model_id_sbert[language])
preprocessor = FillMaskPreprocessor(
model_dir, first_sequence='sentence', second_sequence=None)
model = MaskedLanguageModel(model_dir)
pipeline1 = FillMaskPipeline(model, preprocessor)
pipeline2 = pipeline(
Tasks.fill_mask, model=model, preprocessor=preprocessor)
ori_text = self.ori_texts[language]
test_input = self.test_inputs[language]
print(
f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
)


## veco
#model_dir = snapshot_download(self.model_id_veco)
#preprocessor = FillMaskPreprocessor(
# model_dir, first_sequence='sentence', second_sequence=None)
#model = MaskedLanguageModel(model_dir)
#pipeline1 = FillMaskPipeline(model, preprocessor)
#pipeline2 = pipeline(
# Tasks.fill_mask, model=model, preprocessor=preprocessor)
#for language in ["zh", "en"]:
# ori_text = self.ori_texts[language]
# test_input = self.test_inputs["zh"].replace("[MASK]", "<mask>")
# print(
# f'ori_text: {ori_text}\ninput: {test_input}\npipeline1: '
# f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}'
# veco
model_dir = snapshot_download(self.model_id_veco)
preprocessor = FillMaskPreprocessor(
model_dir, first_sequence='sentence', second_sequence=None)
model = MaskedLanguageModel(model_dir)
pipeline1 = FillMaskPipeline(model, preprocessor)
pipeline2 = pipeline(
Tasks.fill_mask, model=model, preprocessor=preprocessor)
for language in ['zh', 'en']:
ori_text = self.ori_texts[language]
test_input = self.test_inputs[language].replace('[MASK]', '<mask>')
print(
f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
)


@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_model_from_modelhub(self): def test_run_with_model_from_modelhub(self):
for language in ['zh']:
# sbert
for language in ['zh', 'en']:
print(self.model_id_sbert[language]) print(self.model_id_sbert[language])
model = Model.from_pretrained(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language])
print('model', model.model_dir)
preprocessor = FillMaskPreprocessor( preprocessor = FillMaskPreprocessor(
model.model_dir, model.model_dir,
first_sequence='sentence', first_sequence='sentence',
second_sequence=None) second_sequence=None)
pipeline_ins = pipeline( pipeline_ins = pipeline(
task=Tasks.fill_mask, model=model, preprocessor=preprocessor) task=Tasks.fill_mask, model=model, preprocessor=preprocessor)
print(pipeline_ins(self.test_inputs[language]))
print(
f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
f'{pipeline_ins(self.test_inputs[language])}\n')

# veco
model = Model.from_pretrained(self.model_id_veco)
preprocessor = FillMaskPreprocessor(
model.model_dir, first_sequence='sentence', second_sequence=None)
pipeline_ins = pipeline(
Tasks.fill_mask, model=model, preprocessor=preprocessor)
for language in ['zh', 'en']:
ori_text = self.ori_texts[language]
test_input = self.test_inputs[language].replace('[MASK]', '<mask>')
print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
f'{pipeline_ins(test_input)}\n')

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_model_name(self):
# veco
pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_veco)
for language in ['zh', 'en']:
ori_text = self.ori_texts[language]
test_input = self.test_inputs[language].replace('[MASK]', '<mask>')
print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
f'{pipeline_ins(test_input)}\n')


#def test_run_with_model_name(self):
## veco
#pipeline_ins = pipeline(
# task=Tasks.fill_mask, model=self.model_id_veco)
#for language in ["zh", "en"]:
# input_ = self.test_inputs[language].replace("[MASK]", "<mask>")
# print(pipeline_ins(input_))
# structBert
language = 'zh'
pipeline_ins = pipeline(
task=Tasks.fill_mask, model=self.model_id_sbert[language])
print(
f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
f'{pipeline_ins(self.test_inputs[language])}\n')


## structBert
#for language in ["zh"]:
# pipeline_ins = pipeline(
# task=Tasks.fill_mask, model=self.model_id_sbert[language])
# print(pipeline_ins(self_test_inputs[language]))
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.fill_mask)
language = 'en'
ori_text = self.ori_texts[language]
test_input = self.test_inputs[language].replace('[MASK]', '<mask>')
print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
f'{pipeline_ins(test_input)}\n')




if __name__ == '__main__': if __name__ == '__main__':


Loading…
Cancel
Save