Browse Source

merge with nlp/space/dst

master
智丞 3 years ago
parent
commit
7b20e96af2
21 changed files with 100 additions and 92 deletions
  1. +4
    -4
      modelscope/models/nlp/__init__.py
  2. +0
    -0
      modelscope/models/nlp/bert_for_sequence_classification.py
  3. +9
    -18
      modelscope/models/nlp/palm_for_text_generation.py
  4. +0
    -0
      modelscope/models/nlp/sbert_for_sentence_similarity.py
  5. +0
    -1
      modelscope/models/nlp/sbert_for_token_classification.py
  6. +1
    -1
      modelscope/pipelines/base.py
  7. +2
    -1
      modelscope/pipelines/builder.py
  8. +1
    -1
      modelscope/pipelines/multi_modal/__init__.py
  9. +0
    -0
      modelscope/pipelines/multi_modal/image_caption_pipeline.py
  10. +0
    -3
      modelscope/pipelines/nlp/sentence_similarity_pipeline.py
  11. +0
    -3
      modelscope/pipelines/nlp/sequence_classification_pipeline.py
  12. +19
    -15
      modelscope/pipelines/nlp/text_generation_pipeline.py
  13. +0
    -2
      modelscope/pipelines/nlp/word_segmentation_pipeline.py
  14. +4
    -7
      modelscope/preprocessors/nlp.py
  15. +1
    -1
      modelscope/utils/constant.py
  16. +2
    -2
      modelscope/utils/nlp/space/utils.py
  17. +4
    -1
      requirements/nlp.txt
  18. +0
    -2
      requirements/nlp/space.txt
  19. +1
    -1
      requirements/runtime.txt
  20. +4
    -5
      tests/pipelines/nlp/test_dialog_modeling.py
  21. +48
    -24
      tests/pipelines/test_text_generation.py

+ 4
- 4
modelscope/models/nlp/__init__.py View File

@@ -1,6 +1,6 @@
from .sentence_similarity_model import * # noqa F403
from .sequence_classification_model import * # noqa F403
from .bert_for_sequence_classification import * # noqa F403
from .palm_for_text_generation import * # noqa F403
from .sbert_for_sentence_similarity import * # noqa F403
from .sbert_for_token_classification import * # noqa F403
from .space.dialog_intent_prediction_model import * # noqa F403
from .space.dialog_modeling_model import * # noqa F403
from .text_generation_model import * # noqa F403
from .token_classification_model import * # noqa F403

modelscope/models/nlp/sequence_classification_model.py → modelscope/models/nlp/bert_for_sequence_classification.py View File


modelscope/models/nlp/text_generation_model.py → modelscope/models/nlp/palm_for_text_generation.py View File

@@ -1,4 +1,4 @@
from typing import Any, Dict
from typing import Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
@@ -7,7 +7,7 @@ from ..builder import MODELS
__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
class PalmForTextGeneration(Model):

def __init__(self, model_dir: str, *args, **kwargs):
@@ -18,35 +18,26 @@ class PalmForTextGeneration(Model):
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
from sofa import PalmTokenizer

super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
tokenizer = kwargs.pop('tokenizer',
PalmTokenizer.from_pretrained(model_dir))
from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.generator = TextGenerator(model, tokenizer)
self.tokenizer = model.tokenizer
self.generator = Translator(model)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data
input (Dict[str, Tensor]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Dict[str, Tensor]: results
Example:
{
'predictions': array([1]), # label 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
'predictions': Tensor([[1377, 4959, 2785, 6392, ...]]), # tokens need to be decoded by the tokenizer
}
"""

encoder_inputs = [
input['input_ids'], input['token_type_ids'],
input['attention_mask']
]
return self.generator(encoder_inputs)
return self.generator(**input)

modelscope/models/nlp/sentence_similarity_model.py → modelscope/models/nlp/sbert_for_sentence_similarity.py View File


modelscope/models/nlp/token_classification_model.py → modelscope/models/nlp/sbert_for_token_classification.py View File

@@ -1,4 +1,3 @@
import os
from typing import Any, Dict, Union

import numpy as np

+ 1
- 1
modelscope/pipelines/base.py View File

@@ -16,7 +16,7 @@ from .outputs import TASK_OUTPUTS
from .util import is_model_name

Tensor = Union['torch.Tensor', 'tf.Tensor']
Input = Union[str, PyDataset, Dict, tuple, 'PIL.Image.Image', 'numpy.ndarray']
Input = Union[str, tuple, dict, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
InputModel = Union[str, Model]

output_keys = [


+ 2
- 1
modelscope/pipelines/builder.py View File

@@ -22,7 +22,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
Tasks.text_classification:
('bert-sentiment-analysis', 'damo/bert-base-sst2'),
Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),
Tasks.text_generation: ('palm2.0',
'damo/nlp_palm2.0_text-generation_chinese-base'),
Tasks.image_captioning: ('ofa', None),
Tasks.image_generation:
('person-image-cartoon',


+ 1
- 1
modelscope/pipelines/multi_modal/__init__.py View File

@@ -1 +1 @@
from .image_captioning import ImageCaptionPipeline
from .image_caption_pipeline import ImageCaptionPipeline

modelscope/pipelines/multi_modal/image_captioning.py → modelscope/pipelines/multi_modal/image_caption_pipeline.py View File


+ 0
- 3
modelscope/pipelines/nlp/sentence_similarity_pipeline.py View File

@@ -1,8 +1,5 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity


+ 0
- 3
modelscope/pipelines/nlp/sequence_classification_pipeline.py View File

@@ -1,8 +1,5 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import BertForSequenceClassification


+ 19
- 15
modelscope/pipelines/nlp/text_generation_pipeline.py View File

@@ -10,7 +10,7 @@ from ..builder import PIPELINES
__all__ = ['TextGenerationPipeline']


@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
class TextGenerationPipeline(Pipeline):

def __init__(self,
@@ -23,15 +23,16 @@ class TextGenerationPipeline(Pipeline):
model (SequenceClassificationModel): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""
sc_model = model if isinstance(
model = model if isinstance(
model, PalmForTextGeneration) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TextGenerationPreprocessor(
sc_model.model_dir,
model.model_dir,
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = model.tokenizer

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
"""process the prediction results
@@ -42,17 +43,20 @@ class TextGenerationPipeline(Pipeline):
Returns:
Dict[str, str]: the prediction results
"""
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'), ('<pad>',
''),
('<s>', ''), ('</s>', ''), ('<unk>', ' '))

vocab_size = len(self.tokenizer.vocab)
pred_list = inputs['predictions']
pred_ids = pred_list[0][0].cpu().numpy().tolist()
for j in range(len(pred_ids)):
if pred_ids[j] >= vocab_size:
pred_ids[j] = 100
pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
pred_string = ''.join(pred).replace(
'##',
'').split('[SEP]')[0].replace('[CLS]',
'').replace('[SEP]',
'').replace('[UNK]', '')
pred_string = self.tokenizer.decode(pred_ids)
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string.strip()
for _old, _new in replace_tokens_roberta:
pred_string = pred_string.replace(_old, _new)
pred_string.strip()
return {'text': pred_string}

+ 0
- 2
modelscope/pipelines/nlp/word_segmentation_pipeline.py View File

@@ -1,7 +1,5 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor


+ 4
- 7
modelscope/preprocessors/nlp.py View File

@@ -115,17 +115,15 @@ class SequenceClassificationPreprocessor(Preprocessor):
return rst


@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm')
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
class TextGenerationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
"""preprocess the data using the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""
from sofa import PalmTokenizer

super().__init__(*args, **kwargs)

self.model_dir: str = model_dir
@@ -134,7 +132,7 @@ class TextGenerationPreprocessor(Preprocessor):
self.second_sequence: str = kwargs.pop('second_sequence',
'second_sequence')
self.sequence_length: int = kwargs.pop('sequence_length', 128)
self.tokenizer = PalmTokenizer.from_pretrained(model_dir)
self.tokenizer = tokenizer

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
@@ -153,7 +151,7 @@ class TextGenerationPreprocessor(Preprocessor):
new_data = {self.first_sequence: data}
# preprocess the data for the model input

rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
rst = {'input_ids': [], 'attention_mask': []}

max_seq_length = self.sequence_length

@@ -168,7 +166,6 @@ class TextGenerationPreprocessor(Preprocessor):

rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return {k: torch.tensor(v) for k, v in rst.items()}



+ 1
- 1
modelscope/utils/constant.py View File

@@ -40,7 +40,7 @@ class Tasks(object):
token_classification = 'token-classification'
conversational = 'conversational'
text_generation = 'text-generation'
dialog_modeling = 'dialog_modeling'
dialog_modeling = 'dialog-modeling'
dialog_intent_prediction = 'dialog-intent-prediction'
table_question_answering = 'table-question-answering'
feature_extraction = 'feature-extraction'


+ 2
- 2
modelscope/utils/nlp/space/utils.py View File

@@ -169,8 +169,8 @@ class MultiWOZVocab(object):
if include_oov:
if self._word2idx.get(word, None) is None:
raise ValueError(
'Unknown word: %s. Vocabulary should include oovs here.' %
word)
'Unknown word: %s. Vocabulary should include oovs here.'
% word)
return self._word2idx[word]
else:
word = '<unk>' if word not in self._word2idx else word


+ 4
- 1
requirements/nlp.txt View File

@@ -1 +1,4 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
spacy>=2.3.5
# python -m spacy download en_core_web_sm

+ 0
- 2
requirements/nlp/space.txt View File

@@ -1,2 +0,0 @@
spacy==2.3.5
# python -m spacy download en_core_web_sm

+ 1
- 1
requirements/runtime.txt View File

@@ -1,7 +1,7 @@
addict
datasets
easydict
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.2.dev0-py3-none-any.whl
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow>=6.2.0


+ 4
- 5
tests/pipelines/nlp/test_dialog_modeling.py View File

@@ -92,10 +92,9 @@ class DialogModelingTest(unittest.TestCase):
}
}

# @unittest.skip('test with snapshot_download')
@unittest.skip('test with snapshot_download')
def test_run(self):

# cache_path = '/Users/yangliu/Space/maas_model/nlp_space_dialog-modeling'
cache_path = snapshot_download(self.model_id)

preprocessor = DialogModelingPreprocessor(model_dir=cache_path)
@@ -124,12 +123,12 @@ class DialogModelingTest(unittest.TestCase):

def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
preprocessor = DialogGenerationPreprocessor(model_dir=model.model_dir)
preprocessor = DialogModelingPreprocessor(model_dir=model.model_dir)

pipelines = [
DialogGenerationPipeline(model=model, preprocessor=preprocessor),
DialogModelingPipeline(model=model, preprocessor=preprocessor),
pipeline(
task=Tasks.dialog_generation,
task=Tasks.dialog_modeling,
model=model,
preprocessor=preprocessor)
]


+ 48
- 24
tests/pipelines/test_text_generation.py View File

@@ -12,43 +12,67 @@ from modelscope.utils.test_utils import test_level


class TextGenerationTest(unittest.TestCase):
model_id = 'damo/nlp_palm_text-generation_chinese'
input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'"
input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"
model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base'
model_id_en = 'damo/nlp_palm2.0_text-generation_english-base'
input_zh = """
本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:
1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代
"""
input_en = """
The Director of Public Prosecutions who let off Lord Janner over alleged child sex abuse started
her career at a legal chambers when the disgraced Labour peer was a top QC there . Alison Saunders ,
54 , sparked outrage last week when she decided the 86-year-old should not face astring of charges
of paedophilia against nine children because he has dementia . Today , newly-released documents
revealed damning evidence that abuse was covered up by police andsocial workers for more than 20 years .
And now it has emerged Mrs Saunders ' law career got off to a flying start when she secured her
pupillage -- a barrister 's training contract at 1 Garden Court Chambers in London in 1983 .
"""

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run(self):
cache_path = snapshot_download(self.model_id)
preprocessor = TextGenerationPreprocessor(
cache_path, first_sequence='sentence', second_sequence=None)
model = PalmForTextGeneration(
cache_path, tokenizer=preprocessor.tokenizer)
pipeline1 = TextGenerationPipeline(model, preprocessor)
pipeline2 = pipeline(
Tasks.text_generation, model=model, preprocessor=preprocessor)
print(f'input: {self.input1}\npipeline1: {pipeline1(self.input1)}')
print()
print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}')
for model_id, input in ((self.model_id_zh, self.input_zh),
(self.model_id_en, self.input_en)):
cache_path = snapshot_download(model_id)
model = PalmForTextGeneration(cache_path)
preprocessor = TextGenerationPreprocessor(
cache_path,
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
pipeline1 = TextGenerationPipeline(model, preprocessor)
pipeline2 = pipeline(
Tasks.text_generation, model=model, preprocessor=preprocessor)
print(
f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
preprocessor = TextGenerationPreprocessor(
model.model_dir, first_sequence='sentence', second_sequence=None)
pipeline_ins = pipeline(
task=Tasks.text_generation, model=model, preprocessor=preprocessor)
print(pipeline_ins(self.input1))
for model_id, input in ((self.model_id_zh, self.input_zh),
(self.model_id_en, self.input_en)):
model = Model.from_pretrained(model_id)
preprocessor = TextGenerationPreprocessor(
model.model_dir,
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
pipeline_ins = pipeline(
task=Tasks.text_generation,
model=model,
preprocessor=preprocessor)
print(pipeline_ins(input))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.text_generation, model=self.model_id)
print(pipeline_ins(self.input2))
for model_id, input in ((self.model_id_zh, self.input_zh),
(self.model_id_en, self.input_en)):
pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
print(pipeline_ins(input))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.text_generation)
print(pipeline_ins(self.input2))
print(pipeline_ins(self.input_zh))


if __name__ == '__main__':


Loading…
Cancel
Save