Browse Source

1130中文分词/词性标注新增LSTM轻量化模型

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10931263
master^2
dingkun.ldk yingda.chen 3 years ago
parent
commit
000976836f
7 changed files with 140 additions and 9 deletions
  1. modelscope/models/nlp/__init__.py (+2, -0)
  2. modelscope/models/nlp/gpt_moe/moe/sharded_moe.py (+3, -3)
  3. modelscope/models/nlp/task_models/__init__.py (+2, -0)
  4. modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py (+8, -1)
  5. modelscope/preprocessors/base.py (+4, -0)
  6. tests/pipelines/test_part_of_speech.py (+33, -3)
  7. tests/pipelines/test_word_segmentation.py (+88, -2)

+ 2
- 0
modelscope/models/nlp/__init__.py View File

@@ -40,6 +40,7 @@ if TYPE_CHECKING:
InformationExtractionModel, InformationExtractionModel,
LSTMCRFForNamedEntityRecognition, LSTMCRFForNamedEntityRecognition,
LSTMCRFForWordSegmentation, LSTMCRFForWordSegmentation,
LSTMCRFForPartOfSpeech,
SequenceClassificationModel, SequenceClassificationModel,
SingleBackboneTaskModelBase, SingleBackboneTaskModelBase,
TaskModelForTextGeneration, TaskModelForTextGeneration,
@@ -95,6 +96,7 @@ else:
'InformationExtractionModel', 'InformationExtractionModel',
'LSTMCRFForNamedEntityRecognition', 'LSTMCRFForNamedEntityRecognition',
'LSTMCRFForWordSegmentation', 'LSTMCRFForWordSegmentation',
'LSTMCRFForPartOfSpeech',
'SequenceClassificationModel', 'SequenceClassificationModel',
'SingleBackboneTaskModelBase', 'SingleBackboneTaskModelBase',
'TaskModelForTextGeneration', 'TaskModelForTextGeneration',


+ 3
- 3
modelscope/models/nlp/gpt_moe/moe/sharded_moe.py View File

@@ -421,9 +421,9 @@ class MOELayer(Base):
self.use_expert_residual_network = use_expert_residual_network self.use_expert_residual_network = use_expert_residual_network


if self.use_expert_residual_network: if self.use_expert_residual_network:
self.expert_network = nn.Sequential(
*([ExpertResidualLayer(self.gate.model_dim)
for _ in range(6)]))
self.expert_network = nn.Sequential(*([
ExpertResidualLayer(self.gate.model_dim) for _ in range(6)
])) # noqa


self.use_tutel = use_tutel and TUTEL_INSTALLED self.use_tutel = use_tutel and TUTEL_INSTALLED




+ 2
- 0
modelscope/models/nlp/task_models/__init__.py View File

@@ -10,6 +10,7 @@ if TYPE_CHECKING:
from .nncrf_for_named_entity_recognition import ( from .nncrf_for_named_entity_recognition import (
LSTMCRFForNamedEntityRecognition, LSTMCRFForNamedEntityRecognition,
LSTMCRFForWordSegmentation, LSTMCRFForWordSegmentation,
LSTMCRFForPartOfSpeech,
TransformerCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition,
TransformerCRFForWordSegmentation, TransformerCRFForWordSegmentation,
) )
@@ -26,6 +27,7 @@ else:
'nncrf_for_named_entity_recognition': [ 'nncrf_for_named_entity_recognition': [
'LSTMCRFForNamedEntityRecognition', 'LSTMCRFForNamedEntityRecognition',
'LSTMCRFForWordSegmentation', 'LSTMCRFForWordSegmentation',
'LSTMCRFForPartOfSpeech',
'TransformerCRFForNamedEntityRecognition', 'TransformerCRFForNamedEntityRecognition',
'TransformerCRFForWordSegmentation', 'TransformerCRFForWordSegmentation',
], ],


+ 8
- 1
modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py View File

@@ -17,7 +17,8 @@ from modelscope.utils.constant import ModelFile, Tasks


__all__ = [ __all__ = [
'TransformerCRFForNamedEntityRecognition', 'TransformerCRFForNamedEntityRecognition',
'LSTMCRFForNamedEntityRecognition'
'LSTMCRFForNamedEntityRecognition', 'LSTMCRFForWordSegmentation',
'LSTMCRFForPartOfSpeech'
] ]




@@ -193,10 +194,16 @@ class LSTMCRFForNamedEntityRecognition(




@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf_wseg) @MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf_wseg)
@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf)
class LSTMCRFForWordSegmentation(LSTMCRFForNamedEntityRecognition): class LSTMCRFForWordSegmentation(LSTMCRFForNamedEntityRecognition):
pass pass




@MODELS.register_module(Tasks.part_of_speech, module_name=Models.lcrf)
class LSTMCRFForPartOfSpeech(LSTMCRFForNamedEntityRecognition):
pass


class TransformerCRF(nn.Module): class TransformerCRF(nn.Module):
"""A transformer based model to NER tasks. """A transformer based model to NER tasks.




+ 4
- 0
modelscope/preprocessors/base.py View File

@@ -123,6 +123,10 @@ PREPROCESSOR_MAP = {
# taskmodels # taskmodels
(Models.lcrf, Tasks.named_entity_recognition): (Models.lcrf, Tasks.named_entity_recognition):
Preprocessors.sequence_labeling_tokenizer, Preprocessors.sequence_labeling_tokenizer,
(Models.lcrf, Tasks.word_segmentation):
Preprocessors.sequence_labeling_tokenizer,
(Models.lcrf, Tasks.part_of_speech):
Preprocessors.sequence_labeling_tokenizer,
(Models.lcrf_wseg, Tasks.word_segmentation): (Models.lcrf_wseg, Tasks.word_segmentation):
Preprocessors.sequence_labeling_tokenizer, Preprocessors.sequence_labeling_tokenizer,
(Models.tcrf_wseg, Tasks.word_segmentation): (Models.tcrf_wseg, Tasks.word_segmentation):


+ 33
- 3
tests/pipelines/test_part_of_speech.py View File

@@ -4,7 +4,8 @@ import unittest


from modelscope.hub.snapshot_download import snapshot_download from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model from modelscope.models import Model
from modelscope.models.nlp import TokenClassificationModel
from modelscope.models.nlp import (LSTMCRFForPartOfSpeech,
TokenClassificationModel)
from modelscope.pipelines import pipeline from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import TokenClassificationPipeline from modelscope.pipelines.nlp import TokenClassificationPipeline
from modelscope.preprocessors import \ from modelscope.preprocessors import \
@@ -15,6 +16,7 @@ from modelscope.utils.test_utils import test_level


class PartOfSpeechTest(unittest.TestCase): class PartOfSpeechTest(unittest.TestCase):
model_id = 'damo/nlp_structbert_part-of-speech_chinese-lite' model_id = 'damo/nlp_structbert_part-of-speech_chinese-lite'
lstmcrf_news_model_id = 'damo/nlp_lstmcrf_part-of-speech_chinese-news'
sentence = '今天天气不错,适合出去游玩' sentence = '今天天气不错,适合出去游玩'


@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -30,7 +32,20 @@ class PartOfSpeechTest(unittest.TestCase):
print() print()
print(f'pipeline2: {pipeline2(input=self.sentence)}') print(f'pipeline2: {pipeline2(input=self.sentence)}')


@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_lstmcrf_news_by_direct_model_download(self):
cache_path = snapshot_download(self.lstmcrf_news_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
model = LSTMCRFForPartOfSpeech.from_pretrained(cache_path)
pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.part_of_speech, model=model, preprocessor=tokenizer)
print(f'sentence: {self.sentence}\n'
f'pipeline1:{pipeline1(input=self.sentence)}')
print()
print(f'pipeline2: {pipeline2(input=self.sentence)}')

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_from_modelhub(self): def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id) model = Model.from_pretrained(self.model_id)
tokenizer = TokenClassificationTransformersPreprocessor( tokenizer = TokenClassificationTransformersPreprocessor(
@@ -40,11 +55,26 @@ class PartOfSpeechTest(unittest.TestCase):
print(pipeline_ins(input=self.sentence)) print(pipeline_ins(input=self.sentence))


@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_lstmcrf_news_with_model_from_modelhub(self):
model = Model.from_pretrained(self.lstmcrf_news_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(
model.model_dir)
pipeline_ins = pipeline(
task=Tasks.part_of_speech, model=model, preprocessor=tokenizer)
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_name(self): def test_run_with_model_name(self):
pipeline_ins = pipeline(task=Tasks.part_of_speech, model=self.model_id) pipeline_ins = pipeline(task=Tasks.part_of_speech, model=self.model_id)
print(pipeline_ins(input=self.sentence)) print(pipeline_ins(input=self.sentence))


@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_new_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.part_of_speech, model=self.lstmcrf_news_model_id)
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_default_model(self): def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.part_of_speech) pipeline_ins = pipeline(task=Tasks.part_of_speech)
print(pipeline_ins(input=self.sentence)) print(pipeline_ins(input=self.sentence))


+ 88
- 2
tests/pipelines/test_word_segmentation.py View File

@@ -3,7 +3,8 @@ import unittest


from modelscope.hub.snapshot_download import snapshot_download from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model from modelscope.models import Model
from modelscope.models.nlp import SbertForTokenClassification
from modelscope.models.nlp import (LSTMCRFForWordSegmentation,
SbertForTokenClassification)
from modelscope.pipelines import pipeline from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import WordSegmentationPipeline from modelscope.pipelines.nlp import WordSegmentationPipeline
from modelscope.preprocessors import \ from modelscope.preprocessors import \
@@ -19,8 +20,12 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
def setUp(self) -> None: def setUp(self) -> None:
self.task = Tasks.word_segmentation self.task = Tasks.word_segmentation
self.model_id = 'damo/nlp_structbert_word-segmentation_chinese-base' self.model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
self.ecom_model_id = 'damo/nlp_structbert_word-segmentation_chinese-base-ecommerce'
self.lstmcrf_news_model_id = 'damo/nlp_lstmcrf_word-segmentation_chinese-news'
self.lstmcrf_ecom_model_id = 'damo/nlp_lstmcrf_word-segmentation_chinese-ecommerce'


sentence = '今天天气不错,适合出去游玩' sentence = '今天天气不错,适合出去游玩'
sentence_ecom = '东阳草肌醇复合物'
sentence_eng = 'I am a program.' sentence_eng = 'I am a program.'
regress_tool = MsRegressTool(baseline=False) regress_tool = MsRegressTool(baseline=False)


@@ -36,7 +41,43 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
f'pipeline1:{pipeline1(input=self.sentence)}') f'pipeline1:{pipeline1(input=self.sentence)}')
print(f'pipeline2: {pipeline2(input=self.sentence)}') print(f'pipeline2: {pipeline2(input=self.sentence)}')


@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_ecom_by_direct_model_download(self):
cache_path = snapshot_download(self.ecom_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
model = SbertForTokenClassification.from_pretrained(cache_path)
pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.word_segmentation, model=model, preprocessor=tokenizer)
print(f'sentence: {self.sentence_ecom}\n'
f'pipeline1:{pipeline1(input=self.sentence_ecom)}')
print(f'pipeline2: {pipeline2(input=self.sentence_ecom)}')

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_lstmcrf_news_by_direct_model_download(self):
cache_path = snapshot_download(self.lstmcrf_news_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
model = LSTMCRFForWordSegmentation(cache_path, tokenizer=tokenizer)
pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.word_segmentation, model=model, preprocessor=tokenizer)
print(f'sentence: {self.sentence}\n'
f'pipeline1:{pipeline1(input=self.sentence)}')
print(f'pipeline2: {pipeline2(input=self.sentence)}')

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_lstmcrf_ecom_by_direct_model_download(self):
cache_path = snapshot_download(self.lstmcrf_ecom_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
model = LSTMCRFForWordSegmentation(cache_path, tokenizer=tokenizer)
pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.word_segmentation, model=model, preprocessor=tokenizer)
print(f'sentence: {self.sentence_ecom}\n'
f'pipeline1:{pipeline1(input=self.sentence_ecom)}')
print(f'pipeline2: {pipeline2(input=self.sentence_ecom)}')

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_from_modelhub(self): def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id) model = Model.from_pretrained(self.model_id)
tokenizer = TokenClassificationTransformersPreprocessor( tokenizer = TokenClassificationTransformersPreprocessor(
@@ -46,6 +87,33 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
print(pipeline_ins(input=self.sentence)) print(pipeline_ins(input=self.sentence))


@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_ecom_with_model_from_modelhub(self):
model = Model.from_pretrained(self.ecom_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(
model.model_dir)
pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
print(pipeline_ins(input=self.sentence_ecom))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_news_with_model_from_modelhub(self):
model = Model.from_pretrained(self.lstmcrf_news_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(
model.model_dir)
pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_lstmcrf_ecom_with_model_from_modelhub(self):
model = Model.from_pretrained(self.lstmcrf_ecom_model_id)
tokenizer = TokenClassificationTransformersPreprocessor(
model.model_dir)
pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
print(pipeline_ins(input=self.sentence_ecom))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_name(self): def test_run_with_model_name(self):
pipeline_ins = pipeline( pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=self.model_id) task=Tasks.word_segmentation, model=self.model_id)
@@ -56,6 +124,24 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
print(pipeline_ins(input=self.sentence)) print(pipeline_ins(input=self.sentence))
print(pipeline_ins(input=self.sentence_eng)) print(pipeline_ins(input=self.sentence_eng))


@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_ecom_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=self.ecom_model_id)
print(pipeline_ins(input=self.sentence_ecom))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_news_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=self.lstmcrf_news_model_id)
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_ecom_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=self.lstmcrf_ecom_model_id)
print(pipeline_ins(input=self.sentence_ecom))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_name_batch(self): def test_run_with_model_name_batch(self):
pipeline_ins = pipeline( pipeline_ins = pipeline(


Loading…
Cancel
Save