Browse Source

1130中文分词/词性标注新增LSTM轻量化模型

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10931263
master^2
dingkun.ldk yingda.chen 3 years ago
parent
commit
000976836f
7 changed files with 140 additions and 9 deletions
  1. +2
    -0
      modelscope/models/nlp/__init__.py
  2. +3
    -3
      modelscope/models/nlp/gpt_moe/moe/sharded_moe.py
  3. +2
    -0
      modelscope/models/nlp/task_models/__init__.py
  4. +8
    -1
      modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py
  5. +4
    -0
      modelscope/preprocessors/base.py
  6. +33
    -3
      tests/pipelines/test_part_of_speech.py
  7. +88
    -2
      tests/pipelines/test_word_segmentation.py

+ 2
- 0
modelscope/models/nlp/__init__.py View File

@@ -40,6 +40,7 @@ if TYPE_CHECKING:
InformationExtractionModel,
LSTMCRFForNamedEntityRecognition,
LSTMCRFForWordSegmentation,
LSTMCRFForPartOfSpeech,
SequenceClassificationModel,
SingleBackboneTaskModelBase,
TaskModelForTextGeneration,
@@ -95,6 +96,7 @@ else:
'InformationExtractionModel',
'LSTMCRFForNamedEntityRecognition',
'LSTMCRFForWordSegmentation',
'LSTMCRFForPartOfSpeech',
'SequenceClassificationModel',
'SingleBackboneTaskModelBase',
'TaskModelForTextGeneration',


+ 3
- 3
modelscope/models/nlp/gpt_moe/moe/sharded_moe.py View File

@@ -421,9 +421,9 @@ class MOELayer(Base):
self.use_expert_residual_network = use_expert_residual_network

if self.use_expert_residual_network:
self.expert_network = nn.Sequential(
*([ExpertResidualLayer(self.gate.model_dim)
for _ in range(6)]))
self.expert_network = nn.Sequential(*([
ExpertResidualLayer(self.gate.model_dim) for _ in range(6)
])) # noqa

self.use_tutel = use_tutel and TUTEL_INSTALLED



+ 2
- 0
modelscope/models/nlp/task_models/__init__.py View File

@@ -10,6 +10,7 @@ if TYPE_CHECKING:
from .nncrf_for_named_entity_recognition import (
LSTMCRFForNamedEntityRecognition,
LSTMCRFForWordSegmentation,
LSTMCRFForPartOfSpeech,
TransformerCRFForNamedEntityRecognition,
TransformerCRFForWordSegmentation,
)
@@ -26,6 +27,7 @@ else:
'nncrf_for_named_entity_recognition': [
'LSTMCRFForNamedEntityRecognition',
'LSTMCRFForWordSegmentation',
'LSTMCRFForPartOfSpeech',
'TransformerCRFForNamedEntityRecognition',
'TransformerCRFForWordSegmentation',
],


+ 8
- 1
modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py View File

@@ -17,7 +17,8 @@ from modelscope.utils.constant import ModelFile, Tasks

__all__ = [
'TransformerCRFForNamedEntityRecognition',
'LSTMCRFForNamedEntityRecognition'
'LSTMCRFForNamedEntityRecognition', 'LSTMCRFForWordSegmentation',
'LSTMCRFForPartOfSpeech'
]


@@ -193,10 +194,16 @@ class LSTMCRFForNamedEntityRecognition(


@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf_wseg)
@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf)
class LSTMCRFForWordSegmentation(LSTMCRFForNamedEntityRecognition):
    """Word-segmentation registration of the LSTM-CRF NER model.

    Registered for ``Tasks.word_segmentation`` under both
    ``Models.lcrf_wseg`` and ``Models.lcrf``; the class body adds nothing
    on top of ``LSTMCRFForNamedEntityRecognition``.
    """


@MODELS.register_module(Tasks.part_of_speech, module_name=Models.lcrf)
class LSTMCRFForPartOfSpeech(LSTMCRFForNamedEntityRecognition):
    """Part-of-speech registration of the LSTM-CRF NER model.

    Registered for ``Tasks.part_of_speech`` under ``Models.lcrf``; the class
    body adds nothing on top of ``LSTMCRFForNamedEntityRecognition``.
    """


class TransformerCRF(nn.Module):
"""A transformer based model to NER tasks.



+ 4
- 0
modelscope/preprocessors/base.py View File

@@ -123,6 +123,10 @@ PREPROCESSOR_MAP = {
# taskmodels
(Models.lcrf, Tasks.named_entity_recognition):
Preprocessors.sequence_labeling_tokenizer,
(Models.lcrf, Tasks.word_segmentation):
Preprocessors.sequence_labeling_tokenizer,
(Models.lcrf, Tasks.part_of_speech):
Preprocessors.sequence_labeling_tokenizer,
(Models.lcrf_wseg, Tasks.word_segmentation):
Preprocessors.sequence_labeling_tokenizer,
(Models.tcrf_wseg, Tasks.word_segmentation):


+ 33
- 3
tests/pipelines/test_part_of_speech.py View File

@@ -4,7 +4,8 @@ import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import TokenClassificationModel
from modelscope.models.nlp import (LSTMCRFForPartOfSpeech,
TokenClassificationModel)
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import TokenClassificationPipeline
from modelscope.preprocessors import \
@@ -15,6 +16,7 @@ from modelscope.utils.test_utils import test_level

class PartOfSpeechTest(unittest.TestCase):
model_id = 'damo/nlp_structbert_part-of-speech_chinese-lite'
lstmcrf_news_model_id = 'damo/nlp_lstmcrf_part-of-speech_chinese-news'
sentence = '今天天气不错,适合出去游玩'

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -30,7 +32,20 @@ class PartOfSpeechTest(unittest.TestCase):
print()
print(f'pipeline2: {pipeline2(input=self.sentence)}')

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_lstmcrf_news_by_direct_model_download(self):
    """Run the LSTM-CRF news POS model from a ``snapshot_download`` path.

    The pipeline is built twice -- once by instantiating
    ``TokenClassificationPipeline`` directly and once through the
    ``pipeline`` factory -- and both results for ``self.sentence`` are
    printed. Nothing is asserted on the output.
    """
    model_dir = snapshot_download(self.lstmcrf_news_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(model_dir)
    pos_model = LSTMCRFForPartOfSpeech.from_pretrained(model_dir)

    direct_pipe = TokenClassificationPipeline(
        pos_model, preprocessor=preprocessor)
    factory_pipe = pipeline(
        Tasks.part_of_speech, model=pos_model, preprocessor=preprocessor)

    direct_result = direct_pipe(input=self.sentence)
    print(f'sentence: {self.sentence}\n'
          f'pipeline1:{direct_result}')
    print()
    factory_result = factory_pipe(input=self.sentence)
    print(f'pipeline2: {factory_result}')

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
tokenizer = TokenClassificationTransformersPreprocessor(
@@ -40,11 +55,26 @@ class PartOfSpeechTest(unittest.TestCase):
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_lstmcrf_news_with_model_from_modelhub(self):
    """Load the LSTM-CRF news POS model by id and run it on ``self.sentence``.

    The preprocessor is created from the loaded model's ``model_dir``; the
    pipeline output is printed, not asserted.
    """
    pos_model = Model.from_pretrained(self.lstmcrf_news_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(
        pos_model.model_dir)
    tagger = pipeline(
        task=Tasks.part_of_speech,
        model=pos_model,
        preprocessor=preprocessor)
    tagged = tagger(input=self.sentence)
    print(tagged)

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_name(self):
    """Build the POS pipeline from ``self.model_id`` and print its output."""
    tagger = pipeline(task=Tasks.part_of_speech, model=self.model_id)
    tagged = tagger(input=self.sentence)
    print(tagged)

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_new_with_model_name(self):
    """Build the POS pipeline from the LSTM-CRF news model id and print it.

    NOTE(review): the method name says ``new`` while the model id attribute
    is ``lstmcrf_news_model_id`` -- probably a typo for ``news``; the name
    is left unchanged so existing test selectors keep working.
    """
    tagger = pipeline(
        task=Tasks.part_of_speech, model=self.lstmcrf_news_model_id)
    tagged = tagger(input=self.sentence)
    print(tagged)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_default_model(self):
    """Build the POS pipeline without naming a model and print its output."""
    tagger = pipeline(task=Tasks.part_of_speech)
    tagged = tagger(input=self.sentence)
    print(tagged)


+ 88
- 2
tests/pipelines/test_word_segmentation.py View File

@@ -3,7 +3,8 @@ import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import SbertForTokenClassification
from modelscope.models.nlp import (LSTMCRFForWordSegmentation,
SbertForTokenClassification)
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import WordSegmentationPipeline
from modelscope.preprocessors import \
@@ -19,8 +20,12 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
def setUp(self) -> None:
    """Set the task and the model ids used by the tests in this class."""
    self.task = Tasks.word_segmentation

    # StructBERT-based models (general and e-commerce).
    self.model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
    self.ecom_model_id = (
        'damo/nlp_structbert_word-segmentation_chinese-base-ecommerce')

    # LSTM-CRF models (news and e-commerce).
    self.lstmcrf_news_model_id = (
        'damo/nlp_lstmcrf_word-segmentation_chinese-news')
    self.lstmcrf_ecom_model_id = (
        'damo/nlp_lstmcrf_word-segmentation_chinese-ecommerce')

sentence = '今天天气不错,适合出去游玩'
sentence_ecom = '东阳草肌醇复合物'
sentence_eng = 'I am a program.'
regress_tool = MsRegressTool(baseline=False)

@@ -36,7 +41,43 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
f'pipeline1:{pipeline1(input=self.sentence)}')
print(f'pipeline2: {pipeline2(input=self.sentence)}')

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_ecom_by_direct_model_download(self):
    """Run the e-commerce StructBERT model from a ``snapshot_download`` path.

    The pipeline is built both directly (``WordSegmentationPipeline``) and
    through the ``pipeline`` factory; both results for
    ``self.sentence_ecom`` are printed, not asserted.
    """
    model_dir = snapshot_download(self.ecom_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(model_dir)
    seg_model = SbertForTokenClassification.from_pretrained(model_dir)

    direct_pipe = WordSegmentationPipeline(
        seg_model, preprocessor=preprocessor)
    factory_pipe = pipeline(
        Tasks.word_segmentation, model=seg_model, preprocessor=preprocessor)

    direct_result = direct_pipe(input=self.sentence_ecom)
    print(f'sentence: {self.sentence_ecom}\n'
          f'pipeline1:{direct_result}')
    factory_result = factory_pipe(input=self.sentence_ecom)
    print(f'pipeline2: {factory_result}')

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_lstmcrf_news_by_direct_model_download(self):
    """Run the LSTM-CRF news model from a ``snapshot_download`` path.

    The model is constructed directly from the path (with the preprocessor
    passed as ``tokenizer``), then run through a hand-built
    ``WordSegmentationPipeline`` and through the ``pipeline`` factory.
    Both results for ``self.sentence`` are printed, not asserted.
    """
    model_dir = snapshot_download(self.lstmcrf_news_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(model_dir)
    seg_model = LSTMCRFForWordSegmentation(model_dir, tokenizer=preprocessor)

    direct_pipe = WordSegmentationPipeline(
        seg_model, preprocessor=preprocessor)
    factory_pipe = pipeline(
        Tasks.word_segmentation, model=seg_model, preprocessor=preprocessor)

    direct_result = direct_pipe(input=self.sentence)
    print(f'sentence: {self.sentence}\n'
          f'pipeline1:{direct_result}')
    factory_result = factory_pipe(input=self.sentence)
    print(f'pipeline2: {factory_result}')

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_lstmcrf_ecom_by_direct_model_download(self):
    """Run the LSTM-CRF e-commerce model from a ``snapshot_download`` path.

    The model is constructed directly from the path (with the preprocessor
    passed as ``tokenizer``), then run through a hand-built
    ``WordSegmentationPipeline`` and through the ``pipeline`` factory.
    Both results for ``self.sentence_ecom`` are printed, not asserted.
    """
    model_dir = snapshot_download(self.lstmcrf_ecom_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(model_dir)
    seg_model = LSTMCRFForWordSegmentation(model_dir, tokenizer=preprocessor)

    direct_pipe = WordSegmentationPipeline(
        seg_model, preprocessor=preprocessor)
    factory_pipe = pipeline(
        Tasks.word_segmentation, model=seg_model, preprocessor=preprocessor)

    direct_result = direct_pipe(input=self.sentence_ecom)
    print(f'sentence: {self.sentence_ecom}\n'
          f'pipeline1:{direct_result}')
    factory_result = factory_pipe(input=self.sentence_ecom)
    print(f'pipeline2: {factory_result}')

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
tokenizer = TokenClassificationTransformersPreprocessor(
@@ -46,6 +87,33 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_ecom_with_model_from_modelhub(self):
    """Load the e-commerce StructBERT model by id and segment a sample.

    The preprocessor is created from the loaded model's ``model_dir``; the
    pipeline output for ``self.sentence_ecom`` is printed, not asserted.
    """
    seg_model = Model.from_pretrained(self.ecom_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(
        seg_model.model_dir)
    segmenter = pipeline(
        task=Tasks.word_segmentation,
        model=seg_model,
        preprocessor=preprocessor)
    segmented = segmenter(input=self.sentence_ecom)
    print(segmented)

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_news_with_model_from_modelhub(self):
    """Load the LSTM-CRF news model by id and segment ``self.sentence``.

    The preprocessor is created from the loaded model's ``model_dir``; the
    pipeline output is printed, not asserted.
    """
    seg_model = Model.from_pretrained(self.lstmcrf_news_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(
        seg_model.model_dir)
    segmenter = pipeline(
        task=Tasks.word_segmentation,
        model=seg_model,
        preprocessor=preprocessor)
    segmented = segmenter(input=self.sentence)
    print(segmented)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_lstmcrf_ecom_with_model_from_modelhub(self):
    """Load the LSTM-CRF e-commerce model by id and segment a sample.

    The preprocessor is created from the loaded model's ``model_dir``; the
    pipeline output for ``self.sentence_ecom`` is printed, not asserted.
    """
    seg_model = Model.from_pretrained(self.lstmcrf_ecom_model_id)
    preprocessor = TokenClassificationTransformersPreprocessor(
        seg_model.model_dir)
    segmenter = pipeline(
        task=Tasks.word_segmentation,
        model=seg_model,
        preprocessor=preprocessor)
    segmented = segmenter(input=self.sentence_ecom)
    print(segmented)

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.word_segmentation, model=self.model_id)
@@ -56,6 +124,24 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
print(pipeline_ins(input=self.sentence))
print(pipeline_ins(input=self.sentence_eng))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_ecom_with_model_name(self):
    """Build the pipeline from ``self.ecom_model_id`` and print its output."""
    segmenter = pipeline(
        task=Tasks.word_segmentation, model=self.ecom_model_id)
    segmented = segmenter(input=self.sentence_ecom)
    print(segmented)

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_news_with_model_name(self):
    """Build the pipeline from the LSTM-CRF news model id and print output."""
    segmenter = pipeline(
        task=Tasks.word_segmentation, model=self.lstmcrf_news_model_id)
    segmented = segmenter(input=self.sentence)
    print(segmented)

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_lstmcrf_ecom_with_model_name(self):
    """Build the pipeline from the LSTM-CRF e-commerce id and print output."""
    segmenter = pipeline(
        task=Tasks.word_segmentation, model=self.lstmcrf_ecom_model_id)
    segmented = segmenter(input=self.sentence_ecom)
    print(segmented)

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_name_batch(self):
pipeline_ins = pipeline(


Loading…
Cancel
Save