1130中文分词/词性标注新增LSTM轻量化模型

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10931263
3 years ago · 000976836f
--- a/modelscope/models/nlp/init.py
+++ b/modelscope/models/nlp/init.py
@@ -40,6 +40,7 @@ if TYPE_CHECKING:
        InformationExtractionModel,
        LSTMCRFForNamedEntityRecognition,
        LSTMCRFForWordSegmentation,
        LSTMCRFForPartOfSpeech,
        SequenceClassificationModel,
        SingleBackboneTaskModelBase,
        TaskModelForTextGeneration,
@@ -95,6 +96,7 @@ else:
            'InformationExtractionModel',
            'LSTMCRFForNamedEntityRecognition',
            'LSTMCRFForWordSegmentation',
            'LSTMCRFForPartOfSpeech',
            'SequenceClassificationModel',
            'SingleBackboneTaskModelBase',
            'TaskModelForTextGeneration',
--- a/modelscope/models/nlp/gpt_moe/moe/sharded_moe.py
+++ b/modelscope/models/nlp/gpt_moe/moe/sharded_moe.py
@@ -421,9 +421,9 @@ class MOELayer(Base):
        self.use_expert_residual_network = use_expert_residual_network

        if self.use_expert_residual_network:
            self.expert_network = nn.Sequential(
                *([ExpertResidualLayer(self.gate.model_dim)
                   for _ in range(6)]))
            self.expert_network = nn.Sequential(*([
                ExpertResidualLayer(self.gate.model_dim) for _ in range(6)
            ]))  # noqa

        self.use_tutel = use_tutel and TUTEL_INSTALLED

--- a/modelscope/models/nlp/task_models/init.py
+++ b/modelscope/models/nlp/task_models/init.py
@@ -10,6 +10,7 @@ if TYPE_CHECKING:
    from .nncrf_for_named_entity_recognition import (
        LSTMCRFForNamedEntityRecognition,
        LSTMCRFForWordSegmentation,
        LSTMCRFForPartOfSpeech,
        TransformerCRFForNamedEntityRecognition,
        TransformerCRFForWordSegmentation,
    )
@@ -26,6 +27,7 @@ else:
        'nncrf_for_named_entity_recognition': [
            'LSTMCRFForNamedEntityRecognition',
            'LSTMCRFForWordSegmentation',
            'LSTMCRFForPartOfSpeech',
            'TransformerCRFForNamedEntityRecognition',
            'TransformerCRFForWordSegmentation',
        ],
--- a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py
+++ b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py
@@ -17,7 +17,8 @@ from modelscope.utils.constant import ModelFile, Tasks

 __all__ = [
    'TransformerCRFForNamedEntityRecognition',
    'LSTMCRFForNamedEntityRecognition'
    'LSTMCRFForNamedEntityRecognition', 'LSTMCRFForWordSegmentation',
    'LSTMCRFForPartOfSpeech'
 ]


@@ -193,10 +194,16 @@ class LSTMCRFForNamedEntityRecognition(


@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf_wseg)
@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf)
 class LSTMCRFForWordSegmentation(LSTMCRFForNamedEntityRecognition):
    """Word-segmentation entry point that reuses the NER class as-is.

    Declares no attributes or methods of its own; everything is inherited
    unchanged from ``LSTMCRFForNamedEntityRecognition``.
    """


@MODELS.register_module(Tasks.part_of_speech, module_name=Models.lcrf)
 class LSTMCRFForPartOfSpeech(LSTMCRFForNamedEntityRecognition):
    """Part-of-speech entry point that reuses the NER class as-is.

    Declares no attributes or methods of its own; everything is inherited
    unchanged from ``LSTMCRFForNamedEntityRecognition``.
    """


 class TransformerCRF(nn.Module):
    """A transformer based model to NER tasks.

--- a/modelscope/preprocessors/base.py
+++ b/modelscope/preprocessors/base.py
@@ -123,6 +123,10 @@ PREPROCESSOR_MAP = {
    # taskmodels
    (Models.lcrf, Tasks.named_entity_recognition):
    Preprocessors.sequence_labeling_tokenizer,
    (Models.lcrf, Tasks.word_segmentation):
    Preprocessors.sequence_labeling_tokenizer,
    (Models.lcrf, Tasks.part_of_speech):
    Preprocessors.sequence_labeling_tokenizer,
    (Models.lcrf_wseg, Tasks.word_segmentation):
    Preprocessors.sequence_labeling_tokenizer,
    (Models.tcrf_wseg, Tasks.word_segmentation):
--- a/tests/pipelines/test_part_of_speech.py
+++ b/tests/pipelines/test_part_of_speech.py
@@ -4,7 +4,8 @@ import unittest

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import TokenClassificationModel
 from modelscope.models.nlp import (LSTMCRFForPartOfSpeech,
                                   TokenClassificationModel)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TokenClassificationPipeline
 from modelscope.preprocessors import \
@@ -15,6 +16,7 @@ from modelscope.utils.test_utils import test_level

 class PartOfSpeechTest(unittest.TestCase):
    model_id = 'damo/nlp_structbert_part-of-speech_chinese-lite'
    lstmcrf_news_model_id = 'damo/nlp_lstmcrf_part-of-speech_chinese-news'
    sentence = '今天天气不错，适合出去游玩'

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -30,7 +32,20 @@ class PartOfSpeechTest(unittest.TestCase):
        print()
        print(f'pipeline2: {pipeline2(input=self.sentence)}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lstmcrf_news_by_direct_model_download(self):
        """Run the LSTM-CRF news POS model from a downloaded snapshot.

        The same model/preprocessor pair is driven through a directly
        constructed ``TokenClassificationPipeline`` and through the
        ``pipeline`` factory; both outputs are printed.
        """
        local_dir = snapshot_download(self.lstmcrf_news_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(local_dir)
        tagger = LSTMCRFForPartOfSpeech.from_pretrained(local_dir)
        direct_pipeline = TokenClassificationPipeline(
            tagger, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.part_of_speech, model=tagger, preprocessor=preprocessor)

        # Both pipelines are built before either one is invoked.
        direct_output = direct_pipeline(input=self.sentence)
        print(f'sentence: {self.sentence}\n'
              f'pipeline1:{direct_output}')
        print()
        factory_output = factory_pipeline(input=self.sentence)
        print(f'pipeline2: {factory_output}')

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(
@@ -40,11 +55,26 @@ class PartOfSpeechTest(unittest.TestCase):
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_lstmcrf_news_with_model_from_modelhub(self):
        """Load the LSTM-CRF news POS model by id and tag the sample sentence.

        The model comes from ``Model.from_pretrained``; the preprocessor is
        built from the model's own ``model_dir``.
        """
        tagger = Model.from_pretrained(self.lstmcrf_news_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(
            tagger.model_dir)
        pos_pipeline = pipeline(
            task=Tasks.part_of_speech,
            model=tagger,
            preprocessor=preprocessor)
        result = pos_pipeline(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        """Build the POS pipeline from ``model_id`` alone and print its output."""
        pos_pipeline = pipeline(
            task=Tasks.part_of_speech,
            model=self.model_id,
        )
        result = pos_pipeline(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstmcrf_new_with_model_name(self):
        """Build the POS pipeline from the LSTM-CRF news model id and run it."""
        # NOTE(review): the method name says "new" while the attribute it
        # uses is ``lstmcrf_news_model_id`` - probably a typo; confirm
        # before renaming.
        model_name = self.lstmcrf_news_model_id
        pos_pipeline = pipeline(task=Tasks.part_of_speech, model=model_name)
        result = pos_pipeline(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_default_model(self):
        """Build the POS pipeline from the task name only and print its output.

        No ``model`` argument is passed to ``pipeline``.
        """
        pos_pipeline = pipeline(task=Tasks.part_of_speech)
        result = pos_pipeline(input=self.sentence)
        print(result)
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -3,7 +3,8 @@ import unittest

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import SbertForTokenClassification
 from modelscope.models.nlp import (LSTMCRFForWordSegmentation,
                                   SbertForTokenClassification)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationPipeline
 from modelscope.preprocessors import \
@@ -19,8 +20,12 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
    def setUp(self) -> None:
        """Set the task and the model ids used by the tests in this class."""
        self.task = Tasks.word_segmentation

        # StructBERT-based models.
        self.model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
        self.ecom_model_id = (
            'damo/nlp_structbert_word-segmentation_chinese-base-ecommerce')

        # LSTM-CRF models.
        self.lstmcrf_news_model_id = (
            'damo/nlp_lstmcrf_word-segmentation_chinese-news')
        self.lstmcrf_ecom_model_id = (
            'damo/nlp_lstmcrf_word-segmentation_chinese-ecommerce')

    sentence = '今天天气不错，适合出去游玩'
    sentence_ecom = '东阳草肌醇复合物'
    sentence_eng = 'I am a program.'
    regress_tool = MsRegressTool(baseline=False)

@@ -36,7 +41,43 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
              f'pipeline1:{pipeline1(input=self.sentence)}')
        print(f'pipeline2: {pipeline2(input=self.sentence)}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_ecom_by_direct_model_download(self):
        """Run the e-commerce StructBERT segmenter from a downloaded snapshot.

        The model is exercised through a directly constructed
        ``WordSegmentationPipeline`` and through the ``pipeline`` factory;
        both outputs for ``sentence_ecom`` are printed.
        """
        local_dir = snapshot_download(self.ecom_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(local_dir)
        segmenter = SbertForTokenClassification.from_pretrained(local_dir)
        direct_pipeline = WordSegmentationPipeline(
            segmenter, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.word_segmentation,
            model=segmenter,
            preprocessor=preprocessor)

        # Both pipelines are built before either one is invoked.
        direct_output = direct_pipeline(input=self.sentence_ecom)
        print(f'sentence: {self.sentence_ecom}\n'
              f'pipeline1:{direct_output}')
        factory_output = factory_pipeline(input=self.sentence_ecom)
        print(f'pipeline2: {factory_output}')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lstmcrf_news_by_direct_model_download(self):
        """Run the LSTM-CRF news segmenter from a downloaded snapshot.

        The model is constructed directly from the snapshot directory with
        the preprocessor passed as ``tokenizer``, then exercised through a
        ``WordSegmentationPipeline`` and through the ``pipeline`` factory.
        """
        local_dir = snapshot_download(self.lstmcrf_news_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(local_dir)
        segmenter = LSTMCRFForWordSegmentation(
            local_dir, tokenizer=preprocessor)
        direct_pipeline = WordSegmentationPipeline(
            segmenter, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.word_segmentation,
            model=segmenter,
            preprocessor=preprocessor)

        # Both pipelines are built before either one is invoked.
        direct_output = direct_pipeline(input=self.sentence)
        print(f'sentence: {self.sentence}\n'
              f'pipeline1:{direct_output}')
        factory_output = factory_pipeline(input=self.sentence)
        print(f'pipeline2: {factory_output}')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lstmcrf_ecom_by_direct_model_download(self):
        """Run the LSTM-CRF e-commerce segmenter from a downloaded snapshot.

        The model is constructed directly from the snapshot directory with
        the preprocessor passed as ``tokenizer``, then exercised through a
        ``WordSegmentationPipeline`` and through the ``pipeline`` factory.
        """
        local_dir = snapshot_download(self.lstmcrf_ecom_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(local_dir)
        segmenter = LSTMCRFForWordSegmentation(
            local_dir, tokenizer=preprocessor)
        direct_pipeline = WordSegmentationPipeline(
            segmenter, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.word_segmentation,
            model=segmenter,
            preprocessor=preprocessor)

        # Both pipelines are built before either one is invoked.
        direct_output = direct_pipeline(input=self.sentence_ecom)
        print(f'sentence: {self.sentence_ecom}\n'
              f'pipeline1:{direct_output}')
        factory_output = factory_pipeline(input=self.sentence_ecom)
        print(f'pipeline2: {factory_output}')

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(
@@ -46,6 +87,33 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_ecom_with_model_from_modelhub(self):
        """Load the e-commerce StructBERT segmenter by id and run it.

        The model comes from ``Model.from_pretrained``; the preprocessor is
        built from the model's own ``model_dir``.
        """
        segmenter = Model.from_pretrained(self.ecom_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(
            segmenter.model_dir)
        seg_pipeline = pipeline(
            task=Tasks.word_segmentation,
            model=segmenter,
            preprocessor=preprocessor)
        result = seg_pipeline(input=self.sentence_ecom)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstmcrf_news_with_model_from_modelhub(self):
        """Load the LSTM-CRF news segmenter by id and run it.

        The model comes from ``Model.from_pretrained``; the preprocessor is
        built from the model's own ``model_dir``.
        """
        segmenter = Model.from_pretrained(self.lstmcrf_news_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(
            segmenter.model_dir)
        seg_pipeline = pipeline(
            task=Tasks.word_segmentation,
            model=segmenter,
            preprocessor=preprocessor)
        result = seg_pipeline(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_lstmcrf_ecom_with_model_from_modelhub(self):
        """Load the LSTM-CRF e-commerce segmenter by id and run it.

        The model comes from ``Model.from_pretrained``; the preprocessor is
        built from the model's own ``model_dir``.
        """
        segmenter = Model.from_pretrained(self.lstmcrf_ecom_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(
            segmenter.model_dir)
        seg_pipeline = pipeline(
            task=Tasks.word_segmentation,
            model=segmenter,
            preprocessor=preprocessor)
        result = seg_pipeline(input=self.sentence_ecom)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.word_segmentation, model=self.model_id)
@@ -56,6 +124,24 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
            print(pipeline_ins(input=self.sentence))
        print(pipeline_ins(input=self.sentence_eng))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_ecom_with_model_name(self):
        """Build the pipeline from ``ecom_model_id`` alone and print its output."""
        model_name = self.ecom_model_id
        seg_pipeline = pipeline(task=Tasks.word_segmentation, model=model_name)
        result = seg_pipeline(input=self.sentence_ecom)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstmcrf_news_with_model_name(self):
        """Build the pipeline from the LSTM-CRF news model id and run it."""
        model_name = self.lstmcrf_news_model_id
        seg_pipeline = pipeline(task=Tasks.word_segmentation, model=model_name)
        result = seg_pipeline(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstmcrf_ecom_with_model_name(self):
        """Build the pipeline from the LSTM-CRF e-commerce model id and run it."""
        model_name = self.lstmcrf_ecom_model_id
        seg_pipeline = pipeline(task=Tasks.word_segmentation, model=model_name)
        result = seg_pipeline(input=self.sentence_ecom)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name_batch(self):
        pipeline_ins = pipeline(