From 1640df567b12025fa53c7a401df81aa449bad508 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 10 Aug 2022 13:30:42 +0800 Subject: [PATCH] [to #42322933] refine some comments Refine some comments. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9674650 --- .../metrics/sequence_classification_metric.py | 4 +- modelscope/metrics/text_generation_metric.py | 2 + .../metrics/token_classification_metric.py | 6 +- modelscope/models/nlp/backbones/structbert.py | 2 - modelscope/models/nlp/masked_language.py | 12 ++++ .../nlp/nncrf_for_named_entity_recognition.py | 6 ++ .../models/nlp/sequence_classification.py | 32 +++++++++ modelscope/models/nlp/token_classification.py | 25 ++++++- .../nlp/dialog_intent_prediction_pipeline.py | 7 +- .../nlp/dialog_state_tracking_pipeline.py | 5 +- .../pipelines/nlp/fill_mask_pipeline.py | 24 ++++++- .../nlp/named_entity_recognition_pipeline.py | 18 +++++ .../pair_sentence_classification_pipeline.py | 25 ++++++- .../sequence_classification_pipeline_base.py | 6 +- ...single_sentence_classification_pipeline.py | 24 ++++++- .../pipelines/nlp/summarization_pipeline.py | 8 ++- .../task_oriented_conversation_pipeline.py | 7 +- .../nlp/text_error_correction_pipeline.py | 16 ++++- .../pipelines/nlp/text_generation_pipeline.py | 28 ++++++-- .../pipelines/nlp/translation_pipeline.py | 7 +- .../nlp/word_segmentation_pipeline.py | 23 +++++-- .../nlp/zero_shot_classification_pipeline.py | 32 ++++++++- modelscope/preprocessors/nlp.py | 66 ++++++++++++++++++- modelscope/trainers/nlp_trainer.py | 28 ++++++++ modelscope/utils/hub.py | 18 +++++ 25 files changed, 385 insertions(+), 46 deletions(-) diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index 04b0ee81..83cb39ca 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union +from typing import Dict import numpy as np @@ -15,6 +15,8 @@ from .builder import METRICS, MetricKeys group_key=default_group, module_name=Metrics.seq_cls_metric) class SequenceClassificationMetric(Metric): """The metric computation class for sequence classification classes. + + This metric class calculates accuracy for the whole input batches. """ def __init__(self, *args, **kwargs): diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index 6390d33b..6bdcbc58 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -10,6 +10,8 @@ from .builder import METRICS, MetricKeys group_key=default_group, module_name=Metrics.text_gen_metric) class TextGenerationMetric(Metric): """The metric computation class for text generation classes. + + This metric class calculates F1 of the rouge scores for the whole evaluation dataset. """ def __init__(self): diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index 8606148e..53d13b6a 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -14,8 +14,10 @@ from .builder import METRICS, MetricKeys @METRICS.register_module( group_key=default_group, module_name=Metrics.token_cls_metric) class TokenClassificationMetric(Metric): - """ - The metric computation class for token-classification task. + """The metric computation class for token-classification task. + + This metric class uses seqeval to calculate the scores. + Args: return_entity_level_metrics (bool, *optional*): Whether to return every label's detail metrics, default False. diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py index cc062129..f47900c3 100644 --- a/modelscope/models/nlp/backbones/structbert.py +++ b/modelscope/models/nlp/backbones/structbert.py @@ -1,5 +1,3 @@ -from transformers import PreTrainedModel - from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import BACKBONES diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index ff16335f..17324be9 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -17,6 +17,10 @@ __all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM'] @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): + """Structbert for MLM model. + + Inherited from structbert.SbertForMaskedLM and TorchModel, so this class can be registered into Model sets. + """ def __init__(self, config, model_dir): super(TorchModel, self).__init__(model_dir) @@ -49,6 +53,10 @@ class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): @MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): + """Bert for MLM model. + + Inherited from transformers.BertForMaskedLM and TorchModel, so this class can be registered into Model sets. + """ def __init__(self, config, model_dir): super(TorchModel, self).__init__(model_dir) @@ -83,6 +91,10 @@ class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): @MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): + """Veco for MLM model. + + Inherited from veco.VecoForMaskedLM and TorchModel, so this class can be registered into Model sets. + """ def __init__(self, config, model_dir): super(TorchModel, self).__init__(model_dir) diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py index de6bef65..2015997f 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py @@ -16,6 +16,8 @@ __all__ = ['TransformerCRFForNamedEntityRecognition'] @MODELS.register_module( Tasks.named_entity_recognition, module_name=Models.tcrf) class TransformerCRFForNamedEntityRecognition(TorchModel): + """This model wraps the TransformerCRF model to register into model sets. + """ def __init__(self, model_dir, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) @@ -63,6 +65,10 @@ class TransformerCRFForNamedEntityRecognition(TorchModel): class TransformerCRF(nn.Module): + """A transformer based model to NER tasks. + + This model will use transformers' backbones as its backbone. + """ def __init__(self, model_dir, num_labels, **kwargs): super(TransformerCRF, self).__init__() diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index 5550d749..e8802dbd 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -18,6 +18,8 @@ __all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification'] class SequenceClassificationBase(TorchModel): + """A sequence classification base class for all the fitted sequence classification models. + """ base_model_prefix: str = 'bert' def __init__(self, config, model_dir): @@ -81,6 +83,10 @@ class SequenceClassificationBase(TorchModel): Tasks.zero_shot_classification, module_name=Models.structbert) class SbertForSequenceClassification(SequenceClassificationBase, SbertPreTrainedModel): + """Sbert sequence classification model. + + Inherited from SequenceClassificationBase. + """ base_model_prefix: str = 'bert' supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r'position_ids'] @@ -108,6 +114,16 @@ class SbertForSequenceClassification(SequenceClassificationBase, @classmethod def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') num_labels = kwargs.get('num_labels') if num_labels is None: @@ -129,6 +145,12 @@ class SbertForSequenceClassification(SequenceClassificationBase, @MODELS.register_module(Tasks.nli, module_name=Models.veco) class VecoForSequenceClassification(TorchModel, VecoForSequenceClassificationTransform): + """Veco sequence classification model. + + Inherited from VecoForSequenceClassification and TorchModel, so this class can be registered into the model set. + This model cannot be inherited from SequenceClassificationBase, because Veco/XlmRoberta's classification structure + is different. + """ def __init__(self, config, model_dir): super().__init__(model_dir) @@ -159,6 +181,16 @@ class VecoForSequenceClassification(TorchModel, @classmethod def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by veco.VecoForSequenceClassification.from_pretrained + """ + model_dir = kwargs.get('model_dir') num_labels = kwargs.get('num_labels') if num_labels is None: diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index ebb1eda2..59d7d0cf 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -19,6 +19,8 @@ __all__ = ['SbertForTokenClassification'] class TokenClassification(TorchModel): + """A token classification base class for all the fitted token classification models. + """ base_model_prefix: str = 'bert' @@ -56,7 +58,7 @@ class TokenClassification(TorchModel): labels: The labels **kwargs: Other input params. - Returns: Loss. + Returns: The loss. """ pass @@ -92,6 +94,10 @@ class TokenClassification(TorchModel): @MODELS.register_module( Tasks.token_classification, module_name=Models.structbert) class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): + """Sbert token classification model. + + Inherited from TokenClassification. + """ supports_gradient_checkpointing = True _keys_to_ignore_on_load_unexpected = [r'pooler'] @@ -118,6 +124,14 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): labels=labels) def compute_loss(self, logits, labels, attention_mask=None, **kwargs): + """Compute the loss with an attention mask. + + @param logits: The logits output from the classifier. + @param labels: The labels. + @param attention_mask: The attention_mask. + @param kwargs: Unused input args. + @return: The loss + """ loss_fct = nn.CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: @@ -132,6 +146,15 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): @classmethod def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ model_dir = kwargs.get('model_dir') num_labels = kwargs.get('num_labels') if num_labels is None: diff --git a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py index a425f21a..2c498a98 100644 --- a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py @@ -23,11 +23,12 @@ class DialogIntentPredictionPipeline(Pipeline): model: Union[SpaceForDialogIntent, str], preprocessor: DialogIntentPredictionPreprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a dialog intent prediction pipeline + """Use `model` and `preprocessor` to create a dialog intent prediction pipeline Args: - model (SpaceForDialogIntent): a model instance - preprocessor (DialogIntentPredictionPreprocessor): a preprocessor instance + model (str or SpaceForDialogIntent): Supply either a local model dir or a model id from the model hub, + or a SpaceForDialogIntent instance. + preprocessor (DialogIntentPredictionPreprocessor): An optional preprocessor instance. """ model = model if isinstance( model, SpaceForDialogIntent) else Model.from_pretrained(model) diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 2f5b64e1..2ab012fe 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -24,8 +24,9 @@ class DialogStateTrackingPipeline(Pipeline): observation of dialog states tracking after many turns of open domain dialogue Args: - model (SpaceForDialogStateTracking): a model instance - preprocessor (DialogStateTrackingPreprocessor): a preprocessor instance + model (str or SpaceForDialogStateTracking): Supply either a local model dir or a model id + from the model hub, or a SpaceForDialogStateTracking instance. + preprocessor (DialogStateTrackingPreprocessor): An optional preprocessor instance. """ model = model if isinstance( diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 644db597..60a9631b 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -24,11 +24,29 @@ class FillMaskPipeline(Pipeline): preprocessor: Optional[Preprocessor] = None, first_sequence='sentence', **kwargs): - """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction + """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction Args: - model (Model): a model instance - preprocessor (Preprocessor): a preprocessor instance + model (str or Model): Supply either a local model dir which supported mlm task, or a + mlm model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the sentence in. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_structbert_fill-mask_english-large') + >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' + >>> print(pipeline_ins(input)) + + NOTE2: Please pay attention to the model's special tokens. + If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. + If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. + To view other examples plese check the tests/pipelines/test_fill_mask.py. """ fill_mask_model = model if isinstance( model, Model) else Model.from_pretrained(model) diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 4ea2f45d..b0b06c88 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -22,6 +22,24 @@ class NamedEntityRecognitionPipeline(Pipeline): model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, **kwargs): + """Use `model` and `preprocessor` to create a nlp NER pipeline for prediction + + Args: + model (str or Model): Supply either a local model dir which supported NER task, or a + model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='named-entity-recognition', + >>> model='damo/nlp_raner_named-entity-recognition_chinese-base-news') + >>> input = '这与温岭市新河镇的一个神秘的传说有关。' + >>> print(pipeline_ins(input)) + + To view other examples plese check the tests/pipelines/test_named_entity_recognition.py. + """ model = model if isinstance(model, Model) else Model.from_pretrained(model) diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py index d0329da8..5248db8c 100644 --- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py @@ -23,11 +23,30 @@ class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase): first_sequence='first_sequence', second_sequence='second_sequence', **kwargs): - """use `model` and `preprocessor` to create a nlp pair sentence classification pipeline for prediction + """Use `model` and `preprocessor` to create a nlp pair sequence classification pipeline for prediction. Args: - model (Model): a model instance - preprocessor (Preprocessor): a preprocessor instance + model (str or Model): Supply either a local model dir which supported the sequence classification task, + or a model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the first sentence in. + second_sequence: The key to read the second sentence in. + sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. + + NOTE: Inputs of type 'tuple' or 'list' are also supported. In this scenario, the 'first_sequence' and + 'second_sequence' param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='nli', model='damo/nlp_structbert_nli_chinese-base') + >>> sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' + >>> sentence2 = '四川商务职业学院商务管理在哪个校区?' + >>> print(pipeline_ins((sentence1, sentence2))) + >>> # Or use the dict input: + >>> print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2})) + + To view other examples plese check the tests/pipelines/test_nli.py. """ if preprocessor is None: preprocessor = PairSentenceClassificationPreprocessor( diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py index ad31bfbd..25d68993 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py @@ -13,11 +13,11 @@ class SequenceClassificationPipelineBase(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Preprocessor, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """This is the base class for all the sequence classification sub-tasks. Args: - model (str or Model): a model instance - preprocessor (Preprocessor): a preprocessor instance + model (str or Model): A model instance or a model local dir or a model id in the model hub. + preprocessor (Preprocessor): a preprocessor instance, must not be None. """ assert isinstance(model, str) or isinstance(model, Model), \ 'model must be a single str or Model' diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py index cc91ddf2..844c6839 100644 --- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py @@ -22,11 +22,29 @@ class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase): preprocessor: Preprocessor = None, first_sequence='first_sequence', **kwargs): - """use `model` and `preprocessor` to create a nlp single sentence classification pipeline for prediction + """Use `model` and `preprocessor` to create a nlp single sequence classification pipeline for prediction. Args: - model (Model): a model instance - preprocessor (Preprocessor): a preprocessor instance + model (str or Model): Supply either a local model dir which supported the sequence classification task, + or a model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the first sentence in. + sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='sentiment-classification', + >>> model='damo/nlp_structbert_sentiment-classification_chinese-base') + >>> sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' + >>> print(pipeline_ins(sentence1)) + >>> # Or use the dict input: + >>> print(pipeline_ins({'first_sequence': sentence1})) + + To view other examples plese check the tests/pipelines/test_sentiment-classification.py. """ if preprocessor is None: preprocessor = SingleSentenceClassificationPreprocessor( diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py index 7c163f04..7a91eff1 100644 --- a/modelscope/pipelines/nlp/summarization_pipeline.py +++ b/modelscope/pipelines/nlp/summarization_pipeline.py @@ -20,10 +20,12 @@ class SummarizationPipeline(Pipeline): model: Union[Model, str], preprocessor: [Preprocessor] = None, **kwargs): - """ - use `model` and `preprocessor` to create a kws pipeline for prediction + """Use `model` and `preprocessor` to create a Summarization pipeline for prediction. + Args: - model: model id on modelscope hub. + model (str or Model): Supply either a local model dir which supported the summarization task, + or a model id from the model hub, or a model instance. + preprocessor (Preprocessor): An optional preprocessor instance. """ super().__init__(model=model) assert isinstance(model, str) or isinstance(model, Model), \ diff --git a/modelscope/pipelines/nlp/task_oriented_conversation_pipeline.py b/modelscope/pipelines/nlp/task_oriented_conversation_pipeline.py index e946596f..49e09422 100644 --- a/modelscope/pipelines/nlp/task_oriented_conversation_pipeline.py +++ b/modelscope/pipelines/nlp/task_oriented_conversation_pipeline.py @@ -23,11 +23,12 @@ class TaskOrientedConversationPipeline(Pipeline): model: Union[SpaceForDialogModeling, str], preprocessor: DialogModelingPreprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a dialog modeling pipeline for dialog response generation + """Use `model` and `preprocessor` to create a dialog modeling pipeline for dialog response generation Args: - model (SpaceForDialogModeling): a model instance - preprocessor (DialogModelingPreprocessor): a preprocessor instance + model (str or SpaceForDialogModeling): Supply either a local model dir or a model id from the model hub, + or a SpaceForDialogModeling instance. + preprocessor (DialogModelingPreprocessor): An optional preprocessor instance. """ model = model if isinstance( model, SpaceForDialogModeling) else Model.from_pretrained(model) diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index 44fae08f..b63d8d36 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -23,12 +23,22 @@ class TextErrorCorrectionPipeline(Pipeline): model: Union[BartForTextErrorCorrection, str], preprocessor: Optional[TextErrorCorrectionPreprocessor] = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction + """use `model` and `preprocessor` to create a nlp text correction pipeline. Args: - model (BartForTextErrorCorrection): a model instance - preprocessor (TextErrorCorrectionPreprocessor): a preprocessor instance + model (BartForTextErrorCorrection): A model instance, or a model local dir, or a model id in the model hub. + preprocessor (TextErrorCorrectionPreprocessor): An optional preprocessor instance. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline( + >>> task='text-error-correction', model='damo/nlp_bart_text-error-correction_chinese') + >>> sentence1 = '随着中国经济突飞猛近,建造工业与日俱增' + >>> print(pipeline_ins(sentence1)) + + To view other examples plese check the tests/pipelines/test_text_error_correction.py. """ + model = model if isinstance( model, BartForTextErrorCorrection) else Model.from_pretrained(model) diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 8e9b36d5..3d27ffa9 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -19,19 +19,39 @@ class TextGenerationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[TextGenerationPreprocessor] = None, + first_sequence='sentence', **kwargs): - """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction + """Use `model` and `preprocessor` to create a generation pipeline for prediction. Args: - model (PalmForTextGeneration): a model instance - preprocessor (TextGenerationPreprocessor): a preprocessor instance + model (str or Model): Supply either a local model dir which supported the text generation task, + or a model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the first sentence in. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='text-generation', + >>> model='damo/nlp_palm2.0_text-generation_chinese-base') + >>> sentence1 = '本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:' + >>> '1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代' + >>> print(pipeline_ins(sentence1)) + >>> # Or use the dict input: + >>> print(pipeline_ins({'sentence': sentence1})) + + To view other examples plese check the tests/pipelines/test_text_generation.py. """ model = model if isinstance(model, Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TextGenerationPreprocessor( model.model_dir, - first_sequence='sentence', + first_sequence=first_sequence, second_sequence=None, sequence_length=kwargs.pop('sequence_length', 128)) model.eval() diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index ebd51f02..909e3c6c 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -5,6 +5,7 @@ import numpy as np import tensorflow as tf from modelscope.metainfo import Pipelines +from modelscope.models.base import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES @@ -25,7 +26,11 @@ __all__ = ['TranslationPipeline'] Tasks.translation, module_name=Pipelines.csanmt_translation) class TranslationPipeline(Pipeline): - def __init__(self, model: str, **kwargs): + def __init__(self, model: Model, **kwargs): + """Build a translation pipeline with a model dir or a model id in the model hub. + + @param model: A Model instance. + """ super().__init__(model=model) model = self.model.model_dir tf.reset_default_graph() diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 5d80ea22..66a5c524 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -5,7 +5,7 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, TokenClassificationPreprocessor) @@ -22,11 +22,26 @@ class WordSegmentationPipeline(Pipeline): model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, **kwargs): - """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction + """Use `model` and `preprocessor` to create a nlp word segment pipeline for prediction. Args: - model (Model): a model instance - preprocessor (Preprocessor): a preprocessor instance + model (str or Model): Supply either a local model dir which supported the WS task, + or a model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + + NOTE: The preprocessor will first split the sentence into single characters, + then feed them into the tokenizer with the parameter is_split_into_words=True. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='word-segmentation', + >>> model='damo/nlp_structbert_word-segmentation_chinese-base') + >>> sentence1 = '今天天气不错,适合出去游玩' + >>> print(pipeline_ins(sentence1)) + + To view other examples plese check the tests/pipelines/test_word_segmentation.py. """ model = model if isinstance(model, Model) else Model.from_pretrained(model) diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 56ef0da0..e39cb0e1 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -24,10 +24,36 @@ class ZeroShotClassificationPipeline(Pipeline): model: Union[Model, str], preprocessor: Preprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a nlp zero-shot text classification pipeline for prediction + """Use `model` and `preprocessor` to create a nlp zero shot classifiction for prediction. + + A zero-shot classification task is used to classify texts by prompts. + In a normal classification task, model may produce a positive label by the input text + like 'The ice cream is made of the high quality milk, it is so delicious' + In a zero-shot task, the sentence is converted to: + ['The ice cream is made of the high quality milk, it is so delicious', 'This means it is good'] + And: + ['The ice cream is made of the high quality milk, it is so delicious', 'This means it is bad'] + Then feed these sentences into the model and turn the task to a NLI task(entailment, contradiction), + and compare the output logits to give the original classification label. + + Args: - model (Model): a model instance - preprocessor (Preprocessor): a preprocessor instance + model (str or Model): Supply either a local model dir which supported the task, + or a model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='zero-shot-classification', + >>> model='damo/nlp_structbert_zero-shot-classification_chinese-base') + >>> sentence1 = '全新突破 解放军运20版空中加油机曝光' + >>> labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事'] + >>> template = '这篇文章的标题是{}' + >>> print(pipeline_ins(sentence1, candidate_labels=labels, hypothesis_template=template)) + + To view other examples plese check the tests/pipelines/test_zero_shot_classification.py. """ assert isinstance(model, str) or isinstance(model, Model), \ 'model must be a single str or Model' diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 41f78c4a..f231df9a 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -107,10 +107,20 @@ class SequenceClassificationPreprocessor(Preprocessor): class NLPTokenizerPreprocessorBase(Preprocessor): def __init__(self, model_dir: str, pair: bool, mode: str, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """The NLP tokenizer preprocessor base class. + + Any nlp preprocessor which uses the hf tokenizer can inherit from this class. Args: - model_dir (str): model path + model_dir (str): The local model path + first_sequence: The key for the first sequence + second_sequence: The key for the second sequence + label: The label key + label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping + if this mapping is not supplied. + pair (bool): Pair sentence input or single sentence input. + mode: Run this preprocessor in either 'train'/'eval'/'inference' mode + kwargs: These kwargs will be directly fed into the tokenizer. """ super().__init__(**kwargs) @@ -132,11 +142,24 @@ class NLPTokenizerPreprocessorBase(Preprocessor): @property def id2label(self): + """Return the id2label mapping according to the label2id mapping. + + @return: The id2label mapping if exists. + """ if self.label2id is not None: return {id: label for label, id in self.label2id.items()} return None def build_tokenizer(self, model_dir): + """Build a tokenizer by the model type. + + NOTE: This default implementation only returns slow tokenizer, because the fast tokenizers have a + multi-thread problem. + + @param model_dir: The local model dir. + @return: The initialized tokenizer. + """ + model_type = get_model_type(model_dir) if model_type in (Models.structbert, Models.gpt3, Models.palm): from modelscope.models.nlp.structbert import SbertTokenizer @@ -172,6 +195,15 @@ class NLPTokenizerPreprocessorBase(Preprocessor): return output def parse_text_and_label(self, data): + """Parse the input and return the sentences and labels. + + When input type is tuple or list and its size is 2: + If the pair param is False, data will be parsed as the first_sentence and the label, + else it will be parsed as the first_sentence and the second_sentence. + + @param data: The input data. + @return: The sentences and labels tuple. + """ text_a, text_b, labels = None, None, None if isinstance(data, str): text_a = data @@ -191,6 +223,16 @@ class NLPTokenizerPreprocessorBase(Preprocessor): return text_a, text_b, labels def labels_to_id(self, labels, output): + """Turn the labels to id with the type int or float. + + If the original label's type is str or int, the label2id mapping will try to convert it to the final label. + If the original label's type is float, or the label2id mapping does not exist, + the original label will be returned. + + @param labels: The input labels. + @param output: The label id. + @return: The final labels. + """ def label_can_be_mapped(label): return isinstance(label, str) or isinstance(label, int) @@ -212,6 +254,8 @@ class NLPTokenizerPreprocessorBase(Preprocessor): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in pair sentence classification. + """ def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): kwargs['truncation'] = kwargs.get('truncation', True) @@ -224,6 +268,8 @@ class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in single sentence classification. + """ def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): kwargs['truncation'] = kwargs.get('truncation', True) @@ -236,6 +282,8 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in zero shot classification. + """ def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): """preprocess the data via the vocab.txt from the `model_dir` path @@ -277,6 +325,8 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. + """ def __init__(self, model_dir: str, @@ -325,6 +375,8 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): @PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in MLM task. + """ def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): kwargs['truncation'] = kwargs.get('truncation', True) @@ -339,6 +391,8 @@ class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): + """The preprocessor used to turn a single sentence to a labeled token-classification dict. + """ def __init__(self, **kwargs): super().__init__(**kwargs) @@ -373,6 +427,8 @@ class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in normal token classification task. + """ def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): kwargs['truncation'] = kwargs.get('truncation', True) @@ -455,6 +511,10 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.ner_tokenizer) class NERPreprocessor(Preprocessor): + """The tokenizer preprocessor used in normal NER task. + + NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. + """ def __init__(self, model_dir: str, *args, **kwargs): """preprocess the data via the vocab.txt from the `model_dir` path @@ -544,6 +604,8 @@ class NERPreprocessor(Preprocessor): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_error_correction) class TextErrorCorrectionPreprocessor(Preprocessor): + """The preprocessor used in text correction task. + """ def __init__(self, model_dir: str, *args, **kwargs): from fairseq.data import Dictionary diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 527f4087..322070a1 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -38,8 +38,36 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): **kwargs): """Add code to adapt with nlp models. + This trainer will accept the information of labels&text keys in the cfg, and then initialize + the nlp models/preprocessors with this information. + + Labels&text key information may be carried in the cfg like this: + + >>> cfg = { + >>> ... + >>> "dataset": { + >>> "train": { + >>> "first_sequence": "text1", + >>> "second_sequence": "text2", + >>> "label": "label", + >>> "labels": [1, 2, 3, 4] + >>> } + >>> } + >>> } + + Args: cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. + + Example: + >>> def cfg_modify_fn(cfg): + >>> cfg.preprocessor.first_sequence= 'text1' + >>> cfg.preprocessor.second_sequence='text2' + >>> return cfg + + To view some actual finetune examples, please check the test files listed below: + tests/trainers/test_finetune_sequence_classification.py + tests/trainers/test_finetune_token_classification.py """ if isinstance(model, str): diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 6e5326f4..6d685b87 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -74,6 +74,14 @@ def auto_load(model: Union[str, List[str]]): def get_model_type(model_dir): + """Get the model type from the configuration. + + This method will try to get the 'model.type' or 'model.model_type' field from the configuration.json file. + If this file does not exist, the method will try to get the 'model_type' field from the config.json. + + @param model_dir: The local model dir to use. + @return: The model type string, returns None if nothing is found. + """ try: configuration_file = osp.join(model_dir, ModelFile.CONFIGURATION) config_file = osp.join(model_dir, 'config.json') @@ -89,6 +97,16 @@ def get_model_type(model_dir): def parse_label_mapping(model_dir): + """Get the label mapping from the model dir. + + This method will do: + 1. Try to read label-id mapping from the label_mapping.json + 2. Try to read label-id mapping from the configuration.json + 3. Try to read label-id mapping from the config.json + + @param model_dir: The local model dir to use. + @return: The label2id mapping if found. + """ import json import os label2id = None