diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index a85d787f..ed753e50 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030 -size 63349 +oid sha256:3b38bfb5a851d35d5fba4d59eda926557666dbd62c70e3e3b24c22605e7d9c4a +size 40771 diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py index 7cee331b..7a11f73a 100644 --- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py +++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py @@ -7,7 +7,8 @@ from torch.utils.data.dataloader import default_collate from modelscope.exporters.builder import EXPORTERS from modelscope.exporters.torch_model_exporter import TorchModelExporter from modelscope.metainfo import Models -from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.preprocessors import ( + TextClassificationTransformersPreprocessor, build_preprocessor) from modelscope.utils.config import Config from modelscope.utils.constant import ModeKeys, Tasks @@ -59,12 +60,13 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): 'mode': ModeKeys.TRAIN, **sequence_length }) - preprocessor: Preprocessor = build_preprocessor(cfg, field_name) + preprocessor: TextClassificationTransformersPreprocessor = build_preprocessor( + cfg, field_name) if pair: - first_sequence = preprocessor.tokenizer.unk_token - second_sequence = preprocessor.tokenizer.unk_token + first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token + second_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token else: - first_sequence = preprocessor.tokenizer.unk_token + first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token second_sequence = None batched = [] diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index 1fe1c329..dc11c3d8 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -19,18 +19,27 @@ from .builder import METRICS, MetricKeys class SequenceClassificationMetric(Metric): """The metric computation class for sequence classification tasks. - This metric class calculates accuracy of the whole input batches. + This metric class calculates accuracy/F1 of all the input batches. + + Args: + label_name: The key of label column in the 'inputs' arg. + logit_name: The key of logits column in the 'inputs' arg. 
""" - def __init__(self, *args, **kwargs): + def __init__(self, + label_name=OutputKeys.LABELS, + logit_name=OutputKeys.LOGITS, + *args, + **kwargs): super().__init__(*args, **kwargs) self.preds = [] self.labels = [] + self.label_name = label_name + self.logit_name = logit_name def add(self, outputs: Dict, inputs: Dict): - label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS - ground_truths = inputs[label_name] - eval_results = outputs[OutputKeys.LOGITS] + ground_truths = inputs[self.label_name] + eval_results = outputs[self.logit_name] self.preds.append( torch_nested_numpify(torch_nested_detach(eval_results))) self.labels.append( diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index 08df5235..3d6e6964 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -18,16 +18,22 @@ class TextGenerationMetric(Metric): """The metric computation class for text generation classes. This metric class calculates F1 of the rouge scores for the whole evaluation dataset. + + Args: + target_text: The key of the target text column in the `inputs` arg. + pred_text: The key of the predicted text column in the `outputs` arg. """ - def __init__(self): + def __init__(self, target_text='tgts', pred_text='preds'): self.preds: List[str] = [] self.tgts: List[str] = [] self.rouge = Rouge() + self.target_text = target_text + self.pred_text = pred_text def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]): - ground_truths = inputs['tgts'] - eval_results = outputs['preds'] + ground_truths = inputs[self.target_text] + eval_results = outputs[self.pred_text] for truth in ground_truths: self.tgts.append(rebuild_chinese_str(truth)) for result in eval_results: diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index f8595fc1..5d1ece4a 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -21,20 +21,16 @@ class TokenClassificationMetric(Metric): This metric class uses seqeval to calculate the scores. Args: - return_entity_level_metrics (bool, *optional*): + label_name(str, `optional`): The key of label column in the 'inputs' arg. + logit_name(str, `optional`): The key of logits column in the 'inputs' arg. + return_entity_level_metrics (bool, `optional`): Whether to return every label's detail metrics, default False. + label2id(dict, `optional`): The label2id information to get the token labels. 
""" - def add(self, outputs: Dict, inputs: Dict): - label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS - ground_truths = inputs[label_name] - eval_results = outputs[OutputKeys.LOGITS] - self.preds.append( - torch_nested_numpify(torch_nested_detach(eval_results))) - self.labels.append( - torch_nested_numpify(torch_nested_detach(ground_truths))) - def __init__(self, + label_name=OutputKeys.LABELS, + logit_name=OutputKeys.LOGITS, return_entity_level_metrics=False, label2id=None, *args, @@ -44,6 +40,16 @@ class TokenClassificationMetric(Metric): self.preds = [] self.labels = [] self.label2id = label2id + self.label_name = label_name + self.logit_name = logit_name + + def add(self, outputs: Dict, inputs: Dict): + ground_truths = inputs[self.label_name] + eval_results = outputs[self.logit_name] + self.preds.append( + torch_nested_numpify(torch_nested_detach(eval_results))) + self.labels.append( + torch_nested_numpify(torch_nested_detach(ground_truths))) def evaluate(self): label2id = self.label2id diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 1f464bf3..94757641 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -6,7 +6,8 @@ from typing import Any, Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model -from modelscope.utils.checkpoint import save_checkpoint, save_pretrained +from modelscope.utils.checkpoint import (save_checkpoint, save_configuration, + save_pretrained) from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile from modelscope.utils.device import verify_device @@ -129,11 +130,9 @@ class Model(ABC): model_cfg[k] = v if device is not None: model_cfg.device = device - model = build_model( - model_cfg, task_name=task_name, default_args=kwargs) + model = build_model(model_cfg, task_name=task_name) else: - model = build_model( - model_cfg, task_name=task_name, default_args=kwargs) + model = build_model(model_cfg, task_name=task_name) # dynamically add pipeline info to model for pipeline inference if hasattr(cfg, 'pipeline'): @@ -142,6 +141,7 @@ class Model(ABC): if not hasattr(model, 'cfg'): model.cfg = cfg + model_cfg.pop('model_dir', None) model.name = model_name_or_path model.model_dir = local_model_dir return model @@ -151,6 +151,7 @@ class Model(ABC): save_checkpoint_names: Union[str, List[str]] = None, save_function: Callable = save_checkpoint, config: Optional[dict] = None, + save_config_function: Callable = save_configuration, **kwargs): """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded @@ -168,18 +169,15 @@ class Model(ABC): config (Optional[dict], optional): The config for the configuration.json, might not be identical with model.config + save_config_function (Callble, optional): + The function to use to save the configuration. + """ if config is None and hasattr(self, 'cfg'): config = self.cfg - assert config is not None, 'Cannot save the model because the model config is empty.' 
- if isinstance(config, Config): - config = config.to_dict() - if 'preprocessor' in config and config['preprocessor'] is not None: - if 'mode' in config['preprocessor']: - config['preprocessor']['mode'] = 'inference' - elif 'val' in config['preprocessor'] and 'mode' in config[ - 'preprocessor']['val']: - config['preprocessor']['val']['mode'] = 'inference' + + if config is not None: + save_config_function(target_folder, config) save_pretrained(self, target_folder, save_checkpoint_names, - save_function, config, **kwargs) + save_function, **kwargs) diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index 3c99a1f2..ff059f7b 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -6,6 +6,7 @@ import torch from torch import nn from modelscope.utils.file_utils import func_receive_dict_inputs +from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger from .base_model import Model diff --git a/modelscope/models/nlp/T5/backbone.py b/modelscope/models/nlp/T5/backbone.py index 9a46d980..e8abfbae 100644 --- a/modelscope/models/nlp/T5/backbone.py +++ b/modelscope/models/nlp/T5/backbone.py @@ -36,9 +36,7 @@ from transformers.utils.model_parallel_utils import (assert_device_map, from modelscope.metainfo import Models from modelscope.models.base import Model, Tensor, TorchModel from modelscope.models.builder import MODELS -from modelscope.outputs import (BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqModelOutput) +from modelscope.outputs import AttentionBackboneModelOutput, Seq2SeqModelOutput from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from .configuration import T5Config @@ -1182,7 +1180,7 @@ class T5Stack(T5PreTrainedModel): all_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=present_key_value_states, hidden_states=all_hidden_states, @@ -1475,8 +1473,9 @@ class T5Model(T5PreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( + elif return_dict and not isinstance(encoder_outputs, + AttentionBackboneModelOutput): + encoder_outputs = AttentionBackboneModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, diff --git a/modelscope/models/nlp/T5/text2text_generation.py b/modelscope/models/nlp/T5/text2text_generation.py index c4dcdfdb..0275ecb9 100644 --- a/modelscope/models/nlp/T5/text2text_generation.py +++ b/modelscope/models/nlp/T5/text2text_generation.py @@ -24,7 +24,8 @@ from transformers.utils.model_parallel_utils import (assert_device_map, from modelscope.metainfo import Models from modelscope.models.builder import MODELS -from modelscope.outputs import BaseModelOutput, Seq2SeqLMOutput +from modelscope.outputs import (AttentionBackboneModelOutput, Seq2SeqLMOutput, + TokenGeneratorOutput) from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from .backbone import T5PreTrainedModel, T5Stack @@ -311,8 +312,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - 
encoder_outputs = BaseModelOutput( + elif return_dict and not isinstance(encoder_outputs, + AttentionBackboneModelOutput): + encoder_outputs = AttentionBackboneModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, @@ -426,6 +428,16 @@ class T5ForConditionalGeneration(T5PreTrainedModel): def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) + def generate( + self, + *args, + **kwargs, + ): + output = super().generate(*args, **kwargs) + return TokenGeneratorOutput( + sequences=output if isinstance(output, torch.Tensor) else output[0] + ) + def _reorder_cache(self, past, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index ef2dc424..26205bcb 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -30,9 +30,7 @@ if TYPE_CHECKING: SbertForMaskedLM, SbertForSequenceClassification, SbertForTokenClassification, - SbertTokenizer, SbertModel, - SbertTokenizerFast, ) from .T5 import T5ForConditionalGeneration from .mglm import MGLMForTextSummarization @@ -51,8 +49,7 @@ if TYPE_CHECKING: ) from .veco import (VecoConfig, VecoForMaskedLM, VecoForSequenceClassification, - VecoForTokenClassification, VecoModel, VecoTokenizer, - VecoTokenizerFast) + VecoForTokenClassification, VecoModel) from .bloom import BloomModel else: _import_structure = { @@ -66,8 +63,6 @@ else: 'SbertForMaskedLM', 'SbertForSequenceClassification', 'SbertForTokenClassification', - 'SbertTokenizer', - 'SbertTokenizerFast', 'SbertModel', ], 'veco': [ @@ -76,8 +71,6 @@ else: 'VecoForSequenceClassification', 'VecoForTokenClassification', 'VecoModel', - 'VecoTokenizer', - 'VecoTokenizerFast', ], 'bert': [ 'BertForMaskedLM', diff --git a/modelscope/models/nlp/bart/text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py index 27abedb5..ab765190 100644 --- a/modelscope/models/nlp/bart/text_error_correction.py +++ b/modelscope/models/nlp/bart/text_error_correction.py @@ -7,6 +7,7 @@ import torch.cuda from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.outputs import TextErrorCorrectionOutput from modelscope.utils.constant import ModelFile, Tasks __all__ = ['BartForTextErrorCorrection'] @@ -55,7 +56,7 @@ class BartForTextErrorCorrection(TorchModel): self.task = task - def forward(self, input: Dict[str, Dict]) -> Dict[str, Any]: + def forward(self, input: Dict[str, Dict]) -> TextErrorCorrectionOutput: """return the result by the model Args: @@ -91,4 +92,4 @@ class BartForTextErrorCorrection(TorchModel): # get 1-best List[Tensor] preds = translations[0][0]['tokens'] - return {'predictions': preds} + return TextErrorCorrectionOutput(predictions=preds) diff --git a/modelscope/models/nlp/bert/backbone.py b/modelscope/models/nlp/bert/backbone.py index df0aebd2..bd432509 100755 --- a/modelscope/models/nlp/bert/backbone.py +++ b/modelscope/models/nlp/bert/backbone.py @@ -16,9 +16,6 @@ """PyTorch BERT model. 
""" import math -import os -from dataclasses import dataclass -from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -33,11 +30,10 @@ from transformers.modeling_utils import (PreTrainedModel, from modelscope.metainfo import Models from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS -from modelscope.outputs import (BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions) +from modelscope.outputs import AttentionBackboneModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import BertConfig logger = get_logger(__name__) @@ -562,7 +558,7 @@ class BertEncoder(nn.Module): all_self_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, @@ -639,30 +635,15 @@ class BertPreTrainedModel(TorchModel, PreTrainedModel): The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ - model_dir = kwargs.get('model_dir', None) + model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) if model_dir is None: - config = BertConfig(**kwargs) + config = BertConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + pretrained_model_name_or_path=model_dir, **model_args) model.model_dir = model_dir return model @@ -750,7 +731,7 @@ class BertModel(BertPreTrainedModel): output_attentions=None, output_hidden_states=None, return_dict=None, - **kwargs): + **kwargs) -> AttentionBackboneModelOutput: r""" Args: input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`): @@ -936,7 +917,7 @@ class BertModel(BertPreTrainedModel): if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, diff --git a/modelscope/models/nlp/bert/document_segmentation.py b/modelscope/models/nlp/bert/document_segmentation.py index ca27a166..36c39f43 100644 --- a/modelscope/models/nlp/bert/document_segmentation.py +++ b/modelscope/models/nlp/bert/document_segmentation.py @@ -5,37 +5,22 @@ from typing import Any, Dict import torch from torch import nn from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import TokenClassifierOutput -from transformers.models.bert.modeling_bert import (BertModel, - BertPreTrainedModel) from 
modelscope.metainfo import Models -from modelscope.models.base import Model +from modelscope.models import Model from modelscope.models.builder import MODELS +from modelscope.models.nlp.ponet import PoNetConfig +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel +from .configuration import BertConfig __all__ = ['BertForDocumentSegmentation'] @MODELS.register_module( Tasks.document_segmentation, module_name=Models.bert_for_ds) -class BertForDocumentSegmentation(Model): - - def __init__(self, model_dir: str, model_config: Dict[str, Any], *args, - **kwargs): - super().__init__(model_dir, model_config, *args, **kwargs) - self.model_cfg = model_config - - def build_with_config(self, config): - self.bert_model = BertForDocumentSegmentationBase.from_pretrained( - self.model_dir, from_tf=False, config=config) - return self.bert_model - - def forward(self) -> Dict[str, Any]: - return self.model_cfg - - -class BertForDocumentSegmentationBase(BertPreTrainedModel): +class BertForDocumentSegmentation(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r'pooler'] @@ -103,9 +88,25 @@ class BertForDocumentSegmentationBase(BertPreTrainedModel): output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + @classmethod + def _instantiate(cls, model_dir, model_config: Dict[str, Any], **kwargs): + if model_config['type'] == 'bert': + config = BertConfig.from_pretrained(model_dir, num_labels=2) + elif model_config['type'] == 'ponet': + config = PoNetConfig.from_pretrained(model_dir, num_labels=2) + else: + raise ValueError( + f'Expected config type bert and ponet, which is : {model_config["type"]}' + ) + model = super(Model, cls).from_pretrained( + model_dir, from_tf=False, config=config) + model.model_dir = model_dir + model.model_cfg = model_config + return model diff --git a/modelscope/models/nlp/bert/fill_mask.py b/modelscope/models/nlp/bert/fill_mask.py index 4f81f62d..1f44365c 100644 --- a/modelscope/models/nlp/bert/fill_mask.py +++ b/modelscope/models/nlp/bert/fill_mask.py @@ -121,7 +121,7 @@ class BertForMaskedLM(BertPreTrainedModel): Preprocessor: This is the fill_mask model of Structbert, the preprocessor of this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. Parameters: config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with diff --git a/modelscope/models/nlp/bert/text_classification.py b/modelscope/models/nlp/bert/text_classification.py index b1d18d0f..ff4a2418 100644 --- a/modelscope/models/nlp/bert/text_classification.py +++ b/modelscope/models/nlp/bert/text_classification.py @@ -51,7 +51,7 @@ class BertForSequenceClassification(BertPreTrainedModel): Preprocessor: This is the fill_mask model of Bert, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`. 
Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, diff --git a/modelscope/models/nlp/bert/token_classification.py b/modelscope/models/nlp/bert/token_classification.py index 5dc6b0ce..15ea3231 100644 --- a/modelscope/models/nlp/bert/token_classification.py +++ b/modelscope/models/nlp/bert/token_classification.py @@ -22,7 +22,7 @@ from torch.nn import CrossEntropyLoss from modelscope.metainfo import Models from modelscope.models.builder import MODELS -from modelscope.outputs import TokenClassifierOutput +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils import logger as logging from modelscope.utils.constant import Tasks from .backbone import BertModel, BertPreTrainedModel @@ -47,7 +47,7 @@ class BertForTokenClassification(BertPreTrainedModel): Preprocessor: This is the fill_mask model of Bert, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TokenClassificationTransformersPreprocessor`. Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, @@ -169,7 +169,7 @@ class BertForTokenClassification(BertPreTrainedModel): - 0 for tokens that are **masked**. Returns: - Returns `modelscope.outputs.TokenClassifierOutput` + Returns `modelscope.outputs.AttentionTokenClassificationModelOutput` Examples: >>> from modelscope.models import Model @@ -212,14 +212,25 @@ class BertForTokenClassification(BertPreTrainedModel): loss = loss_fct( logits.view(-1, self.num_labels), labels.view(-1)) + if label_mask is not None: + mask = label_mask + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits + if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, offset_mapping=offset_mapping, + label_mask=label_mask, ) diff --git a/modelscope/models/nlp/deberta_v2/backbone.py b/modelscope/models/nlp/deberta_v2/backbone.py index cca38133..0daa8c7d 100644 --- a/modelscope/models/nlp/deberta_v2/backbone.py +++ b/modelscope/models/nlp/deberta_v2/backbone.py @@ -22,7 +22,6 @@ import torch.utils.checkpoint from torch import nn from torch.nn import LayerNorm from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import PreTrainedModel from transformers.pytorch_utils import softmax_backward_data @@ -574,7 +573,7 @@ class DebertaV2Encoder(nn.Module): return tuple( v for v in [output_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( + return AttentionBackboneModelOutput( last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions) diff --git a/modelscope/models/nlp/deberta_v2/fill_mask.py b/modelscope/models/nlp/deberta_v2/fill_mask.py index ed127d4c..e8adf1b5 100644 --- a/modelscope/models/nlp/deberta_v2/fill_mask.py +++ b/modelscope/models/nlp/deberta_v2/fill_mask.py @@ -44,7 +44,7 @@ class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): Preprocessor: This is the fill_mask model of Deberta_v2, the preprocessor of 
this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. Parameters: config (`DebertaV2Config`): Model configuration class with all the parameters of the model. diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py index 45ab6621..c3fef28a 100644 --- a/modelscope/models/nlp/palm_v2/__init__.py +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -18,18 +18,16 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .configuration import PalmConfig - from .backbone import ( + from .text_generation import ( AbsSummarizer, - PalmForConditionalGeneration, + PalmForTextGeneration, Translator, ) - from .text_generation import PalmForTextGeneration else: _import_structure = { 'configuration': ['PalmConfig'], - 'backbone': - ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], - 'text_generation': ['PalmForTextGeneration'], + 'text_generation': + ['AbsSummarizer', 'Translator', 'PalmForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/palm_v2/backbone.py b/modelscope/models/nlp/palm_v2/backbone.py deleted file mode 100644 index afee2e3f..00000000 --- a/modelscope/models/nlp/palm_v2/backbone.py +++ /dev/null @@ -1,1327 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import codecs -import copy -import math -import os -import subprocess -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union - -import addict -import json -import numpy as np -import torch -import torch.nn.functional as F -from torch import Tensor, nn -from torch.nn.init import xavier_uniform_ -from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig, - RobertaModel, RobertaTokenizer) -from transformers.activations import ACT2FN -from transformers.modeling_utils import PreTrainedModel - -from modelscope.utils import logger as logging -from .configuration import PalmConfig -from .dureader_eval import compute_bleu_rouge, normalize - -CONFIG_NAME = 'config.json' -WEIGHTS_NAME = 'pytorch_model.bin' - - -class MultiHeadedAttention(nn.Module): # SelfAttention - """ - Multi-Head Attention module from - "Attention is All You Need" - :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. - - Similar to standard `dot` attention but uses - multiple attention distributions simulataneously - to select relevant items. - - .. mermaid:: - - graph BT - A[key] - B[value] - C[query] - O[output] - subgraph Attn - D[Attn 1] - E[Attn 2] - F[Attn N] - end - A --> D - C --> D - A --> E - C --> E - A --> F - C --> F - D --> O - E --> O - F --> O - B --> O - - Also includes several additional tricks. 
- - Args: - head_count (int): number of parallel heads - model_dim (int): the dimension of keys/values/queries, - must be divisible by head_count - dropout (float): dropout parameter - """ - - def __init__(self, - head_count, - model_dim, - dropout=0.1, - use_final_linear=True): - assert model_dim % head_count == 0 - self.dim_per_head = model_dim // head_count - self.model_dim = model_dim - - super().__init__() - self.head_count = head_count - - self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) - self.linear_values = nn.Linear(model_dim, - head_count * self.dim_per_head) - self.linear_query = nn.Linear(model_dim, - head_count * self.dim_per_head) - self.softmax = nn.Softmax(dim=-1) - self.dropout = nn.Dropout(dropout) - self.use_final_linear = use_final_linear - if (self.use_final_linear): - self.final_linear = nn.Linear(model_dim, model_dim) - - def forward(self, - key, - value, - query, - mask=None, - layer_cache=None, - type=None, - predefined_graph_1=None, - return_attn=False): - """ - Compute the context vector and the attention vectors. - - Args: - key (`FloatTensor`): set of `key_len` - key vectors `[batch, key_len, dim]` - value (`FloatTensor`): set of `key_len` - value vectors `[batch, key_len, dim]` - query (`FloatTensor`): set of `query_len` - query vectors `[batch, query_len, dim]` - mask: binary mask indicating which keys have - non-zero attention `[batch, query_len, key_len]` - Returns: - (`FloatTensor`, `FloatTensor`) : - - * output context vectors `[batch, query_len, dim]` - * one of the attention vectors `[batch, query_len, key_len]` - """ - - batch_size = key.size(0) - dim_per_head = self.dim_per_head - head_count = self.head_count - - def shape(x): - """ projection """ - return x.view(batch_size, -1, head_count, dim_per_head) \ - .transpose(1, 2) - - def unshape(x): - """ compute context """ - return x.transpose(1, 2).contiguous() \ - .view(batch_size, -1, head_count * dim_per_head) - - # 1) Project key, value, and query. - if layer_cache is not None: - if type == 'self': - query, key, value = self.linear_query(query), self.linear_keys( - query), self.linear_values(query) - - key = shape(key) - value = shape(value) - - device = key.device - if layer_cache['self_keys'] is not None: - key = torch.cat((layer_cache['self_keys'].to(device), key), - dim=2) - if layer_cache['self_values'] is not None: - value = torch.cat( - (layer_cache['self_values'].to(device), value), dim=2) - layer_cache['self_keys'] = key - layer_cache['self_values'] = value - elif type == 'context': - query = self.linear_query(query) - if layer_cache['memory_keys'] is None: - key, value = self.linear_keys(key), self.linear_values( - value) - key = shape(key) - value = shape(value) - else: - key, value = layer_cache['memory_keys'], layer_cache[ - 'memory_values'] - layer_cache['memory_keys'] = key - layer_cache['memory_values'] = value - else: - key = self.linear_keys(key) - value = self.linear_values(value) - query = self.linear_query(query) - key = shape(key) - value = shape(value) - - query = shape(query) - - # 2) Calculate and scale scores. - query = query / math.sqrt(dim_per_head) - scores = torch.matmul(query, key.transpose(2, 3)) - - if mask is not None: - mask = mask.unsqueeze(1).expand_as(scores) - scores = scores.masked_fill(mask, -1e18) - - # 3) Apply attention dropout and compute context vectors. 
- - attn = self.softmax(scores) - - if predefined_graph_1 is not None: - attn_masked = attn[:, -1] * predefined_graph_1 - attn_masked = attn_masked / ( - torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) - - attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1) - - drop_attn = self.dropout(attn) - if self.use_final_linear: - context = unshape(torch.matmul(drop_attn, value)) - output = self.final_linear(context) - if return_attn: - return output, attn - else: - return output - else: - context = torch.matmul(drop_attn, value) - if return_attn: - return context, attn - else: - return context - - -class PositionwiseFeedForward(nn.Module): # Output - """ A two-layer Feed-Forward-Network with residual layer norm. - - Args: - d_model (int): the size of input for the first-layer of the FFN. - d_ff (int): the hidden layer size of the second-layer - of the FNN. - dropout (float): dropout probability in :math:`[0, 1)`. - """ - - def __init__(self, d_model, d_ff, dropout=0.1): - super().__init__() - self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) - self.w_1 = nn.Linear(d_model, d_ff) - self.actv = ACT2FN['gelu_new'] - self.dropout_1 = nn.Dropout(dropout) - self.w_2 = nn.Linear(d_ff, d_model) - self.dropout_2 = nn.Dropout(dropout) - - def forward(self, x): - inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) - output = self.dropout_2(self.w_2(inter)) - return output + x - - -class TransformerDecoderLayer(nn.Module): # Layer - """ - Args: - d_model (int): the dimension of keys/values/queries in - MultiHeadedAttention, also the input size of - the first-layer of the PositionwiseFeedForward. - heads (int): the number of heads for MultiHeadedAttention. - d_ff (int): the second-layer of the PositionwiseFeedForward. - dropout (float): dropout probability(0-1.0). - self_attn_type (string): type of self-attention scaled-dot, average - """ - MAX_SIZE = 5000 - - def __init__(self, d_model, heads, d_ff, dropout): - super().__init__() - - self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) - - self.context_attn = MultiHeadedAttention( - heads, d_model, dropout=dropout) - self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) - self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) - self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) - self.drop = nn.Dropout(dropout) - mask = self._get_attn_subsequent_mask(self.MAX_SIZE) - # Register self.mask as a buffer in TransformerDecoderLayer, so - # it gets TransformerDecoderLayer's cuda behavior automatically. 
- self.register_buffer('mask', mask) - - def forward(self, - inputs, - memory_bank, - src_pad_mask, - tgt_pad_mask, - previous_input=None, - layer_cache=None, - step=None): - """ - Args: - inputs (`FloatTensor`): `[batch_size x 1 x model_dim]` - memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]` - src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]` - tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]` - - Returns: - (`FloatTensor`, `FloatTensor`, `FloatTensor`): - - * output `[batch_size x 1 x model_dim]` - * attn `[batch_size x 1 x src_len]` - * all_input `[batch_size x current_step x model_dim]` - - """ - dec_mask = torch.gt( - tgt_pad_mask.type(torch.uint8) - + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type( - torch.uint8), 0) - input_norm = self.layer_norm_1(inputs) - all_input = input_norm - if previous_input is not None: - all_input = torch.cat((previous_input, input_norm), dim=1) - dec_mask = None - - query = self.self_attn( - all_input, - all_input, - input_norm, - mask=dec_mask, - layer_cache=layer_cache, - type='self') - - query = self.drop(query) + inputs - - query_norm = self.layer_norm_2(query) - mid, attn = self.context_attn( - memory_bank, - memory_bank, - query_norm, - mask=src_pad_mask, - layer_cache=layer_cache, - type='context', - return_attn=True) - output = self.feed_forward(self.drop(mid) + query) - - return output, attn, all_input - - def _get_attn_subsequent_mask(self, size): - """ - Get an attention mask to avoid using the subsequent info. - - Args: - size: int - - Returns: - (`LongTensor`): - - * subsequent_mask `[1 x size x size]` - """ - attn_shape = (1, size, size) - subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') - subsequent_mask = torch.from_numpy(subsequent_mask) - return subsequent_mask - - -class PositionalEncoding(nn.Module): - - def __init__(self, dropout, dim, max_len=5000): - super().__init__() - pe = torch.zeros(max_len, dim) - position = torch.arange(0, max_len).unsqueeze(1) - div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) - * -(math.log(10000.0) / dim))) - pe[:, 0::2] = torch.sin(position.float() * div_term) - pe[:, 1::2] = torch.cos(position.float() * div_term) - pe = pe.unsqueeze(0) - self.register_buffer('pe', pe) - self.dropout = nn.Dropout(dropout) - self.dim = dim - - def forward(self, emb, step=None): - emb = emb * math.sqrt(self.dim) - if (step): - emb = emb + self.pe[:, step][:, None, :] - - else: - emb = emb + self.pe[:, :emb.size(1)] - emb = self.dropout(emb) - return emb - - def get_emb(self, emb): - return self.pe[:, :emb.size(1)] - - -class TransformerDecoderState: - - def __init__(self, src: Tensor, cache_num_layers: int = -1): - self.src: Tensor = src - self.previous_input: Tensor = None - self.previous_layer_inputs: Tensor = None - self.cache: Optional[Dict[str, Any]] = None - if cache_num_layers != -1: - self._init_cache(cache_num_layers) - - def update_state(self, new_input, previous_layer_inputs): - self.previous_input = new_input - self.previous_layer_inputs = previous_layer_inputs - self.cache = None - - def _init_cache(self, num_layers): - self.cache = {} - for num in range(num_layers): - layer_cache = {'memory_keys': None, 'memory_values': None} - layer_cache['self_keys'] = None - layer_cache['self_values'] = None - self.cache['layer_{}'.format(num)] = layer_cache - - def map_batch_fn(self, fn): - - def _recursive_map(struct, batch_dim=0): - for k, v in struct.items(): - if v is not None: - if isinstance(v, dict): - _recursive_map(v) - else: - 
struct[k] = fn(v, batch_dim) - - self.src = fn(self.src, 0) - if self.cache is not None: - _recursive_map(self.cache) - - -class TransformerDecoder(nn.Module): # Decoder - """ - The Transformer decoder from "Attention is All You Need". - - - .. mermaid:: - - graph BT - A[input] - B[multi-head self-attn] - BB[multi-head src-attn] - C[feed forward] - O[output] - A --> B - B --> BB - BB --> C - C --> O - - - Args: - num_layers (int): number of encoder layers. - d_model (int): size of the model - heads (int): number of heads - d_ff (int): size of the inner FF layer - dropout (float): dropout parameters - embeddings (:obj:`onmt.modules.Embeddings`): - embeddings to use, should have positional encodings - attn_type (str): if using a seperate copy attention - """ - decoder_type = 'transformer' - - def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings): - super().__init__() - - # Basic attributes. - self.num_layers = num_layers - self.embeddings = embeddings - self.pos_emb = PositionalEncoding(dropout, - self.embeddings.embedding_dim) - - # Build TransformerDecoder. - self.transformer_layers = nn.ModuleList([ - TransformerDecoderLayer(d_model, heads, d_ff, dropout) - for _ in range(num_layers) - ]) - self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) - self.state = None - - def forward(self, - state: TransformerDecoderState, - tgt: Tensor, - memory_bank: Tensor, - step: int = None, - memory_masks: Tensor = None): - src_words = state.src - tgt_words = tgt - src_batch, src_len = src_words.size() - tgt_batch, tgt_len = tgt_words.size() - - # Run the forward pass of the TransformerDecoder. - # emb = self.embeddings(tgt, step=step) - emb = self.embeddings(tgt) - assert emb.dim() == 3 # len x batch x embedding_dim - output = self.pos_emb(emb, step) - - src_memory_bank = memory_bank - padding_idx = self.embeddings.padding_idx - tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \ - .expand(tgt_batch, tgt_len, tgt_len) - - if memory_masks is not None: - src_len = memory_masks.size(-1) - src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len) - else: - src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \ - .expand(src_batch, tgt_len, src_len) - - if state.cache is None: - saved_inputs = [] - attns = [] - for i in range(self.num_layers): - prev_layer_input = None - if state.cache is None: - if state.previous_input is not None: - prev_layer_input = state.previous_layer_inputs[i] - output, attn, all_input \ - = self.transformer_layers[i]( - output, src_memory_bank, - src_pad_mask, tgt_pad_mask, - previous_input=prev_layer_input, - layer_cache=state.cache['layer_{}'.format(i)] - if state.cache is not None else None, - step=step) - if state.cache is None: - saved_inputs.append(all_input) - attns.append(attn) - - if state.cache is None: - saved_inputs = torch.stack(saved_inputs) - - output = self.layer_norm(output) - - # Process the result and update the attentions. - if state.cache is None: - state.update_state(tgt, saved_inputs) - - return output, attns, state - - -class PalmPointerGenerator(nn.Module): - - def __init__(self, hidden_size, vocab_size): - super().__init__() - self.dense = nn.Linear(hidden_size, vocab_size) - self.gen_func = nn.LogSoftmax(-1) - - def forward(self, x): - x = self.dense(x) - x = self.gen_func(x) - return x - - -class PalmPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = PalmConfig - base_model_prefix = 'palm' - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Optional[Union[str, - os.PathLike]], - **kwargs): - config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) - config = PalmConfig.from_json_file(config_file) if os.path.isfile( - config_file) else PalmConfig() - config.encoder_pth = os.path.join(pretrained_model_name_or_path, - config.encoder_pth) - checkpoint_file = os.path.join(pretrained_model_name_or_path, - WEIGHTS_NAME) - checkpoint = torch.load(checkpoint_file) if os.path.isfile( - checkpoint_file) else None - return cls(config, checkpoint, **kwargs) - - -class AbsSummarizer(PalmPreTrainedModel): # Model - - def __init__(self, config, checkpoint=None): - super().__init__(config) - self.config = config - if config.encoder == 'bert' or config.encoder == 'zh_bert': - self.bert = BertModel( - BertConfig.from_pretrained(config.encoder_pth)) - elif config.encoder == 'roberta': - self.bert = RobertaModel( - RobertaConfig.from_pretrained(config.encoder_pth)) - - if (config.max_pos > 512): - my_pos_embeddings = nn.Embedding( - config.max_pos, self.bert.model.config.hidden_size) - my_pos_embeddings.weight.data[: - 512] = self.bert.embeddings.position_embeddings.weight.data - my_pos_embeddings.weight.data[ - 512:] = self.bert.embeddings.position_embeddings.weight.data[ - -1][None, :].repeat(config.max_pos - 512, 1) - self.bert.model.embeddings.position_embeddings = my_pos_embeddings - self.vocab_size = self.bert.config.vocab_size - tgt_embeddings = nn.Embedding( - self.vocab_size, - self.bert.config.hidden_size, - padding_idx=1 if config.encoder == 'roberta' else 0) - - if config.share_emb: - tgt_embeddings.weight = copy.deepcopy( - self.bert.model.embeddings.word_embeddings.weight) - self.decoder = TransformerDecoder( - config.dec_layers, - config.dec_hidden_size, - heads=config.dec_heads, - d_ff=config.dec_ff_size, - dropout=config.dec_dropout, - embeddings=tgt_embeddings) - self.generator = PalmPointerGenerator(config.dec_hidden_size, - self.vocab_size) - self.generator.dense.weight = self.decoder.embeddings.weight - - if checkpoint is not None: - if 'model' in checkpoint: - checkpoint = checkpoint['model'] - for key in list(checkpoint.keys()): - checkpoint[key.replace('model.palm.', '')] = checkpoint[key] - self.load_state_dict(checkpoint, strict=False) - else: - for module in self.decoder.modules(): - if isinstance(module, (nn.Linear, nn.Embedding)): - module.weight.data.normal_(mean=0.0, std=0.02) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - for p in self.generator.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - else: - p.data.zero_() - if config.use_bert_emb: - if config.encoder == 'roberta': - tgt_embeddings = nn.Embedding( - self.vocab_size, - self.bert.config.hidden_size, - padding_idx=1) - else: - tgt_embeddings = nn.Embedding( - self.vocab_size, - self.bert.config.hidden_size, - padding_idx=0) - tgt_embeddings.weight = copy.deepcopy( - self.bert.embeddings.word_embeddings.weight) - self.decoder.embeddings = tgt_embeddings - self.generator.dense.weight = self.decoder.embeddings.weight - - def forward(self, src, tgt, mask_src): - top_vec, _ = self.bert(src, mask_src, return_dict=False) - state = TransformerDecoderState(src) - decoder_outputs, attns, _ = self.decoder(state, tgt[:, :-1], top_vec) - return decoder_outputs, 
attns[-1], top_vec - - -class LabelSmoothingLoss(nn.Module): - """ - With label smoothing, - KL-divergence between q_{smoothed ground truth prob.}(w) - and p_{prob. computed by model}(w) is minimized. - """ - - def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): - assert 0.0 < label_smoothing <= 1.0 - self.padding_idx = ignore_index - super(LabelSmoothingLoss, self).__init__() - - smoothing_value = label_smoothing / (tgt_vocab_size - 2) - one_hot = torch.full((tgt_vocab_size, ), smoothing_value) - one_hot[self.padding_idx] = 0 - self.register_buffer('one_hot', one_hot.unsqueeze(0)) - self.confidence = 1.0 - label_smoothing - - def forward(self, output, target): - """ - output (FloatTensor): batch_size x n_classes - target (LongTensor): batch_size - """ - model_prob = self.one_hot.repeat(target.size(0), 1) - model_prob.scatter_(1, target.unsqueeze(1), self.confidence) - model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0) - - return F.kl_div(output, model_prob, reduction='sum') - - -class NMTLossCompute(nn.Module): - """ - Standard NMT Loss Computation. - """ - - def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0): - super().__init__() - self.generator = generator - self.padding_idx = symbols['PAD'] - if label_smoothing > 0: - self.criterion = LabelSmoothingLoss( - label_smoothing, vocab_size, ignore_index=self.padding_idx) - else: - self.criterion = nn.NLLLoss( - ignore_index=self.padding_idx, reduction='sum') - - def _bottle(self, _v): - return _v.view(-1, _v.size(2)) - - def _unbottle(self, _v, batch_size): - return _v.view(-1, batch_size, _v.size(1)) - - def forward(self, tgt, output): - target = tgt[:, 1:] - normalization = target.ne(self.padding_idx).sum() - bottled_output = self._bottle(output) - scores = self.generator(bottled_output) - gtruth = target.contiguous().view(-1) - loss = self.criterion(scores, gtruth) - loss.div(float(normalization)) - return loss - - -class PalmForConditionalGeneration(PalmPreTrainedModel): - - def __init__(self, config, checkpoint=None): - super().__init__(config) - self.config = config - if config.encoder == 'roberta': - tokenizer = RobertaTokenizer.from_pretrained( - config.encoder_pth, do_lower_case=False) - symbols = { - 'BOS': tokenizer.cls_token_id, - 'EOS': tokenizer.sep_token_id, - 'PAD': tokenizer.pad_token_id, - 'EOQ': tokenizer.unk_token_id - } - elif config.encoder == 'bert' or config.encoder == 'zh_bert': - tokenizer = BertTokenizer.from_pretrained( - config.encoder_pth, do_lower_case=True) - symbols = { - 'BOS': tokenizer.vocab['[CLS]'], - 'EOS': tokenizer.vocab['[SEP]'], - 'PAD': tokenizer.vocab['[PAD]'], - 'EOQ': tokenizer.vocab['[unused2]'] - } - self.tokenizer = tokenizer - self.symbols = symbols - self.palm = AbsSummarizer(config, checkpoint) - self.loss = NMTLossCompute(self.palm.generator, symbols, - self.palm.vocab_size, - config.label_smoothing) - - def forward(self, input_ids, attention_mask, labels): - output = self.palm( - src=input_ids, tgt=labels, mask_src=attention_mask)[0] - loss = self.loss(labels, output) - return addict.Dict(loss=loss) - - -class Translator(object): - """ - Uses a model to translate a batch of sentences. 
- """ - - @dataclass - class Batch: - batch_size: int - src: torch.Tensor - tgt: torch.Tensor - mask_src: torch.Tensor - query_id: List[None] = None - src_str: List[List[str]] = None - tgt_str: List[str] = None - - def __init__(self, - model: PalmForConditionalGeneration, - dataset: str = 'cnn'): - super().__init__() - self.logger = logging.get_logger(__name__) - self.args = model.config - self.args.dataset = dataset - self.model = model.palm - self.generator = self.model.generator - self.vocab = model.tokenizer - self.symbols = model.symbols - self.start_token = self.symbols['BOS'] - self.end_token = self.symbols['EOS'] - self.alpha = self.args.alpha - self.beam_size = self.args.beam_size - self.min_length = self.args.min_length - self.max_length = self.args.max_length - - def from_batch(self, translation_batch): - batch = translation_batch['batch'] - assert (len(translation_batch['gold_score']) == len( - translation_batch['predictions'])) - batch_size = batch.batch_size - - preds, pred_score, tgt_str, src, src_str = translation_batch[ - 'predictions'], translation_batch[ - 'scores'], batch.tgt_str, batch.src, batch.src_str - query_id = batch.query_id - ''' - try: - query_id = batch.query_id - except: - query_id = None - ''' - translations = [] - for b in range(batch_size): - if self.args.dataset == 'qg_ranking_test': - if self.args.encoder == 'bert' or self.args.encoder == 'zh_bert': - pred_sents = [ - ' '.join( - self.vocab.convert_ids_to_tokens( - [int(n) for n in each])).replace(' ##', '') - for each in preds[b] - ] - elif self.args.encoder == 'roberta': - pred_sents = [ - self.vocab.decode([int(n) for n in each - ]).replace('', - '').replace('', '') - for each in preds[b] - ] - elif self.args.encoder == 'roberta': - pred_sents = self.vocab.decode([int(n) - for n in preds[b][0]]).replace( - '', - '').replace('', '') - elif self.args.encoder == 'bert': - pred_sents = self.vocab.convert_ids_to_tokens( - [int(n) for n in preds[b][0]]) - pred_sents = ' '.join(pred_sents).replace(' ##', '') - elif self.args.encoder == 'zh_bert' and self.args.dataset == 'paraphrase': - pred_sents = [ - self.vocab.convert_ids_to_tokens([int(n) for n in pred]) - for pred in preds[b] - ] - pred_sents = [ - ''.join(pred).replace(' ##', '') for pred in pred_sents - ] - elif self.args.encoder == 'zh_bert': - pred_sents = self.vocab.convert_ids_to_tokens( - [int(n) for n in preds[b][0]]) - pred_sents = ''.join(pred_sents).replace('##', '') - gold_sent = tgt_str[b] - - if self.args.encoder == 'roberta': - raw_src = self.vocab.decode([int(t) for t in src[b]]) - raw_src = ' '.join(src_str[b]) - else: - raw_src = [self.vocab.ids_to_tokens[int(t)] - for t in src[b]][:500] - raw_src = ' '.join(raw_src) - if self.args.dataset == 'faq': - translation = (pred_sents, gold_sent, src_str[b], query_id[b], - pred_score[b]) - else: - translation = (pred_sents, gold_sent, raw_src, query_id[b], - pred_score[b]) - # translation = (pred_sents[0], gold_sent) - translations.append(translation) - - return translations - - def translate(self, data_iter, step): - gold_path = self.args.result_path + '.%d.gold' % step - can_path = self.args.result_path + '.%d.candidate' % step - self.gold_out_file = codecs.open(gold_path, 'w', 'utf-8') - self.can_out_file = codecs.open(can_path, 'w', 'utf-8') - self.pred_json_score_out_file = codecs.open(can_path + '.sample', 'w', - 'utf-8') - if self.args.dataset == 'paraphrase' and self.args.encoder == 'roberta': - out = '\t'.join([ - 'query_id', 'source_query', 'target_query', 'predict_query' - ]) + 
'\n' - self.pred_json_score_out_file.write(out) - - raw_src_path = self.args.result_path + '.%d.raw_src' % step - self.src_out_file = codecs.open(raw_src_path, 'w', 'utf-8') - - pred_results, gold_results = [], [] - cnt = 0 - pred_dict, ref_dict = {}, {} - for i, batch in enumerate(data_iter): - self.logger.info(f'data: {i + 1} / {len(data_iter)}') - batch_data = self.translate_batch(batch) - translations = self.from_batch(batch_data) - - for trans in translations: - pred, gold, src, query_id, pred_score = trans - src = src.replace('', '').replace('##', '').strip() - if self.args.dataset == 'qg_ranking_test': - pred_str = '\t'.join([ - each.replace('[unused0]', '').replace( - '[PAD]', '').replace('[unused1]', '').replace( - r' +', ' ').replace('[SEP]', '').replace( - '[unused2]', - '').replace(r' +', ' ').replace( - '', - '').replace('', '').replace( - '', - '').replace('', '').replace( - '', ' ').strip() - for each in pred - ]) - else: - pred_str = pred.replace('[unused0]', '').replace( - '[PAD]', '').replace('[unused1]', '').replace( - r' +', ' ').replace('[SEP]', '').replace( - '[unused2]', '').replace('[CLS]', '').replace( - '[SEP]', '').replace('[UNK]', '').strip() - pred_str = pred_str.replace(r' +', ' ').replace( - '', - '').replace('', '').replace('', '').replace( - '', '').replace('', ' ').strip() - gold_str = gold.replace('', '').strip().replace( - '[UNK]', '').replace('[unused1]', '').replace( - '[unused2]', - '').replace('##', '').replace('[CLS]', '').replace( - '[SEP]', '').strip().replace('', '').replace( - '', '').replace('', ' ').strip() - if (self.args.recall_eval): - _pred_str = '' - for sent in pred_str.split(''): - can_pred_str = _pred_str + '' + sent.strip() - if len(can_pred_str.split()) >= len( - gold_str.split()) + 10: - pred_str = _pred_str - break - else: - _pred_str = can_pred_str - - if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': - pred_str = pred_str.replace('', ' ') - if query_id is not None: - pred_json = { - 'query_id': query_id, - 'answers': [pred_str] - } - gold_json = { - 'query_id': query_id, - 'answers': [gold_str] - } - pred_json_score = { - 'query_id': query_id, - 'answers': [pred_str], - 'scores': pred_score[0].cpu().numpy().tolist() - } - else: - pred_json = {'query_id': cnt, 'answers': [pred_str]} - gold_json = {'query_id': cnt, 'answers': [gold_str]} - pred_json_score = { - 'query_id': cnt, - 'answers': [pred_str], - 'scores': pred_score[0].cpu().numpy().tolist() - } - json.dump(pred_json, self.can_out_file) - self.can_out_file.write('\n') - json.dump(gold_json, self.gold_out_file) - self.gold_out_file.write('\n') - json.dump(pred_json_score, self.pred_json_score_out_file) - self.pred_json_score_out_file.write('\n') - self.src_out_file.write(src.strip() + '\n') - elif self.args.dataset == 'cnn': - self.can_out_file.write(pred_str + '\n') - self.gold_out_file.write(gold_str + '\n') - self.src_out_file.write(src.strip() + '\n') - elif self.args.dataset == 'dureader': - if query_id is None: - query_id = str(cnt) - pred_results.extend(normalize([pred_str])) - gold_results.extend(normalize([gold_str])) - self.can_out_file.write(pred_str + '\n') - self.gold_out_file.write('\t'.join([src[0], gold_str]) - + '\n') - - elif self.args.dataset == 'paraphrase': - if query_id is None: - query_id = str(cnt) - if self.args.encoder == 'roberta': - pred_str = [pred_str] - pred_dict[query_id] = normalize([pred_str[0]]) - ref_dict[query_id] = normalize([gold_str]) - self.pred_json_score_out_file.write( - 
'\t'.join([str(query_id), src, gold_str, pred_str[0]]) - + '\n') - elif self.args.dataset == 'faq': - if pred_score[0].cpu().numpy().tolist() < -3.5: - continue - self.can_out_file.write( - '\t'.join([str(query_id), src, pred_str]) + '\n') - self.gold_out_file.write( - '\t'.join([str(query_id), src, gold_str]) + '\n') - # passage, answer, question, score - self.pred_json_score_out_file.write('\t'.join([ - str(query_id), gold_str, src, pred_str, - str(pred_score[0].cpu().numpy().tolist()) - ]) + '\n') - elif self.args.dataset == 'qg_ranking_test': - self.can_out_file.write( - str(query_id) + '\t' + pred_str + '\n') - - cnt += 1 - self.can_out_file.flush() - self.gold_out_file.flush() - self.src_out_file.flush() - self.logger.info('cnt: %s' % cnt) - self.can_out_file.close() - self.gold_out_file.close() - self.src_out_file.close() - - if (step != -1): - if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': - cnn_results = subprocess.getoutput( - './run.sh %s %s' % (gold_path, can_path)) # run.sh ... - self.logger.info(cnn_results) - elif self.args.dataset == 'cnn': - self.logger.info('Calculating Rouge') - from rouge import Rouge - candidates = [ - line.strip() for line in open(can_path, encoding='utf-8') - ] - references = [ - line.strip() for line in open(gold_path, encoding='utf-8') - ] - rouge_score = Rouge().get_scores( - candidates, references, avg=True) - # self.logger.info('Rouges at step %d \n%s' % (step, rouge_results_to_str(rouges))) - print(rouge_score) - elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': - - def postprocess_text(preds, labels): - preds = [pred.strip().replace('.', '') for pred in preds] - labels = [label.strip() for label in labels] - while '' in preds: - idx = preds.index('') - preds[idx] = '。' - return preds, labels - - pred_results, gold_results = postprocess_text( - pred_results, gold_results) - pred_dict = {str(i): tmp for i, tmp in enumerate(pred_results)} - gold_dict = {str(i): tmp for i, tmp in enumerate(gold_results)} - bleu_rouge = compute_bleu_rouge(pred_dict, gold_dict) - print(bleu_rouge) - # unreachable - elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': - pred_results, gold_results = postprocess_text( - pred_results, gold_results) - bleu_score = cal_bleu(pred_results, gold_results) - from rouge import Rouge - rouge = Rouge() - rouge_score = rouge.get_scores( - pred_results, gold_results, avg=True) - print("'Dev eval result: Bleu-4={}, {}".format( - bleu_score, rouge_score)) - - def translate_batch(self, batch: 'Batch', fast: bool = False): - """ - Translate a batch of sentences. - - Mostly a wrapper around :obj:`Beam`. - - Args: - batch (:obj:`Batch`): a batch from a dataset object - data (:obj:`Dataset`): the dataset object - fast (bool): enables fast beam search (may not support all features) - - Todo: - Shouldn't need the original dataset. 
- """ - self.model.eval() - with torch.no_grad(): - return self._fast_translate_batch( - batch, self.max_length, min_length=self.min_length) - - def _tile(self, x, count, dim=0): - perm = list(range(len(x.size()))) - if dim != 0: - perm[0], perm[dim] = perm[dim], perm[0] - x = x.permute(perm).contiguous() - out_size = list(x.size()) - out_size[0] *= count - batch = x.size(0) - x = x.view(batch, -1) \ - .transpose(0, 1) \ - .repeat(count, 1) \ - .transpose(0, 1) \ - .contiguous() \ - .view(*out_size) - if dim != 0: - x = x.permute(perm).contiguous() - return x - - def _top_k_top_p_filtering(self, - logits, - top_k=10, - top_p=1.0, - filter_value=-float('Inf'), - min_tokens_to_keep=1): - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), - logits.size(-1)) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, - None] - logits[indices_to_remove] = filter_value - - if top_p < 1.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum( - F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ - ..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter( - 1, sorted_indices, sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits - - def _fast_translate_batch(self, - batch: 'Batch', - max_length: int, - min_length: int = 0): - # TODO: faster code path for beam_size == 1. - # TODO: support these blacklisted features. - - beam_size = self.beam_size - batch_size = batch.batch_size - src = batch.src - mask_src = batch.mask_src - - src_features, _ = self.model.bert(src, mask_src, return_dict=False) - state = TransformerDecoderState(src, self.model.decoder.num_layers) - device = src_features.device - - # Tile states and memory beam_size times. - state.map_batch_fn( - lambda state, dim: self._tile(state, beam_size, dim=dim)) - src_features = self._tile(src_features, beam_size, dim=0) - batch_offset = torch.arange( - batch_size, dtype=torch.long, device=device) - beam_offset = torch.arange( - 0, - batch_size * beam_size, - step=beam_size, - dtype=torch.long, - device=device) - alive_seq = torch.full([batch_size * beam_size, 1], - self.start_token, - dtype=torch.long, - device=device) - - # Give full probability to the first beam on the first step. - topk_log_probs = ( - torch.tensor( - [0.0] + [float('-inf')] * (beam_size - 1), - device=device).repeat(batch_size)) - - # Structure that holds finished hypotheses. - hypotheses = [[] for _ in range(batch_size)] # noqa: F812 - - results = {} - results['predictions'] = [[] for _ in range(batch_size)] # noqa: F812 - results['scores'] = [[] for _ in range(batch_size)] # noqa: F812 - results['gold_score'] = [0] * batch_size - results['batch'] = batch - - for step in range(max_length): - decoder_input = alive_seq[:, -1].view(1, -1) - - # Decoder forward. 
- decoder_input = decoder_input.transpose(0, 1) - dec_out, attns, state = self.model.decoder( - state, decoder_input, src_features, step=step) - - # Generator forward. - log_probs = self.generator.forward( - dec_out.transpose(0, 1).squeeze(0)) - vocab_size = log_probs.size(-1) - - if step < min_length: - log_probs[:, self.end_token] = -1e20 - - # Multiply probs by the beam probability. - - length_penalty = ((5.0 + (step + 1)) / 6.0)**self.alpha - if self.args.sample_topk: - temperature = self.args.temperature - _scores = log_probs / temperature - _scores = self._top_k_top_p_filtering( - _scores, - top_k=self.args.top_k, - top_p=self.args.top_p, - min_tokens_to_keep=1 - ) # (batch_size * num_beams, vocab_size) - # Sample 2 next words for each beam (so we have some spare tokens - # and match output of greedy beam search) - topk_ids = torch.multinomial( - F.softmax(_scores, dim=-1), - num_samples=1) # (batch_size * num_beams, 2) - # Compute next scores - _scores = F.log_softmax( - _scores, dim=1) # (batch_size * num_beams, vocab_size) - - _scores += topk_log_probs.view(-1).unsqueeze(1) - _scores = _scores / length_penalty - topk_scores = torch.gather( - _scores, -1, topk_ids) # (batch_size * num_beams, 2) - # Match shape of greedy beam search - topk_ids = topk_ids.view( - -1, beam_size) # (batch_size, 2 * num_beams) - topk_scores = topk_scores.view( - -1, beam_size) # (batch_size, 2 * num_beams) - else: - log_probs += topk_log_probs.view(-1).unsqueeze(1) - curr_scores = log_probs / length_penalty - - curr_scores = curr_scores.reshape(-1, beam_size * vocab_size) - topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1) - if self.args.block_trigram: - cur_len = alive_seq.size(1) - if cur_len > 3: - for i in range(alive_seq.size(0)): - fail = False - words = [int(w) for w in alive_seq[i]] - if self.args.encoder == 'roberta': - words = self.vocab.decode(words).strip().split() - else: - words = [ - self.vocab.ids_to_tokens[w] for w in words - ] - words = ' '.join(words).replace(' ##', '').split() - if len(words) <= 3: - continue - trigrams = [(words[i - 1], words[i], words[i + 1]) - for i in range(1, - len(words) - 1)] - trigram = tuple(trigrams[-1]) - if trigram in trigrams[:-1]: - fail = True - if fail: - curr_scores[i] = -10e20 - # Recover log probs. - topk_log_probs = topk_scores * length_penalty - - # Resolve beam origin and true word ids. - topk_beam_index = topk_ids // vocab_size - topk_ids = topk_ids.fmod(vocab_size) - - # Map beam_index to batch_index in the flat representation. - batch_index = ( - topk_beam_index - + beam_offset[:topk_beam_index.size(0)].unsqueeze(1)) - select_indices = batch_index.view(-1) - - # Append last prediction. - alive_seq = torch.cat([ - alive_seq.index_select(0, select_indices), - topk_ids.view(-1, 1) - ], -1) - - is_finished = topk_ids.eq(self.end_token) - if step + 1 == max_length: - is_finished.fill_(self.end_token) - # End condition is top beam is finished. - end_condition = is_finished[:, 0].eq(1) - # Save finished hypotheses. - if is_finished.any(): - predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1)) - for i in range(is_finished.size(0)): - b = batch_offset[i] - if end_condition[i]: - is_finished[i].fill_(self.end_token) - finished_hyp = is_finished[i].nonzero().view(-1) - # Store finished hypotheses for this batch. - for j in finished_hyp: - hypotheses[b].append( - (topk_scores[i, j], predictions[i, j, 1:])) - # If the batch reached the end, save the n_best hypotheses. 
- if end_condition[i]: - best_hyp = sorted( - hypotheses[b], key=lambda x: x[0], reverse=True) - if self.args.dataset == 'qg_ranking_test' or ( - self.args.dataset == 'paraphrase' - and not self.args.sample_topk): - for each in best_hyp[:beam_size]: - score, pred = each - results['scores'][b].append(score) - results['predictions'][b].append(pred) - else: - score, pred = best_hyp[0] - results['scores'][b].append(score) - results['predictions'][b].append(pred) - non_finished = end_condition.eq(0).nonzero().view(-1) - # If all sentences are translated, no need to go further. - if len(non_finished) == 0: - break - # Remove finished batches for the next step. - topk_log_probs = topk_log_probs.index_select(0, non_finished) - batch_index = batch_index.index_select(0, non_finished) - batch_offset = batch_offset.index_select(0, non_finished) - alive_seq = predictions.index_select(0, non_finished) \ - .view(-1, alive_seq.size(-1)) - # Reorder states. - select_indices = batch_index.view(-1) - src_features = src_features.index_select(0, select_indices) - state.map_batch_fn( - lambda state, dim: state.index_select(dim, select_indices)) - - return results - - def __call__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, - **kwargs) -> Dict[str, torch.Tensor]: - batch = self.Batch( - batch_size=input_ids.size()[0], - src=input_ids, - tgt=None, - mask_src=attention_mask) - translation_batch = self.translate_batch(batch) - - preds = translation_batch['predictions'] - return {'predictions': preds} diff --git a/modelscope/models/nlp/palm_v2/text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py index d83860db..f1c8e414 100644 --- a/modelscope/models/nlp/palm_v2/text_generation.py +++ b/modelscope/models/nlp/palm_v2/text_generation.py @@ -1,50 +1,1364 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Dict, List +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
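The `Translator` beam-search machinery removed above reappears verbatim inside the new `text_generation.py` that follows; one small piece worth calling out is the `_tile` helper, which repeats every batch row `beam_size` times so encoder states and offsets can be expanded from `batch_size` to `batch_size * beam_size`. Below is a standalone sketch of that helper, lifted out of the class for illustration (the example tensor and the `__main__` block are illustrative only and not part of the diff):

```python
import torch


def tile(x: torch.Tensor, count: int, dim: int = 0) -> torch.Tensor:
    """Repeat each slice of `x` along `dim` `count` times, keeping the copies adjacent."""
    perm = list(range(x.dim()))
    if dim != 0:
        # Move the target dim to the front, tile rows, then move it back.
        perm[0], perm[dim] = perm[dim], perm[0]
        x = x.permute(perm).contiguous()
    out_size = list(x.size())
    out_size[0] *= count
    batch = x.size(0)
    x = (x.view(batch, -1)
         .transpose(0, 1)
         .repeat(count, 1)
         .transpose(0, 1)
         .contiguous()
         .view(*out_size))
    if dim != 0:
        x = x.permute(perm).contiguous()
    return x


if __name__ == '__main__':
    x = torch.tensor([[1, 2], [3, 4]])
    print(tile(x, count=3, dim=0))
    # tensor([[1, 2], [1, 2], [1, 2], [3, 4], [3, 4], [3, 4]])
```

The copies stay adjacent (effectively `torch.repeat_interleave(x, count, dim=dim)`), which is what `_fast_translate_batch` relies on when it builds flat indices from `beam_offset` and `batch_offset`: beam `j` of batch element `i` lives at row `i * beam_size + j`.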
+ +import codecs +import copy +import math +import os +import subprocess +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import json +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.init import xavier_uniform_ +from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig, + RobertaModel, RobertaTokenizer) +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel from modelscope.metainfo import Models -from modelscope.models.base import Tensor, TorchModel +from modelscope.models import Model +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS -from modelscope.outputs import OutputKeys +from modelscope.outputs import TextGenerationModelOutput, TokenGeneratorOutput +from modelscope.utils import logger as logging from modelscope.utils.constant import Tasks +from .configuration import PalmConfig +from .dureader_eval import compute_bleu_rouge, normalize -__all__ = ['PalmForTextGeneration'] +CONFIG_NAME = 'config.json' +WEIGHTS_NAME = 'pytorch_model.bin' -@MODELS.register_module(Tasks.text_generation, module_name=Models.palm) -class PalmForTextGeneration(TorchModel): +class MultiHeadedAttention(nn.Module): # SelfAttention + """ + Multi-Head Attention module from + "Attention is All You Need" + :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. + + Similar to standard `dot` attention but uses + multiple attention distributions simulataneously + to select relevant items. + + .. mermaid:: + + graph BT + A[key] + B[value] + C[query] + O[output] + subgraph Attn + D[Attn 1] + E[Attn 2] + F[Attn N] + end + A --> D + C --> D + A --> E + C --> E + A --> F + C --> F + D --> O + E --> O + F --> O + B --> O - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. + Also includes several additional tricks. + + Args: + head_count (int): number of parallel heads + model_dim (int): the dimension of keys/values/queries, + must be divisible by head_count + dropout (float): dropout parameter + """ + + def __init__(self, + head_count, + model_dim, + dropout=0.1, + use_final_linear=True): + assert model_dim % head_count == 0 + self.dim_per_head = model_dim // head_count + self.model_dim = model_dim + + super().__init__() + self.head_count = head_count + + self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) + self.linear_values = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.linear_query = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.softmax = nn.Softmax(dim=-1) + self.dropout = nn.Dropout(dropout) + self.use_final_linear = use_final_linear + if (self.use_final_linear): + self.final_linear = nn.Linear(model_dim, model_dim) + + def forward(self, + key, + value, + query, + mask=None, + layer_cache=None, + type=None, + predefined_graph_1=None, + return_attn=False): + """ + Compute the context vector and the attention vectors. Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. 
+ key (`FloatTensor`): set of `key_len` + key vectors `[batch, key_len, dim]` + value (`FloatTensor`): set of `key_len` + value vectors `[batch, key_len, dim]` + query (`FloatTensor`): set of `query_len` + query vectors `[batch, query_len, dim]` + mask: binary mask indicating which keys have + non-zero attention `[batch, query_len, key_len]` + Returns: + (`FloatTensor`, `FloatTensor`) : + + * output context vectors `[batch, query_len, dim]` + * one of the attention vectors `[batch, query_len, key_len]` """ - super().__init__(model_dir, *args, **kwargs) - from modelscope.models.nlp.palm_v2 import ( - PalmForConditionalGeneration, Translator) - self.model = PalmForConditionalGeneration.from_pretrained(model_dir) - self.tokenizer = self.model.tokenizer - self.generator = Translator(self.model) + batch_size = key.size(0) + dim_per_head = self.dim_per_head + head_count = self.head_count + + def shape(x): + """ projection """ + return x.view(batch_size, -1, head_count, dim_per_head) \ + .transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous() \ + .view(batch_size, -1, head_count * dim_per_head) + + # 1) Project key, value, and query. + if layer_cache is not None: + if type == 'self': + query, key, value = self.linear_query(query), self.linear_keys( + query), self.linear_values(query) + + key = shape(key) + value = shape(value) + + device = key.device + if layer_cache['self_keys'] is not None: + key = torch.cat((layer_cache['self_keys'].to(device), key), + dim=2) + if layer_cache['self_values'] is not None: + value = torch.cat( + (layer_cache['self_values'].to(device), value), dim=2) + layer_cache['self_keys'] = key + layer_cache['self_values'] = value + elif type == 'context': + query = self.linear_query(query) + if layer_cache['memory_keys'] is None: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key, value = layer_cache['memory_keys'], layer_cache[ + 'memory_values'] + layer_cache['memory_keys'] = key + layer_cache['memory_values'] = value + else: + key = self.linear_keys(key) + value = self.linear_values(value) + query = self.linear_query(query) + key = shape(key) + value = shape(value) + + query = shape(query) + + # 2) Calculate and scale scores. + query = query / math.sqrt(dim_per_head) + scores = torch.matmul(query, key.transpose(2, 3)) + + if mask is not None: + mask = mask.unsqueeze(1).expand_as(scores) + scores = scores.masked_fill(mask, -1e18) + + # 3) Apply attention dropout and compute context vectors. + + attn = self.softmax(scores) + + if predefined_graph_1 is not None: + attn_masked = attn[:, -1] * predefined_graph_1 + attn_masked = attn_masked / ( + torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) + + attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1) + + drop_attn = self.dropout(attn) + if self.use_final_linear: + context = unshape(torch.matmul(drop_attn, value)) + output = self.final_linear(context) + if return_attn: + return output, attn + else: + return output + else: + context = torch.matmul(drop_attn, value) + if return_attn: + return context, attn + else: + return context + + +class PositionwiseFeedForward(nn.Module): # Output + """ A two-layer Feed-Forward-Network with residual layer norm. + + Args: + d_model (int): the size of input for the first-layer of the FFN. + d_ff (int): the hidden layer size of the second-layer + of the FNN. + dropout (float): dropout probability in :math:`[0, 1)`. 
+ """ + + def __init__(self, d_model, d_ff, dropout=0.1): + super().__init__() + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.w_1 = nn.Linear(d_model, d_ff) + self.actv = ACT2FN['gelu_new'] + self.dropout_1 = nn.Dropout(dropout) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x): + inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) + output = self.dropout_2(self.w_2(inter)) + return output + x + + +class TransformerDecoderLayer(nn.Module): # Layer + """ + Args: + d_model (int): the dimension of keys/values/queries in + MultiHeadedAttention, also the input size of + the first-layer of the PositionwiseFeedForward. + heads (int): the number of heads for MultiHeadedAttention. + d_ff (int): the second-layer of the PositionwiseFeedForward. + dropout (float): dropout probability(0-1.0). + self_attn_type (string): type of self-attention scaled-dot, average + """ + MAX_SIZE = 5000 + + def __init__(self, d_model, heads, d_ff, dropout): + super().__init__() + + self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) + + self.context_attn = MultiHeadedAttention( + heads, d_model, dropout=dropout) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) + self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) + self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) + self.drop = nn.Dropout(dropout) + mask = self._get_attn_subsequent_mask(self.MAX_SIZE) + # Register self.mask as a buffer in TransformerDecoderLayer, so + # it gets TransformerDecoderLayer's cuda behavior automatically. + self.register_buffer('mask', mask) + + def forward(self, + inputs, + memory_bank, + src_pad_mask, + tgt_pad_mask, + previous_input=None, + layer_cache=None, + step=None): + """ + Args: + inputs (`FloatTensor`): `[batch_size x 1 x model_dim]` + memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]` + src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]` + tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]` + + Returns: + (`FloatTensor`, `FloatTensor`, `FloatTensor`): + + * output `[batch_size x 1 x model_dim]` + * attn `[batch_size x 1 x src_len]` + * all_input `[batch_size x current_step x model_dim]` + + """ + dec_mask = torch.gt( + tgt_pad_mask.type(torch.uint8) + + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type( + torch.uint8), 0) + input_norm = self.layer_norm_1(inputs) + all_input = input_norm + if previous_input is not None: + all_input = torch.cat((previous_input, input_norm), dim=1) + dec_mask = None + + query = self.self_attn( + all_input, + all_input, + input_norm, + mask=dec_mask, + layer_cache=layer_cache, + type='self') + + query = self.drop(query) + inputs + + query_norm = self.layer_norm_2(query) + mid, attn = self.context_attn( + memory_bank, + memory_bank, + query_norm, + mask=src_pad_mask, + layer_cache=layer_cache, + type='context', + return_attn=True) + output = self.feed_forward(self.drop(mid) + query) + + return output, attn, all_input + + def _get_attn_subsequent_mask(self, size): + """ + Get an attention mask to avoid using the subsequent info. 
+ + Args: + size: int + + Returns: + (`LongTensor`): + + * subsequent_mask `[1 x size x size]` + """ + attn_shape = (1, size, size) + subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') + subsequent_mask = torch.from_numpy(subsequent_mask) + return subsequent_mask + + +class PositionalEncoding(nn.Module): + + def __init__(self, dropout, dim, max_len=5000): + super().__init__() + pe = torch.zeros(max_len, dim) + position = torch.arange(0, max_len).unsqueeze(1) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) + * -(math.log(10000.0) / dim))) + pe[:, 0::2] = torch.sin(position.float() * div_term) + pe[:, 1::2] = torch.cos(position.float() * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + self.dropout = nn.Dropout(dropout) + self.dim = dim + + def forward(self, emb, step=None): + emb = emb * math.sqrt(self.dim) + if (step): + emb = emb + self.pe[:, step][:, None, :] + + else: + emb = emb + self.pe[:, :emb.size(1)] + emb = self.dropout(emb) + return emb + + def get_emb(self, emb): + return self.pe[:, :emb.size(1)] + + +class TransformerDecoderState: + + def __init__(self, src: Tensor, cache_num_layers: int = -1): + self.src: Tensor = src + self.previous_input: Tensor = None + self.previous_layer_inputs: Tensor = None + self.cache: Optional[Dict[str, Any]] = None + if cache_num_layers != -1: + self._init_cache(cache_num_layers) + + def update_state(self, new_input, previous_layer_inputs): + self.previous_input = new_input + self.previous_layer_inputs = previous_layer_inputs + self.cache = None + + def _init_cache(self, num_layers): + self.cache = {} + for num in range(num_layers): + layer_cache = {'memory_keys': None, 'memory_values': None} + layer_cache['self_keys'] = None + layer_cache['self_values'] = None + self.cache['layer_{}'.format(num)] = layer_cache + + def map_batch_fn(self, fn): + + def _recursive_map(struct, batch_dim=0): + for k, v in struct.items(): + if v is not None: + if isinstance(v, dict): + _recursive_map(v) + else: + struct[k] = fn(v, batch_dim) + + self.src = fn(self.src, 0) + if self.cache is not None: + _recursive_map(self.cache) + + +class TransformerDecoder(nn.Module): # Decoder + """ + The Transformer decoder from "Attention is All You Need". + + + .. mermaid:: + + graph BT + A[input] + B[multi-head self-attn] + BB[multi-head src-attn] + C[feed forward] + O[output] + A --> B + B --> BB + BB --> C + C --> O + + + Args: + num_layers (int): number of encoder layers. + d_model (int): size of the model + heads (int): number of heads + d_ff (int): size of the inner FF layer + dropout (float): dropout parameters + embeddings (:obj:`onmt.modules.Embeddings`): + embeddings to use, should have positional encodings + attn_type (str): if using a seperate copy attention + """ + decoder_type = 'transformer' + + def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings): + super().__init__() + + # Basic attributes. + self.num_layers = num_layers + self.embeddings = embeddings + self.pos_emb = PositionalEncoding(dropout, + self.embeddings.embedding_dim) + + # Build TransformerDecoder. 
+ self.transformer_layers = nn.ModuleList([ + TransformerDecoderLayer(d_model, heads, d_ff, dropout) + for _ in range(num_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.state = None + + def forward(self, + state: TransformerDecoderState, + tgt: Tensor, + memory_bank: Tensor, + step: int = None, + memory_masks: Tensor = None): + src_words = state.src + tgt_words = tgt + src_batch, src_len = src_words.size() + tgt_batch, tgt_len = tgt_words.size() + + # Run the forward pass of the TransformerDecoder. + # emb = self.embeddings(tgt, step=step) + emb = self.embeddings(tgt) + assert emb.dim() == 3 # len x batch x embedding_dim + output = self.pos_emb(emb, step) + + src_memory_bank = memory_bank + padding_idx = self.embeddings.padding_idx + tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(tgt_batch, tgt_len, tgt_len) + + if memory_masks is not None: + src_len = memory_masks.size(-1) + src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len) + else: + src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(src_batch, tgt_len, src_len) + + if state.cache is None: + saved_inputs = [] + attns = [] + for i in range(self.num_layers): + prev_layer_input = None + if state.cache is None: + if state.previous_input is not None: + prev_layer_input = state.previous_layer_inputs[i] + output, attn, all_input \ + = self.transformer_layers[i]( + output, src_memory_bank, + src_pad_mask, tgt_pad_mask, + previous_input=prev_layer_input, + layer_cache=state.cache['layer_{}'.format(i)] + if state.cache is not None else None, + step=step) + if state.cache is None: + saved_inputs.append(all_input) + attns.append(attn) + + if state.cache is None: + saved_inputs = torch.stack(saved_inputs) + + output = self.layer_norm(output) + + # Process the result and update the attentions. + if state.cache is None: + state.update_state(tgt, saved_inputs) - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - """return the result by the model + return output, attns, state + + +class PalmPointerGenerator(nn.Module): + + def __init__(self, hidden_size, vocab_size): + super().__init__() + self.dense = nn.Linear(hidden_size, vocab_size) + self.gen_func = nn.LogSoftmax(-1) + + def forward(self, x): + x = self.dense(x) + x = self.gen_func(x) + return x + + +class PalmPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = PalmConfig + base_model_prefix = 'palm' + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + @classmethod + def _from_pretrained( + cls, pretrained_model_name_or_path: Optional[Union[str, + os.PathLike]], + **kwargs): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + config = PalmConfig.from_json_file(config_file) if os.path.isfile( + config_file) else PalmConfig() + config.encoder_pth = os.path.join(pretrained_model_name_or_path, + config.encoder_pth) + checkpoint_file = os.path.join(pretrained_model_name_or_path, + WEIGHTS_NAME) + checkpoint = torch.load(checkpoint_file) if os.path.isfile( + checkpoint_file) else None + return cls(config, checkpoint, **kwargs) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. Args: - input (Dict[str, Tensor]): the preprocessed data + kwargs: Input args. 
+ model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). Returns: - Dict[str, Tensor]: results - Example: - { - 'loss': Tensor([12.34]), # loss for backward - } + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ - return self.model(**input) - def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + model_dir = kwargs.pop('model_dir') + model = cls._from_pretrained( + pretrained_model_name_or_path=model_dir, **kwargs) + model.model_dir = model_dir + return model + + +class AbsSummarizer(PalmPreTrainedModel): # Model + + def __init__(self, config, checkpoint=None, **kwargs): + super().__init__(config, **kwargs) + self.config = config + if config.encoder == 'bert' or config.encoder == 'zh_bert': + self.bert = BertModel( + BertConfig.from_pretrained(config.encoder_pth)) + elif config.encoder == 'roberta': + self.bert = RobertaModel( + RobertaConfig.from_pretrained(config.encoder_pth)) + + if config.max_pos > 512: + my_pos_embeddings = nn.Embedding( + config.max_pos, self.bert.model.config.hidden_size) + my_pos_embeddings.weight.data[: + 512] = self.bert.embeddings.position_embeddings.weight.data + my_pos_embeddings.weight.data[ + 512:] = self.bert.embeddings.position_embeddings.weight.data[ + -1][None, :].repeat(config.max_pos - 512, 1) + self.bert.model.embeddings.position_embeddings = my_pos_embeddings + self.vocab_size = self.bert.config.vocab_size + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=1 if config.encoder == 'roberta' else 0) + + if config.share_emb: + tgt_embeddings.weight = copy.deepcopy( + self.bert.model.embeddings.word_embeddings.weight) + self.decoder = TransformerDecoder( + config.dec_layers, + config.dec_hidden_size, + heads=config.dec_heads, + d_ff=config.dec_ff_size, + dropout=config.dec_dropout, + embeddings=tgt_embeddings) + self.generator = PalmPointerGenerator(config.dec_hidden_size, + self.vocab_size) + self.generator.dense.weight = self.decoder.embeddings.weight + + if checkpoint is not None: + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + for key in list(checkpoint.keys()): + checkpoint[key.replace('model.palm.', '')] = checkpoint[key] + self.load_state_dict(checkpoint, strict=False) + else: + for module in self.decoder.modules(): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + for p in self.generator.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + p.data.zero_() + if config.use_bert_emb: + if config.encoder == 'roberta': + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=1) + else: + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=0) + tgt_embeddings.weight = copy.deepcopy( + self.bert.embeddings.word_embeddings.weight) + self.decoder.embeddings = tgt_embeddings + self.generator.dense.weight = self.decoder.embeddings.weight + + def forward(self, src, tgt, mask_src): + top_vec, _ = self.bert(src, mask_src, return_dict=False) + state = 
TransformerDecoderState(src) + decoder_outputs, attns, _ = self.decoder(state, tgt[:, :-1], top_vec) + return decoder_outputs, attns[-1], top_vec + + +class LabelSmoothingLoss(nn.Module): + """ + With label smoothing, + KL-divergence between q_{smoothed ground truth prob.}(w) + and p_{prob. computed by model}(w) is minimized. + """ + + def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): + assert 0.0 < label_smoothing <= 1.0 + self.padding_idx = ignore_index + super(LabelSmoothingLoss, self).__init__() + + smoothing_value = label_smoothing / (tgt_vocab_size - 2) + one_hot = torch.full((tgt_vocab_size, ), smoothing_value) + one_hot[self.padding_idx] = 0 + self.register_buffer('one_hot', one_hot.unsqueeze(0)) + self.confidence = 1.0 - label_smoothing + + def forward(self, output, target): + """ + output (FloatTensor): batch_size x n_classes + target (LongTensor): batch_size + """ + model_prob = self.one_hot.repeat(target.size(0), 1) + model_prob.scatter_(1, target.unsqueeze(1), self.confidence) + model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0) + + return F.kl_div(output, model_prob, reduction='sum') + + +class NMTLossCompute(nn.Module): + """ + Standard NMT Loss Computation. + """ + + def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0): + super().__init__() + self.generator = generator + self.padding_idx = symbols['PAD'] + if label_smoothing > 0: + self.criterion = LabelSmoothingLoss( + label_smoothing, vocab_size, ignore_index=self.padding_idx) + else: + self.criterion = nn.NLLLoss( + ignore_index=self.padding_idx, reduction='sum') + + def _bottle(self, _v): + return _v.view(-1, _v.size(2)) + + def _unbottle(self, _v, batch_size): + return _v.view(-1, batch_size, _v.size(1)) + + def forward(self, tgt, output): + target = tgt[:, 1:] + normalization = target.ne(self.padding_idx).sum() + bottled_output = self._bottle(output) + scores = self.generator(bottled_output) + gtruth = target.contiguous().view(-1) + loss = self.criterion(scores, gtruth) + loss.div(float(normalization)) + return loss + + +class Translator(object): + """ + Uses a model to translate a batch of sentences. 
+ """ + + @dataclass + class Batch: + batch_size: int + src: torch.Tensor + tgt: torch.Tensor + mask_src: torch.Tensor + query_id: List[None] = None + src_str: List[List[str]] = None + tgt_str: List[str] = None + + def __init__(self, model, dataset: str = 'cnn'): + super().__init__() + self.logger = logging.get_logger(__name__) + self.args = model.config + self.args.dataset = dataset + self.model = model.palm + self.generator = self.model.generator + self.vocab = model.tokenizer + self.symbols = model.symbols + self.start_token = self.symbols['BOS'] + self.end_token = self.symbols['EOS'] + self.alpha = self.args.alpha + self.beam_size = self.args.beam_size + self.min_length = self.args.min_length + self.max_length = self.args.max_length + + def from_batch(self, translation_batch): + batch = translation_batch['batch'] + assert (len(translation_batch['gold_score']) == len( + translation_batch['predictions'])) + batch_size = batch.batch_size + + preds, pred_score, tgt_str, src, src_str = translation_batch[ + 'predictions'], translation_batch[ + 'scores'], batch.tgt_str, batch.src, batch.src_str + query_id = batch.query_id + ''' + try: + query_id = batch.query_id + except: + query_id = None + ''' + translations = [] + for b in range(batch_size): + if self.args.dataset == 'qg_ranking_test': + if self.args.encoder == 'bert' or self.args.encoder == 'zh_bert': + pred_sents = [ + ' '.join( + self.vocab.convert_ids_to_tokens( + [int(n) for n in each])).replace(' ##', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = [ + self.vocab.decode([int(n) for n in each + ]).replace('', + '').replace('', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = self.vocab.decode([int(n) + for n in preds[b][0]]).replace( + '', + '').replace('', '') + elif self.args.encoder == 'bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ' '.join(pred_sents).replace(' ##', '') + elif self.args.encoder == 'zh_bert' and self.args.dataset == 'paraphrase': + pred_sents = [ + self.vocab.convert_ids_to_tokens([int(n) for n in pred]) + for pred in preds[b] + ] + pred_sents = [ + ''.join(pred).replace(' ##', '') for pred in pred_sents + ] + elif self.args.encoder == 'zh_bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ''.join(pred_sents).replace('##', '') + gold_sent = tgt_str[b] + + if self.args.encoder == 'roberta': + raw_src = self.vocab.decode([int(t) for t in src[b]]) + raw_src = ' '.join(src_str[b]) + else: + raw_src = [self.vocab.ids_to_tokens[int(t)] + for t in src[b]][:500] + raw_src = ' '.join(raw_src) + if self.args.dataset == 'faq': + translation = (pred_sents, gold_sent, src_str[b], query_id[b], + pred_score[b]) + else: + translation = (pred_sents, gold_sent, raw_src, query_id[b], + pred_score[b]) + # translation = (pred_sents[0], gold_sent) + translations.append(translation) + + return translations + + def translate(self, data_iter, step): + gold_path = self.args.result_path + '.%d.gold' % step + can_path = self.args.result_path + '.%d.candidate' % step + self.gold_out_file = codecs.open(gold_path, 'w', 'utf-8') + self.can_out_file = codecs.open(can_path, 'w', 'utf-8') + self.pred_json_score_out_file = codecs.open(can_path + '.sample', 'w', + 'utf-8') + if self.args.dataset == 'paraphrase' and self.args.encoder == 'roberta': + out = '\t'.join([ + 'query_id', 'source_query', 'target_query', 'predict_query' + ]) + '\n' + 
self.pred_json_score_out_file.write(out) + + raw_src_path = self.args.result_path + '.%d.raw_src' % step + self.src_out_file = codecs.open(raw_src_path, 'w', 'utf-8') + + pred_results, gold_results = [], [] + cnt = 0 + pred_dict, ref_dict = {}, {} + for i, batch in enumerate(data_iter): + self.logger.info(f'data: {i + 1} / {len(data_iter)}') + batch_data = self.translate_batch(batch) + translations = self.from_batch(batch_data) + + for trans in translations: + pred, gold, src, query_id, pred_score = trans + src = src.replace('', '').replace('##', '').strip() + if self.args.dataset == 'qg_ranking_test': + pred_str = '\t'.join([ + each.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', + '').replace(r' +', ' ').replace( + '', + '').replace('', '').replace( + '', + '').replace('', '').replace( + '', ' ').strip() + for each in pred + ]) + else: + pred_str = pred.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', '').replace('[CLS]', '').replace( + '[SEP]', '').replace('[UNK]', '').strip() + pred_str = pred_str.replace(r' +', ' ').replace( + '', + '').replace('', '').replace('', '').replace( + '', '').replace('', ' ').strip() + gold_str = gold.replace('', '').strip().replace( + '[UNK]', '').replace('[unused1]', '').replace( + '[unused2]', + '').replace('##', '').replace('[CLS]', '').replace( + '[SEP]', '').strip().replace('', '').replace( + '', '').replace('', ' ').strip() + if self.args.recall_eval: + _pred_str = '' + for sent in pred_str.split(''): + can_pred_str = _pred_str + '' + sent.strip() + if len(can_pred_str.split()) >= len( + gold_str.split()) + 10: + pred_str = _pred_str + break + else: + _pred_str = can_pred_str + + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + pred_str = pred_str.replace('', ' ') + if query_id is not None: + pred_json = { + 'query_id': query_id, + 'answers': [pred_str] + } + gold_json = { + 'query_id': query_id, + 'answers': [gold_str] + } + pred_json_score = { + 'query_id': query_id, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + else: + pred_json = {'query_id': cnt, 'answers': [pred_str]} + gold_json = {'query_id': cnt, 'answers': [gold_str]} + pred_json_score = { + 'query_id': cnt, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + json.dump(pred_json, self.can_out_file) + self.can_out_file.write('\n') + json.dump(gold_json, self.gold_out_file) + self.gold_out_file.write('\n') + json.dump(pred_json_score, self.pred_json_score_out_file) + self.pred_json_score_out_file.write('\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'cnn': + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write(gold_str + '\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'dureader': + if query_id is None: + query_id = str(cnt) + pred_results.extend(normalize([pred_str])) + gold_results.extend(normalize([gold_str])) + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write('\t'.join([src[0], gold_str]) + + '\n') + + elif self.args.dataset == 'paraphrase': + if query_id is None: + query_id = str(cnt) + if self.args.encoder == 'roberta': + pred_str = [pred_str] + pred_dict[query_id] = normalize([pred_str[0]]) + ref_dict[query_id] = normalize([gold_str]) + self.pred_json_score_out_file.write( + 
'\t'.join([str(query_id), src, gold_str, pred_str[0]]) + + '\n') + elif self.args.dataset == 'faq': + if pred_score[0].cpu().numpy().tolist() < -3.5: + continue + self.can_out_file.write( + '\t'.join([str(query_id), src, pred_str]) + '\n') + self.gold_out_file.write( + '\t'.join([str(query_id), src, gold_str]) + '\n') + # passage, answer, question, score + self.pred_json_score_out_file.write('\t'.join([ + str(query_id), gold_str, src, pred_str, + str(pred_score[0].cpu().numpy().tolist()) + ]) + '\n') + elif self.args.dataset == 'qg_ranking_test': + self.can_out_file.write( + str(query_id) + '\t' + pred_str + '\n') + + cnt += 1 + self.can_out_file.flush() + self.gold_out_file.flush() + self.src_out_file.flush() + self.logger.info('cnt: %s' % cnt) + self.can_out_file.close() + self.gold_out_file.close() + self.src_out_file.close() + + if step != -1: + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + cnn_results = subprocess.getoutput( + './run.sh %s %s' % (gold_path, can_path)) # run.sh ... + self.logger.info(cnn_results) + elif self.args.dataset == 'cnn': + self.logger.info('Calculating Rouge') + from rouge import Rouge + candidates = [ + line.strip() for line in open(can_path, encoding='utf-8') + ] + references = [ + line.strip() for line in open(gold_path, encoding='utf-8') + ] + rouge_score = Rouge().get_scores( + candidates, references, avg=True) + # self.logger.info('Rouges at step %d \n%s' % (step, rouge_results_to_str(rouges))) + print(rouge_score) + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + + def postprocess_text(preds, labels): + preds = [pred.strip().replace('.', '') for pred in preds] + labels = [label.strip() for label in labels] + while '' in preds: + idx = preds.index('') + preds[idx] = '。' + return preds, labels + + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + pred_dict = {str(i): tmp for i, tmp in enumerate(pred_results)} + gold_dict = {str(i): tmp for i, tmp in enumerate(gold_results)} + bleu_rouge = compute_bleu_rouge(pred_dict, gold_dict) + print(bleu_rouge) + # unreachable + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + bleu_score = cal_bleu(pred_results, gold_results) + from rouge import Rouge + rouge = Rouge() + rouge_score = rouge.get_scores( + pred_results, gold_results, avg=True) + print("'Dev eval result: Bleu-4={}, {}".format( + bleu_score, rouge_score)) + + def translate_batch(self, batch: 'Batch', fast: bool = False): + """ + Translate a batch of sentences. + + Mostly a wrapper around :obj:`Beam`. + + Args: + batch (:obj:`Batch`): a batch from a dataset object + data (:obj:`Dataset`): the dataset object + fast (bool): enables fast beam search (may not support all features) + + Todo: + Shouldn't need the original dataset. 
+ """ + self.model.eval() + with torch.no_grad(): + return self._fast_translate_batch( + batch, self.max_length, min_length=self.min_length) + + def _tile(self, x, count, dim=0): + perm = list(range(len(x.size()))) + if dim != 0: + perm[0], perm[dim] = perm[dim], perm[0] + x = x.permute(perm).contiguous() + out_size = list(x.size()) + out_size[0] *= count + batch = x.size(0) + x = x.view(batch, -1) \ + .transpose(0, 1) \ + .repeat(count, 1) \ + .transpose(0, 1) \ + .contiguous() \ + .view(*out_size) + if dim != 0: + x = x.permute(perm).contiguous() + return x + + def _top_k_top_p_filtering(self, + logits, + top_k=10, + top_p=1.0, + filter_value=-float('Inf'), + min_tokens_to_keep=1): + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), + logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + def _fast_translate_batch(self, + batch: 'Batch', + max_length: int, + min_length: int = 0): + # TODO: faster code path for beam_size == 1. + # TODO: support these blacklisted features. + + beam_size = self.beam_size + batch_size = batch.batch_size + src = batch.src + mask_src = batch.mask_src + + src_features, _ = self.model.bert(src, mask_src, return_dict=False) + state = TransformerDecoderState(src, self.model.decoder.num_layers) + device = src_features.device + + # Tile states and memory beam_size times. + state.map_batch_fn( + lambda state, dim: self._tile(state, beam_size, dim=dim)) + src_features = self._tile(src_features, beam_size, dim=0) + batch_offset = torch.arange( + batch_size, dtype=torch.long, device=device) + beam_offset = torch.arange( + 0, + batch_size * beam_size, + step=beam_size, + dtype=torch.long, + device=device) + alive_seq = torch.full([batch_size * beam_size, 1], + self.start_token, + dtype=torch.long, + device=device) + + # Give full probability to the first beam on the first step. + topk_log_probs = ( + torch.tensor( + [0.0] + [float('-inf')] * (beam_size - 1), + device=device).repeat(batch_size)) + + # Structure that holds finished hypotheses. + hypotheses = [[] for _ in range(batch_size)] # noqa: F812 + + results = {} + results['predictions'] = [[] for _ in range(batch_size)] # noqa: F812 + results['scores'] = [[] for _ in range(batch_size)] # noqa: F812 + results['gold_score'] = [0] * batch_size + results['batch'] = batch + + for step in range(max_length): + decoder_input = alive_seq[:, -1].view(1, -1) + + # Decoder forward. 
+ decoder_input = decoder_input.transpose(0, 1) + dec_out, attns, state = self.model.decoder( + state, decoder_input, src_features, step=step) + + # Generator forward. + log_probs = self.generator.forward( + dec_out.transpose(0, 1).squeeze(0)) + vocab_size = log_probs.size(-1) + + if step < min_length: + log_probs[:, self.end_token] = -1e20 + + # Multiply probs by the beam probability. + + length_penalty = ((5.0 + (step + 1)) / 6.0)**self.alpha + if self.args.sample_topk: + temperature = self.args.temperature + _scores = log_probs / temperature + _scores = self._top_k_top_p_filtering( + _scores, + top_k=self.args.top_k, + top_p=self.args.top_p, + min_tokens_to_keep=1 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next words for each beam (so we have some spare tokens + # and match output of greedy beam search) + topk_ids = torch.multinomial( + F.softmax(_scores, dim=-1), + num_samples=1) # (batch_size * num_beams, 2) + # Compute next scores + _scores = F.log_softmax( + _scores, dim=1) # (batch_size * num_beams, vocab_size) + + _scores += topk_log_probs.view(-1).unsqueeze(1) + _scores = _scores / length_penalty + topk_scores = torch.gather( + _scores, -1, topk_ids) # (batch_size * num_beams, 2) + # Match shape of greedy beam search + topk_ids = topk_ids.view( + -1, beam_size) # (batch_size, 2 * num_beams) + topk_scores = topk_scores.view( + -1, beam_size) # (batch_size, 2 * num_beams) + else: + log_probs += topk_log_probs.view(-1).unsqueeze(1) + curr_scores = log_probs / length_penalty + + curr_scores = curr_scores.reshape(-1, beam_size * vocab_size) + topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1) + if self.args.block_trigram: + cur_len = alive_seq.size(1) + if cur_len > 3: + for i in range(alive_seq.size(0)): + fail = False + words = [int(w) for w in alive_seq[i]] + if self.args.encoder == 'roberta': + words = self.vocab.decode(words).strip().split() + else: + words = [ + self.vocab.ids_to_tokens[w] for w in words + ] + words = ' '.join(words).replace(' ##', '').split() + if len(words) <= 3: + continue + trigrams = [(words[i - 1], words[i], words[i + 1]) + for i in range(1, + len(words) - 1)] + trigram = tuple(trigrams[-1]) + if trigram in trigrams[:-1]: + fail = True + if fail: + curr_scores[i] = -10e20 + # Recover log probs. + topk_log_probs = topk_scores * length_penalty + + # Resolve beam origin and true word ids. + topk_beam_index = topk_ids // vocab_size + topk_ids = topk_ids.fmod(vocab_size) + + # Map beam_index to batch_index in the flat representation. + batch_index = ( + topk_beam_index + + beam_offset[:topk_beam_index.size(0)].unsqueeze(1)) + select_indices = batch_index.view(-1) + + # Append last prediction. + alive_seq = torch.cat([ + alive_seq.index_select(0, select_indices), + topk_ids.view(-1, 1) + ], -1) + + is_finished = topk_ids.eq(self.end_token) + if step + 1 == max_length: + is_finished.fill_(self.end_token) + # End condition is top beam is finished. + end_condition = is_finished[:, 0].eq(1) + # Save finished hypotheses. + if is_finished.any(): + predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1)) + for i in range(is_finished.size(0)): + b = batch_offset[i] + if end_condition[i]: + is_finished[i].fill_(self.end_token) + finished_hyp = is_finished[i].nonzero().view(-1) + # Store finished hypotheses for this batch. + for j in finished_hyp: + hypotheses[b].append( + (topk_scores[i, j], predictions[i, j, 1:])) + # If the batch reached the end, save the n_best hypotheses. 
+ if end_condition[i]: + best_hyp = sorted( + hypotheses[b], key=lambda x: x[0], reverse=True) + if self.args.dataset == 'qg_ranking_test' or ( + self.args.dataset == 'paraphrase' + and not self.args.sample_topk): + for each in best_hyp[:beam_size]: + score, pred = each + results['scores'][b].append(score) + results['predictions'][b].append(pred) + else: + score, pred = best_hyp[0] + results['scores'][b].append(score) + results['predictions'][b].append(pred) + non_finished = end_condition.eq(0).nonzero().view(-1) + # If all sentences are translated, no need to go further. + if len(non_finished) == 0: + break + # Remove finished batches for the next step. + topk_log_probs = topk_log_probs.index_select(0, non_finished) + batch_index = batch_index.index_select(0, non_finished) + batch_offset = batch_offset.index_select(0, non_finished) + alive_seq = predictions.index_select(0, non_finished) \ + .view(-1, alive_seq.size(-1)) + # Reorder states. + select_indices = batch_index.view(-1) + src_features = src_features.index_select(0, select_indices) + state.map_batch_fn( + lambda state, dim: state.index_select(dim, select_indices)) + + return results + + def __call__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, + **kwargs) -> Dict[str, torch.Tensor]: + batch = self.Batch( + batch_size=input_ids.size()[0], + src=input_ids, + tgt=None, + mask_src=attention_mask) + translation_batch = self.translate_batch(batch) + + preds = translation_batch['predictions'] + return {'predictions': preds} + + +@MODELS.register_module(Tasks.text_generation, module_name=Models.palm) +class PalmForTextGeneration(PalmPreTrainedModel): + + def __init__(self, config, checkpoint=None, **kwargs): + super().__init__(config, **kwargs) + self.config = config + if config.encoder == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=False) + symbols = { + 'BOS': tokenizer.cls_token_id, + 'EOS': tokenizer.sep_token_id, + 'PAD': tokenizer.pad_token_id, + 'EOQ': tokenizer.unk_token_id + } + elif config.encoder == 'bert' or config.encoder == 'zh_bert': + tokenizer = BertTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=True) + symbols = { + 'BOS': tokenizer.vocab['[CLS]'], + 'EOS': tokenizer.vocab['[SEP]'], + 'PAD': tokenizer.vocab['[PAD]'], + 'EOQ': tokenizer.vocab['[unused2]'] + } + self.tokenizer = tokenizer + self.symbols = symbols + self.palm = AbsSummarizer(config, checkpoint) + self.loss = NMTLossCompute(self.palm.generator, symbols, + self.palm.vocab_size, + config.label_smoothing) + self.generator = Translator(self) + + def forward(self, input_ids, attention_mask, labels): + output = self.palm(src=input_ids, tgt=labels, mask_src=attention_mask) + loss = self.loss(labels, output[0]) + return TextGenerationModelOutput( + loss=loss, + logits=output[0], + ) + + def generate(self, input: Dict[str, Tensor]) -> TokenGeneratorOutput: outputs = self.generator(**input) preds = outputs['predictions'] - return {'sequences': [pred[0] for pred in preds]} + return TokenGeneratorOutput(sequences=[pred[0] for pred in preds]) diff --git a/modelscope/models/nlp/ponet/backbone.py b/modelscope/models/nlp/ponet/backbone.py index f13b362b..22114f28 100644 --- a/modelscope/models/nlp/ponet/backbone.py +++ b/modelscope/models/nlp/ponet/backbone.py @@ -23,8 +23,6 @@ import torch.utils.checkpoint from packaging import version from torch import nn from transformers.activations import ACT2FN -from transformers.modeling_outputs import \ - BaseModelOutputWithPastAndCrossAttentions from 
transformers.modeling_utils import (PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, @@ -573,7 +571,7 @@ class PoNetEncoder(nn.Module): all_self_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, @@ -642,34 +640,6 @@ class PoNetPreTrainedModel(TorchModel, PreTrainedModel): return model -class PoNetPreTrainedModelV2(PreTrainedModel): - """ - A base class to handle weights initialization and a simple interface for loading pretrained models. - """ - - config_class = PoNetConfig - base_model_prefix = 'ponet' - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - @MODELS.register_module(Tasks.backbone, module_name=Models.ponet) class PoNetModel(PoNetPreTrainedModel): """The bare PoNet Model transformer outputting raw hidden-states without any specific head on top. diff --git a/modelscope/models/nlp/ponet/document_segmentation.py b/modelscope/models/nlp/ponet/document_segmentation.py index 2ef8c8b8..5e933491 100644 --- a/modelscope/models/nlp/ponet/document_segmentation.py +++ b/modelscope/models/nlp/ponet/document_segmentation.py @@ -5,13 +5,15 @@ from typing import Any, Dict import torch from torch import nn from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import TokenClassifierOutput from modelscope.metainfo import Models from modelscope.models.base import Model from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertConfig +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import Tasks -from .backbone import PoNetModel, PoNetPreTrainedModelV2 +from .backbone import PoNetModel, PoNetPreTrainedModel +from .configuration import PoNetConfig __all__ = ['PoNetForDocumentSegmentation'] @@ -20,23 +22,7 @@ __all__ = ['PoNetForDocumentSegmentation'] Tasks.document_segmentation, module_name=Models.ponet_for_ds) @MODELS.register_module( Tasks.extractive_summarization, module_name=Models.ponet_for_ds) -class PoNetForDocumentSegmentation(Model): - - def __init__(self, model_dir: str, model_config: Dict[str, Any], *args, - **kwargs): - super().__init__(model_dir, model_config, *args, **kwargs) - self.model_cfg = model_config - - def build_with_config(self, config): - self.ponet_model = PoNetForDocumentSegmentationBase.from_pretrained( - self.model_dir, config=config) - return self.ponet_model - - def forward(self) -> Dict[str, Any]: - return self.model_cfg - - -class PoNetForDocumentSegmentationBase(PoNetPreTrainedModelV2): +class PoNetForDocumentSegmentation(PoNetPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r'pooler'] def __init__(self, config): @@ -107,9 +93,24 @@ class 
PoNetForDocumentSegmentationBase(PoNetPreTrainedModelV2): output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + @classmethod + def _instantiate(cls, model_dir, model_config: Dict[str, Any], **kwargs): + if model_config['type'] == 'bert': + config = BertConfig.from_pretrained(model_dir, num_labels=2) + elif model_config['type'] == 'ponet': + config = PoNetConfig.from_pretrained(model_dir, num_labels=2) + else: + raise ValueError( + f'Expected config type bert and ponet, which is : {model_config["type"]}' + ) + model = super(Model, cls).from_pretrained(model_dir, config=config) + model.model_dir = model_dir + model.model_cfg = model_config + return model diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py index e3b358d4..e90c2b5a 100644 --- a/modelscope/models/nlp/space/model/tokenization_space.py +++ b/modelscope/models/nlp/space/model/tokenization_space.py @@ -15,14 +15,14 @@ # limitations under the License """Tokenization classes for Space. mainly copied from :module:`~transformers.tokenization_xlm_roberta`""" -from modelscope.models.nlp.structbert import (BasicTokenizer, SbertTokenizer, - WordpieceTokenizer) +from transformers import BasicTokenizer, BertTokenizer, WordpieceTokenizer + from modelscope.utils import logger as logging logger = logging.get_logger(__name__) -class SpaceTokenizer(SbertTokenizer): +class SpaceTokenizer(BertTokenizer): """ This class overrides [`SpaceTokenizer`]. Please check the superclass for the appropriate documentation alongside usage examples. 
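The rewritten `PoNetForDocumentSegmentation` above drops the old wrapper/`...Base` split and instead gains an `_instantiate` hook that picks the backbone config from the `type` field of `model_config`. A minimal sketch of just that dispatch, assuming the config classes behave like ordinary `from_pretrained` configs (the import paths are the ones the diff itself uses; the model directory in the usage comment is hypothetical):

```python
from modelscope.models.nlp.bert import BertConfig
from modelscope.models.nlp.ponet.configuration import PoNetConfig


def select_segmentation_config(model_dir: str, model_type: str, num_labels: int = 2):
    """Mirror of the config dispatch added in PoNetForDocumentSegmentation._instantiate."""
    if model_type == 'bert':
        return BertConfig.from_pretrained(model_dir, num_labels=num_labels)
    if model_type == 'ponet':
        return PoNetConfig.from_pretrained(model_dir, num_labels=num_labels)
    raise ValueError(f'Expected config type bert or ponet, got: {model_type}')


# Hypothetical usage, assuming a downloaded checkpoint in /path/to/model_dir:
# config = select_segmentation_config('/path/to/model_dir', model_type='ponet')
# model = PoNetForDocumentSegmentation.from_pretrained('/path/to/model_dir', config=config)
```

Keeping the dispatch inside `_instantiate` rather than in a separate wrapper class gives the pipeline a single loading entry point while still supporting both BERT- and PoNet-backed segmentation checkpoints.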
diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py index 60d369e0..1d81116e 100644 --- a/modelscope/models/nlp/structbert/__init__.py +++ b/modelscope/models/nlp/structbert/__init__.py @@ -24,9 +24,6 @@ if TYPE_CHECKING: from .fill_mask import SbertForMaskedLM from .text_classification import SbertForSequenceClassification from .token_classification import SbertForTokenClassification - from .tokenization import (BasicTokenizer, SbertTokenizer, - WordpieceTokenizer) - from .tokenization_fast import SbertTokenizerFast else: _import_structure = { 'backbone': ['SbertModel', 'SbertPreTrainedModel'], @@ -35,9 +32,6 @@ else: 'faq_question_answering': ['SbertForFaqQuestionAnswering'], 'text_classification': ['SbertForSequenceClassification'], 'token_classification': ['SbertForTokenClassification'], - 'tokenization': - ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], - 'tokenization_fast': ['SbertTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py index 039db3ce..9d50dc1f 100755 --- a/modelscope/models/nlp/structbert/backbone.py +++ b/modelscope/models/nlp/structbert/backbone.py @@ -18,15 +18,13 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn import torch.utils.checkpoint from packaging import version from transformers.activations import ACT2FN -from transformers.modeling_outputs import \ - BaseModelOutputWithPastAndCrossAttentions from transformers.modeling_utils import (PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, @@ -37,8 +35,8 @@ from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import AttentionBackboneModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import SbertConfig logger = get_logger(__name__) @@ -563,7 +561,7 @@ class SbertEncoder(nn.Module): all_self_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, @@ -641,29 +639,15 @@ class SbertPreTrainedModel(TorchModel, PreTrainedModel): """ model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: - config = SbertConfig(**kwargs) + config = SbertConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + 
pretrained_model_name_or_path=model_dir, **model_args) return model diff --git a/modelscope/models/nlp/structbert/faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py index c8dbf302..a37b8b2d 100644 --- a/modelscope/models/nlp/structbert/faq_question_answering.py +++ b/modelscope/models/nlp/structbert/faq_question_answering.py @@ -14,6 +14,7 @@ from modelscope.metainfo import Models from modelscope.models.builder import MODELS from modelscope.models.nlp.structbert import SbertConfig, SbertModel from modelscope.models.nlp.task_models.task_model import BaseTaskModel +from modelscope.outputs import FaqQuestionAnsweringOutput from modelscope.utils.config import Config, ConfigFields from modelscope.utils.constant import ModelFile, Tasks @@ -208,10 +209,10 @@ class SbertForFaqQuestionAnswering(BaseTaskModel): Predicted scores of all classes for each query. Examples: >>> from modelscope.hub.snapshot_download import snapshot_download - >>> from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + >>> from modelscope.preprocessors import FaqQuestionAnsweringTransformersPreprocessor >>> from modelscope.models.nlp import SbertForFaqQuestionAnswering >>> cache_path = snapshot_download('damo/nlp_structbert_faq-question-answering_chinese-base') - >>> preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained(cache_path) + >>> preprocessor = FaqQuestionAnsweringTransformersPreprocessor.from_pretrained(cache_path) >>> model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) >>> param = { >>> 'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'], @@ -270,7 +271,7 @@ class SbertForFaqQuestionAnswering(BaseTaskModel): scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) if self.metrics_layer.name == 'relation': scores = torch.sigmoid(scores) - return {'scores': scores} + return FaqQuestionAnsweringOutput(scores=scores) def _get_onehot_labels(self, labels, support_size, num_cls): labels_ = labels.view(support_size, 1) diff --git a/modelscope/models/nlp/structbert/fill_mask.py b/modelscope/models/nlp/structbert/fill_mask.py index e611aa88..ded32020 100644 --- a/modelscope/models/nlp/structbert/fill_mask.py +++ b/modelscope/models/nlp/structbert/fill_mask.py @@ -105,7 +105,7 @@ class SbertForMaskedLM(SbertPreTrainedModel): Preprocessor: This is the fill_mask model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. 
Parameters: config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with @@ -213,9 +213,9 @@ class SbertForMaskedLM(SbertPreTrainedModel): Examples: >>> from modelscope.models import Model - >>> from modelscope.preprocessors import Preprocessor, NLPPreprocessor + >>> from modelscope.preprocessors import Preprocessor, FillMaskTransformersPreprocessor >>> model = Model.from_pretrained('damo/nlp_structbert_fill-mask_chinese-large') - >>> preprocessor = NLPPreprocessor('damo/nlp_structbert_fill-mask_chinese-large') + >>> preprocessor = FillMaskTransformersPreprocessor('damo/nlp_structbert_fill-mask_chinese-large') >>> # Call the model, return some tensors >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) >>> # Call the pipeline diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py index 8797beb3..ab5b127e 100644 --- a/modelscope/models/nlp/structbert/text_classification.py +++ b/modelscope/models/nlp/structbert/text_classification.py @@ -55,7 +55,7 @@ class SbertForSequenceClassification(SbertPreTrainedModel): Preprocessor: This is the text classification model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`. Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, diff --git a/modelscope/models/nlp/structbert/token_classification.py b/modelscope/models/nlp/structbert/token_classification.py index a040ff3e..677dcf31 100644 --- a/modelscope/models/nlp/structbert/token_classification.py +++ b/modelscope/models/nlp/structbert/token_classification.py @@ -22,7 +22,7 @@ from torch.nn import CrossEntropyLoss from modelscope.metainfo import Models from modelscope.models.builder import MODELS -from modelscope.outputs import TokenClassifierOutput +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils import logger as logging from modelscope.utils.constant import Tasks from .adv_utils import compute_adv_loss @@ -50,7 +50,7 @@ class SbertForTokenClassification(SbertPreTrainedModel): Preprocessor: This is the token-classification model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.TokenClassificationPreprocessor`. + is `modelscope.preprocessors.TokenClassificationTransformersPreprocessor`. Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, @@ -168,7 +168,7 @@ class SbertForTokenClassification(SbertPreTrainedModel): - 0 for tokens that are **masked**. 
Returns: - Returns `modelscope.outputs.TokenClassifierOutput` + Returns `modelscope.outputs.AttentionTokenClassificationModelOutput` Examples: >>> from modelscope.models import Model @@ -220,10 +220,21 @@ class SbertForTokenClassification(SbertPreTrainedModel): with_attention_mask=attention_mask is not None, **outputs.kwargs) - return TokenClassifierOutput( + if label_mask is not None: + mask = label_mask + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits + + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, offset_mapping=offset_mapping, + label_mask=label_mask, ) diff --git a/modelscope/models/nlp/structbert/tokenization.py b/modelscope/models/nlp/structbert/tokenization.py deleted file mode 100644 index 3171e31d..00000000 --- a/modelscope/models/nlp/structbert/tokenization.py +++ /dev/null @@ -1,519 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert`""" - -import collections -import os -import unicodedata -from typing import List, Optional, Tuple - -from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, - _is_punctuation, _is_whitespace) - -from modelscope.utils.constant import ModelFile -from modelscope.utils.logger import get_logger - -logger = get_logger(__name__) - -VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE} - -PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'nlp_structbert_backbone_large_std': 512, - 'nlp_structbert_backbone_base_std': 512, - 'nlp_structbert_backbone_lite_std': 512, - 'nlp_structbert_backbone_tiny_std': 512, -} - -PRETRAINED_INIT_CONFIGURATION = { - 'english_sbert-large-std-512': { - 'do_lower_case': True - }, -} - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - with open(vocab_file, 'r', encoding='utf-8') as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip('\n') - vocab[token] = index - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class SbertTokenizer(PreTrainedTokenizer): - r""" - Construct a SBERT tokenizer. Based on WordPiece. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. 
- - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to do basic tokenization before WordPiece. - never_split (:obj:`Iterable`, `optional`): - Collection of tokens which will never be split during tokenization. Only has an effect when - :obj:`do_basic_tokenize=True` - unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this `issue - `__). - strip_accents: (:obj:`bool`, `optional`): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for :obj:`lowercase` (as in the original BERT). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - - def __init__(self, - vocab_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token='[UNK]', - sep_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - mask_token='[MASK]', - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - - if not os.path.isfile(vocab_file): - raise ValueError( - f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained " - 'model use `tokenizer = SbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' - ) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict([ - (ids, tok) for tok, ids in self.vocab.items() - ]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - ) - self.wordpiece_tokenizer = WordpieceTokenizer( - vocab=self.vocab, unk_token=self.unk_token) - - @property - def do_lower_case(self): - return self.basic_tokenizer.do_lower_case - - @property - def vocab_size(self): - return len(self.vocab) - - def get_vocab(self): - return dict(self.vocab, **self.added_tokens_encoder) - - def _tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize( - text, never_split=self.all_special_tokens): - - # If the token is part of the never_split set - if token in self.basic_tokenizer.never_split: - split_tokens.append(token) - else: - split_tokens += self.wordpiece_tokenizer.tokenize(token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - out_string = ' '.join(tokens).replace(' ##', '').strip() - return out_string - - def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A SBERT sequence has the following format: - - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. 
- - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=True) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ( - [0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence - pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 - + sep) * [1] - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - index = 0 - if os.path.isdir(save_directory): - vocab_file = os.path.join( - save_directory, - (filename_prefix + '-' if filename_prefix else '') - + VOCAB_FILES_NAMES['vocab_file']) - else: - vocab_file = (filename_prefix - + '-' if filename_prefix else '') + save_directory - with open(vocab_file, 'w', encoding='utf-8') as writer: - for token, token_index in sorted( - self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - f'Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive.' - ' Please check that the vocabulary is not corrupted!') - index = token_index - writer.write(token + '\n') - index += 1 - return (vocab_file, ) - - -class BasicTokenizer(object): - """ - Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). - - Args: - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - never_split (:obj:`Iterable`, `optional`): - Collection of tokens which will never be split during tokenization. Only has an effect when - :obj:`do_basic_tokenize=True` - tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this `issue - `__). - strip_accents: (:obj:`bool`, `optional`): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for :obj:`lowercase` (as in the original BERT). - """ - - def __init__(self, - do_lower_case=True, - never_split=None, - tokenize_chinese_chars=True, - strip_accents=None): - if never_split is None: - never_split = [] - self.do_lower_case = do_lower_case - self.never_split = set(never_split) - self.tokenize_chinese_chars = tokenize_chinese_chars - self.strip_accents = strip_accents - - def tokenize(self, text, never_split=None): - """ - Basic Tokenization of a piece of text. 
Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. - - Args: - **never_split**: (`optional`) list of str - Kept for backward compatibility purposes. Now implemented directly at the base class level (see - :func:`PreTrainedTokenizer.tokenize`) List of token not to split. - """ - # union() returns a new set by concatenating the two sets. - never_split = self.never_split.union( - set(never_split)) if never_split else self.never_split - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - if self.tokenize_chinese_chars: - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if token not in never_split: - if self.do_lower_case: - token = token.lower() - if self.strip_accents is not False: - token = self._run_strip_accents(token) - elif self.strip_accents: - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token, never_split)) - - output_tokens = whitespace_tokenize(' '.join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize('NFD', text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == 'Mn': - continue - output.append(char) - return ''.join(output) - - def _run_split_on_punc(self, text, never_split=None): - """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return [''.join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(' ') - output.append(char) - output.append(' ') - else: - output.append(char) - return ''.join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. 
- if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) - or (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F) - or (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF) - or (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F)): - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xFFFD or _is_control(char): - continue - if _is_whitespace(char): - output.append(' ') - else: - output.append(char) - return ''.join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token, max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """ - Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform - tokenization using the given vocabulary. - - For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = ''.join(chars[start:end]) - if start > 0: - substr = '##' + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens diff --git a/modelscope/models/nlp/structbert/tokenization_fast.py b/modelscope/models/nlp/structbert/tokenization_fast.py deleted file mode 100644 index 6f7b7ba7..00000000 --- a/modelscope/models/nlp/structbert/tokenization_fast.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Fast Tokenization classes for Sbert. 
mainly copied from :module:`~transformers.tokenization_bert_fast`""" - -from typing import List, Optional, Tuple - -import json -import transformers -from tokenizers import normalizers -from transformers.tokenization_utils_fast import PreTrainedTokenizerFast - -from modelscope.utils.constant import ModelFile -from modelscope.utils.logger import get_logger -from .tokenization import SbertTokenizer - -logger = get_logger(__name__) - -VOCAB_FILES_NAMES = { - 'vocab_file': ModelFile.VOCAB_FILE, - 'tokenizer_file': 'tokenizer.json' -} - -PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': {}, - 'tokenizer_file': {}, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'nlp_structbert_backbone_large_std': 512, - 'nlp_structbert_backbone_base_std': 512, - 'nlp_structbert_backbone_lite_std': 512, - 'nlp_structbert_backbone_tiny_std': 512, -} - -PRETRAINED_INIT_CONFIGURATION = { - 'english_sbert-large-std-512': { - 'do_lower_case': True - }, -} - -transformers.SLOW_TO_FAST_CONVERTERS[ - 'SbertTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS['BertTokenizer'] - - -class SbertTokenizerFast(PreTrainedTokenizerFast): - r""" - Construct a "fast" SBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to clean the text before tokenization by removing any control characters and replacing all - whitespaces by the classic one. - tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this - issue `__). - strip_accents: (:obj:`bool`, `optional`): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for :obj:`lowercase` (as in the original BERT). - wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`): - The prefix for subwords. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = SbertTokenizer - - def __init__(self, - vocab_file=None, - tokenizer_file=None, - do_lower_case=True, - unk_token='[UNK]', - sep_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - mask_token='[MASK]', - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs): - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - do_lower_case=do_lower_case, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - - pre_tok_state = json.loads( - self.backend_tokenizer.normalizer.__getstate__()) - if (pre_tok_state.get('lowercase', do_lower_case) != do_lower_case - or pre_tok_state.get('strip_accents', - strip_accents) != strip_accents): - pre_tok_class = getattr(normalizers, pre_tok_state.pop('type')) - pre_tok_state['lowercase'] = do_lower_case - pre_tok_state['strip_accents'] = strip_accents - self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state) - - self.do_lower_case = do_lower_case - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A SBERT sequence has the following format: - - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - - if token_ids_1: - output += token_ids_1 + [self.sep_token_id] - - return output - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence - pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). 
- """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 - + sep) * [1] - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - files = self._tokenizer.model.save( - save_directory, name=filename_prefix) - return tuple(files) diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py index 9360ec08..f6214e9c 100644 --- a/modelscope/models/nlp/task_models/feature_extraction.py +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -5,12 +5,10 @@ import numpy as np from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertConfig from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys +from modelscope.outputs import FeatureExtractionOutput, OutputKeys from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping __all__ = ['FeatureExtractionModel'] @@ -31,9 +29,9 @@ class FeatureExtractionModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) - def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> FeatureExtractionOutput: # backbone do not need labels, only head need for loss compute input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) sequence_output = outputs.last_hidden_state - return {OutputKeys.TEXT_EMBEDDING: sequence_output} + return FeatureExtractionOutput(text_embedding=sequence_output) diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index ce0e21a3..3a8380a6 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -7,7 +7,7 @@ from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys +from modelscope.outputs import InformationExtractionOutput, OutputKeys from modelscope.utils.constant import Tasks __all__ = ['InformationExtractionModel'] @@ -31,9 +31,9 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) self.build_head(self.head_cfg) - def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> InformationExtractionOutput: outputs = super().forward(input) sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output, input['text'], input['offsets']) - return {OutputKeys.SPO_LIST: outputs} + return InformationExtractionOutput(spo_list=outputs) diff --git a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py index 79ce365d..864a04d3 100644 --- a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py @@ -12,7 +12,7 @@ from transformers import AutoConfig, AutoModel from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.builder import MODELS -from 
modelscope.outputs import TokenClassifierWithPredictionsOutput +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import ModelFile, Tasks __all__ = [ @@ -115,7 +115,7 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): - 0 for tokens that are **masked**. Returns: - Returns `modelscope.outputs.TokenClassifierOutput` + Returns `modelscope.outputs.AttentionTokenClassificationModelOutput` Examples: >>> from modelscope.models import Model @@ -138,17 +138,16 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def postprocess(self, input: Dict[str, Any], **kwargs): predicts = self.model.decode(input) - offset_len = len(input['offset_mapping']) - predictions = torch.narrow( - predicts, 1, 0, - offset_len) # index_select only move loc, not resize - return TokenClassifierWithPredictionsOutput( + offset_mapping = input.get('offset_mapping') + mask = input.get('label_mask') + return AttentionTokenClassificationModelOutput( loss=None, logits=None, hidden_states=None, attentions=None, - offset_mapping=input['offset_mapping'], - predictions=predictions, + label_mask=mask, + offset_mapping=offset_mapping, + predictions=predicts, ) diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 982bce32..0e216496 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -1,18 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict -import numpy as np import torch from modelscope.metainfo import Models, TaskModels from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys, TokenClassifierOutput +from modelscope.outputs import (AttentionTokenClassificationModelOutput, + OutputKeys) from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) __all__ = ['TokenClassificationModel'] @@ -48,7 +46,10 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) self.build_head(self.head_cfg) - def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward( + self, + **input: Dict[str, + Any]) -> AttentionTokenClassificationModelOutput: labels = None if OutputKeys.LABEL in input: labels = input.pop(OutputKeys.LABEL) @@ -62,16 +63,23 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if labels in input: loss = self.compute_loss(outputs, labels) - # apply label mask to logits - logits = logits[input['label_mask']].unsqueeze(0) + if 'label_mask' in input: + mask = input['label_mask'] + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - offset_mapping=input['offset_mapping'], - ) + offset_mapping=input.get('offset_mapping'), + label_mask=input.get('label_mask')) def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() diff --git 
a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py index 0774e9b4..5f70f3f6 100644 --- a/modelscope/models/nlp/veco/__init__.py +++ b/modelscope/models/nlp/veco/__init__.py @@ -23,8 +23,6 @@ if TYPE_CHECKING: from .text_classification import VecoForSequenceClassification from .token_classification import VecoForTokenClassification from .fill_mask import VecoForMaskedLM - from .tokenization import VecoTokenizer - from .tokenization_fast import VecoTokenizerFast else: _import_structure = { 'configuration': ['VecoConfig'], @@ -32,8 +30,6 @@ else: 'text_classification': ['VecoForSequenceClassification'], 'fill_mask': ['VecoForMaskedLM'], 'token_classification': ['VecoForTokenClassification'], - 'tokenization': ['VecoTokenizer'], - 'tokenization_fast': ['VecoTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/veco/fill_mask.py b/modelscope/models/nlp/veco/fill_mask.py index de2cdb4a..fc37f920 100644 --- a/modelscope/models/nlp/veco/fill_mask.py +++ b/modelscope/models/nlp/veco/fill_mask.py @@ -40,7 +40,7 @@ class VecoForMaskedLM(TorchModel, RobertaForMaskedLM): Preprocessor: This is the fill_mask model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. Parameters: config ([`VecoConfig`]): Model configuration class with all the parameters of the diff --git a/modelscope/models/nlp/veco/text_classification.py b/modelscope/models/nlp/veco/text_classification.py index e4e74d8f..64f3aadd 100644 --- a/modelscope/models/nlp/veco/text_classification.py +++ b/modelscope/models/nlp/veco/text_classification.py @@ -22,7 +22,7 @@ from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import AttentionTextClassificationModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import VecoConfig @@ -46,7 +46,7 @@ class VecoForSequenceClassification(TorchModel, Preprocessor: This is the text classification model of Veco, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`. 
Trainer: This model should be trained by dataset which has mixed languages, @@ -124,27 +124,13 @@ class VecoForSequenceClassification(TorchModel, """ model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: - config = VecoConfig(**kwargs) + config = VecoConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + pretrained_model_name_or_path=model_dir, **model_args) return model diff --git a/modelscope/models/nlp/veco/token_classification.py b/modelscope/models/nlp/veco/token_classification.py index f6252209..4fc96c71 100644 --- a/modelscope/models/nlp/veco/token_classification.py +++ b/modelscope/models/nlp/veco/token_classification.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch from transformers import RobertaForTokenClassification from modelscope.metainfo import Models @@ -22,7 +23,7 @@ from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import VecoConfig @@ -58,6 +59,7 @@ class VecoForTokenClassification(TorchModel, RobertaForTokenClassification): def forward(self, *args, **kwargs): kwargs['return_dict'] = True outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTokenClassificationModelOutput( loss=outputs.loss, logits=outputs.logits, @@ -81,27 +83,13 @@ class VecoForTokenClassification(TorchModel, RobertaForTokenClassification): """ model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: - config = VecoConfig(**kwargs) + config = VecoConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + pretrained_model_name_or_path=model_dir, **model_args) return model diff --git a/modelscope/models/nlp/veco/tokenization.py 
b/modelscope/models/nlp/veco/tokenization.py deleted file mode 100644 index 21711456..00000000 --- a/modelscope/models/nlp/veco/tokenization.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -"""Tokenization classes for Veco. mainly copied from :module:`~transformers.tokenization_xlm_roberta`""" - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer - -from modelscope.utils import logger as logging - -logger = logging.get_logger(__name__) - -SPIECE_UNDERLINE = '▁' - -VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} - -PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - - -class VecoTokenizer(PreTrainedTokenizer): - """ - Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on - [SentencePiece](https://github.com/google/sentencepiece). - - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of - sequence. The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. 
This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. - sp_model_kwargs (`dict`, *optional*): - Will be passed to the `SentencePieceProcessor.__init__()` method. - The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) - can be used, among other things, to set: - - - `enable_sampling`: Enable subword regularization. - - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - - - `nbest_size = {0,1}`: No sampling is performed. - - `nbest_size > 1`: samples from the nbest_size results. - - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) - using forward-filtering-and-backward-sampling algorithm. - - - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for - BPE-dropout. - - Attributes: - sp_model (`SentencePieceProcessor`): - The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ['input_ids', 'attention_mask'] - - def __init__(self, - vocab_file, - bos_token='', - eos_token='', - sep_token='', - cls_token='', - unk_token='', - pad_token='', - mask_token='', - sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs) -> None: - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken( - mask_token, lstrip=True, rstrip=False) if isinstance( - mask_token, str) else mask_token - - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) - self.vocab_file = vocab_file - - # Original fairseq vocab and spm vocab must be "aligned": - # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 - # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- - # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' - # spm | '' | '' | '' | ',' | '.' 
| '▁' | 's' | '▁de' | '-' | '▁a' - - # Mimic fairseq token-to-id alignment for the first 4 token - self.fairseq_tokens_to_ids = { - '': 0, - '': 1, - '': 2, - '': 3 - } - - # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab - self.fairseq_offset = 1 - - self.fairseq_tokens_to_ids[''] = len( - self.sp_model) + self.fairseq_offset - self.fairseq_ids_to_tokens = { - v: k - for k, v in self.fairseq_tokens_to_ids.items() - } - - def __getstate__(self): - state = self.__dict__.copy() - state['sp_model'] = None - state['sp_model_proto'] = self.sp_model.serialized_model_proto() - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, 'sp_model_kwargs'): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.LoadFromSerializedProto(self.sp_model_proto) - - def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An Veco sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=True) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ( - [0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does - not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of zeros. 
- - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - def vocab_size(self): - return len( - self.sp_model) + self.fairseq_offset + 1 # Add the token - - def get_vocab(self): - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text: str) -> List[str]: - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - spm_id = self.sp_model.PieceToId(token) - - # Need to return unknown token if the SP model returned 0 - return spm_id + self.fairseq_offset if spm_id else self.unk_token_id - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() - return out_string - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error( - f'Vocabulary path ({save_directory}) should be a directory') - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + '-' if filename_prefix else '') - + VOCAB_FILES_NAMES['vocab_file']) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file, ) diff --git a/modelscope/models/nlp/veco/tokenization_fast.py b/modelscope/models/nlp/veco/tokenization_fast.py deleted file mode 100644 index b41a5c3b..00000000 --- a/modelscope/models/nlp/veco/tokenization_fast.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -"""Fast Tokenization classes for Veco. 
mainly copied from :module:`~transformers.tokenization_xlm_roberta_fast`""" - -import os -from shutil import copyfile -from typing import List, Optional, Tuple - -import transformers -from transformers.file_utils import is_sentencepiece_available -from transformers.tokenization_utils import AddedToken -from transformers.tokenization_utils_fast import PreTrainedTokenizerFast - -from modelscope.utils import logger as logging - -if is_sentencepiece_available(): - from .tokenization import VecoTokenizer -else: - VecoTokenizer = None - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = { - 'vocab_file': 'sentencepiece.bpe.model', - 'tokenizer_file': 'tokenizer.json' -} - -PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': {}, - 'tokenizer_file': {}, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - -transformers.SLOW_TO_FAST_CONVERTERS[ - 'VecoTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS[ - 'XLMRobertaTokenizer'] - - -class VecoTokenizerFast(PreTrainedTokenizerFast): - """ - Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. - Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of - sequence. The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ['input_ids', 'attention_mask'] - slow_tokenizer_class = VecoTokenizer - - def __init__(self, - vocab_file=None, - tokenizer_file=None, - bos_token='', - eos_token='', - sep_token='', - cls_token='', - unk_token='', - pad_token='', - mask_token='', - **kwargs): - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken( - mask_token, lstrip=True, rstrip=False) if isinstance( - mask_token, str) else mask_token - - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - **kwargs, - ) - - self.vocab_file = vocab_file - self.can_save_slow_tokenizer = False if not self.vocab_file else True - - def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An Veco sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does - not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of zeros. 
- - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - if not self.can_save_slow_tokenizer: - raise ValueError( - 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' - 'tokenizer.') - - if not os.path.isdir(save_directory): - logger.error( - f'Vocabulary path ({save_directory}) should be a directory.') - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + '-' if filename_prefix else '') - + VOCAB_FILES_NAMES['vocab_file']) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file, ) diff --git a/modelscope/outputs/nlp/model_outputs.py b/modelscope/outputs/nlp/model_outputs.py index 46267007..464ba7ef 100644 --- a/modelscope/outputs/nlp/model_outputs.py +++ b/modelscope/outputs/nlp/model_outputs.py @@ -1,179 +1,13 @@ from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Optional, Tuple, Union + +import numpy as np from modelscope.outputs.outputs import ModelOutputBase Tensor = Union['torch.Tensor', 'tf.Tensor'] -@dataclass -class TextClassificationModelOutput(ModelOutputBase): - """The output class for text classification models. - - Args: - logits (`Tensor`): The logits output of the model. loss (`Tensor`, - *optional*) The loss of the model, available when training. - hidden_states (`Tensor`, *optional*) Hidden-states of the model at the - output of each layer plus the optional initial embedding outputs. - """ - - logits: Tensor = None - loss: Tensor = None - - -@dataclass -class TokenClassificationModelOutput(ModelOutputBase): - """The output class for token classification models. - logits (`Tensor`): The logits output of the model. - loss (`Tensor`, *optional*) The loss of the model, available when training. - """ - - logits: Tensor = None - loss: Tensor = None - offset_mapping: Tensor = None - - -@dataclass -class FillMaskModelOutput(ModelOutputBase): - """The output class for text classification models. - - Args: - logits (`Tensor`): The logits output of the model. - loss (`Tensor`, *optional*) The loss of the model, available when training. - input_ids (`Tensor`, *optional*) The input id tensor fed into the model. - hidden_states (`Tensor`, *optional*) Hidden-states of the model at the - output of each layer plus the optional initial embedding outputs. - """ - - logits: Tensor = None - loss: Tensor = None - input_ids: Tensor = None - hidden_states: Tensor = None - - -@dataclass -class TokenClassifierOutput(ModelOutputBase): - """ - Base class for outputs of token classification models. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when - `labels` is provided) : - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, - config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, - sequence_length)`, `optional`): - Indices of positions of each input sequence tokens in the sentence. - Selected in the range ``[0, sequence_length - 1]``. - - """ - - loss: Tensor = None - logits: Tensor = None - hidden_states: Tensor = None - attentions: Tensor = None - offset_mapping: Tensor = None - - -@dataclass -class TokenClassifierWithPredictionsOutput(ModelOutputBase): - """ - Base class for outputs of token classification models. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when - `labels` is provided) : - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, - config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, - sequence_length)`, `optional`): - Indices of positions of each input sequence tokens in the sentence. - Selected in the range ``[0, sequence_length - 1]``. - predictions: A PyTorch tensor of the best tag sequence for each batch of shape - (nbest, batch_size, seq_length) - - """ - - loss: Tensor = None - logits: Tensor = None - hidden_states: Tensor = None - attentions: Tensor = None - offset_mapping: Tensor = None - predictions: Tensor = None - - -@dataclass -class BaseModelOutput(ModelOutputBase): - """ - Base class for model's outputs, with potential hidden states and attentions. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. 
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - """ - - last_hidden_state: Tensor = None - hidden_states: Optional[Tuple[Tensor]] = None - attentions: Optional[Tuple[Tensor]] = None - - @dataclass class BackboneModelOutput(ModelOutputBase): """The output class for text classification models. @@ -196,81 +30,6 @@ class AttentionBackboneModelOutput(BackboneModelOutput): """The output class for backbones of attention based models. Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the - attention softmax, used to compute the weighted average in the - self-attention heads. - """ - attentions: Tensor = None - past_key_values: Tensor = None - cross_attentions: Tensor = None - - -@dataclass -class AttentionTextClassificationModelOutput(TextClassificationModelOutput): - """The output class for backbones of attention based models. - - Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the - attention softmax, used to compute the weighted average in the - self-attention heads. - """ - attentions: Tensor = None - hidden_states: Tensor = None - - -@dataclass -class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput): - """The output class for backbones of attention based models. - - Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the attention softmax, - used to compute the weighted average in the self-attention heads. - """ - attentions: Tensor = None - hidden_states: Tensor = None - - -@dataclass -class AttentionFillMaskModelOutput(FillMaskModelOutput): - """The output class for the fill mask and attention based models. - - Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the - attention softmax, used to compute the weighted average in the - self-attention heads. - """ - attentions: Tensor = None - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutputBase): - """ - Base class for model's outputs that also contains a pooling of the last - hidden states. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - pooler_output (`torch.FloatTensor` of shape `(batch_size, - hidden_size)`): - Last layer hidden-state of the first token of the sequence - (classification token) after further processing through the layers - used for the auxiliary pretraining task. E.g. for BERT-family of - models, this returns the classification token after processing - through a linear layer and a tanh activation function. The linear - layer weights are trained from the next sentence prediction - (classification) objective during pretraining. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. 
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): @@ -303,75 +62,8 @@ class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutputBase): can be used (see `past_key_values` input) to speed up sequential decoding. """ - - last_hidden_state: Tensor = None - pooler_output: Tensor = None - hidden_states: Tensor = None - past_key_values: Tensor = None attentions: Tensor = None - cross_attentions: Tensor = None - - -@dataclass -class BaseModelOutputWithPastAndCrossAttentions(ModelOutputBase): - """ - Base class for model's outputs that may also contain a past key/values (to - speed up sequential decoding). - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - - If `past_key_values` is used only the last hidden-state of the - sequences of shape `(batch_size, 1, hidden_size)` is output. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned - when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, - with each tuple having 2 tensors of shape `(batch_size, num_heads, - sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, - embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the - self-attention blocks and optionally if - `config.is_encoder_decoder=True` in the cross-attention blocks) that - can be used (see `past_key_values` input) to speed up sequential - decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` and `config.add_cross_attention=True` is passed - or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights of the decoder's cross-attention layer, after the - attention softmax, used to compute the weighted average in the - cross-attention heads. - """ - - last_hidden_state: Tensor = None past_key_values: Tensor = None - hidden_states: Tensor = None - attentions: Tensor = None cross_attentions: Tensor = None @@ -459,6 +151,60 @@ class Seq2SeqModelOutput(ModelOutputBase): encoder_attentions: Optional[Tuple[Tensor]] = None +@dataclass +class FaqQuestionAnsweringOutput(ModelOutputBase): + """The output class for faq QA models. 
+ """ + + scores: Tensor = None + + +@dataclass +class FeatureExtractionOutput(ModelOutputBase): + """The output class for feature extraction models. + """ + + text_embedding: Tensor = None + + +@dataclass +class FillMaskModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + input_ids (`Tensor`, *optional*) The input id tensor fed into the model. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + input_ids: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionFillMaskModelOutput(FillMaskModelOutput): + """The output class for the fill mask and attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + + +@dataclass +class InformationExtractionOutput(ModelOutputBase): + """The output class for information extraction models. + """ + + spo_list: np.ndarray = None + + @dataclass class Seq2SeqLMOutput(ModelOutputBase): """ @@ -543,6 +289,42 @@ class Seq2SeqLMOutput(ModelOutputBase): encoder_attentions: Optional[Tuple[Tensor]] = None +@dataclass +class TextClassificationModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + logits (`Tensor`): The logits output of the model. loss (`Tensor`, + *optional*) The loss of the model, available when training. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + + +@dataclass +class AttentionTextClassificationModelOutput(TextClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class TextErrorCorrectionOutput(ModelOutputBase): + """The output class for information extraction models. + """ + + predictions: np.ndarray = None + + @dataclass class TextGenerationModelOutput(ModelOutputBase): """The output class for text generation models. @@ -588,3 +370,35 @@ class TokenGeneratorOutput(ModelOutputBase): scores: Optional[Tuple[Tensor]] = None attentions: Optional[Tuple[Tuple[Tensor]]] = None hidden_states: Optional[Tuple[Tuple[Tensor]]] = None + + +@dataclass +class TokenClassificationModelOutput(ModelOutputBase): + """The output class for token classification models. + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + predictions: A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. 
+ """ + + logits: Tensor = None + loss: Tensor = None + offset_mapping: Tensor = None + predictions: Tensor = None + label_mask: Tensor = None + + +@dataclass +class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the attention softmax, + used to compute the weighted average in the self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index af264bf0..8cb8600a 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -12,7 +12,7 @@ import numpy as np from modelscope.models.base import Model from modelscope.msdatasets import MsDataset -from modelscope.outputs import TASK_OUTPUTS +from modelscope.outputs import TASK_OUTPUTS, ModelOutputBase from modelscope.pipeline_inputs import TASK_INPUTS, check_input_type from modelscope.preprocessors import Preprocessor from modelscope.utils.config import Config @@ -321,6 +321,8 @@ class Pipeline(ABC): return output_keys = TASK_OUTPUTS[task_name] missing_keys = [] + input = input.keys() if isinstance(input, + (dict, ModelOutputBase)) else input for k in output_keys: if k not in input: missing_keys.append(k) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 097ff9ee..c1634a9c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -298,6 +298,7 @@ def pipeline(task: str = None, raise ValueError('task or pipeline_name is required') model = normalize_model_input(model, model_revision) + pipeline_props = {'type': pipeline_name} if pipeline_name is None: # get default pipeline for this task if isinstance(model, str) \ @@ -309,7 +310,7 @@ def pipeline(task: str = None, model, str) else read_config( model[0], revision=model_revision) check_config(cfg) - pipeline_name = cfg.pipeline.type + pipeline_props = cfg.pipeline elif model is not None: # get pipeline info from Model object first_model = model[0] if isinstance(model, list) else model @@ -318,13 +319,15 @@ def pipeline(task: str = None, cfg = read_config(first_model.model_dir) check_config(cfg) first_model.pipeline = cfg.pipeline - pipeline_name = first_model.pipeline.type + pipeline_props = first_model.pipeline else: pipeline_name, default_model_repo = get_default_pipeline_info(task) model = normalize_model_input(default_model_repo, model_revision) + pipeline_props = {'type': pipeline_name} - cfg = ConfigDict(type=pipeline_name, model=model) - cfg.device = device + pipeline_props['model'] = model + pipeline_props['device'] = device + cfg = ConfigDict(pipeline_props) if kwargs: cfg.update(kwargs) diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py index 37cae4ce..cde70fff 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/base.py +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -61,6 +61,8 @@ class EasyCVPipeline(object): self.cfg = Config.from_file(self.config_file) if 'device' in kwargs: kwargs['device'] = create_device(kwargs['device']) + if 'predictor_config' in kwargs: + kwargs.pop('predictor_config') self.predict_op = self._build_predict_op(**kwargs) def _build_predict_op(self, **kwargs): diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index fd731ef6..eaff2144 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ 
b/modelscope/pipelines/nlp/__init__.py @@ -12,22 +12,19 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .document_segmentation_pipeline import DocumentSegmentationPipeline from .extractive_summarization_pipeline import ExtractiveSummarizationPipeline - from .fasttext_sequence_classification_pipeline import FasttextSequenceClassificationPipeline + from .fasttext_text_classification_pipeline import FasttextSequenceClassificationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline from .information_extraction_pipeline import InformationExtractionPipeline - from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline, \ - NamedEntityRecognitionThaiPipeline, \ - NamedEntityRecognitionVietPipeline + from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .text_ranking_pipeline import TextRankingPipeline from .sentence_embedding_pipeline import SentenceEmbeddingPipeline from .text_classification_pipeline import TextClassificationPipeline from .summarization_pipeline import SummarizationPipeline from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline - from .text_generation_pipeline import TextGenerationPipeline - from .text2text_generation_pipeline import Text2TextGenerationPipeline + from .text_generation_pipeline import TextGenerationPipeline, TextGenerationT5Pipeline from .token_classification_pipeline import TokenClassificationPipeline from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline, WordSegmentationThaiPipeline @@ -56,8 +53,6 @@ else: 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': [ 'NamedEntityRecognitionPipeline', - 'NamedEntityRecognitionThaiPipeline', - 'NamedEntityRecognitionVietPipeline' ], 'text_ranking_pipeline': ['TextRankingPipeline'], 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], @@ -66,7 +61,8 @@ else: ['TableQuestionAnsweringPipeline'], 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], - 'text_generation_pipeline': ['TextGenerationPipeline'], + 'text_generation_pipeline': + ['TextGenerationPipeline', 'TextGenerationT5Pipeline'], 'text2text_generation_pipeline': ['Text2TextGenerationPipeline'], 'token_classification_pipeline': ['TokenClassificationPipeline'], 'translation_pipeline': ['TranslationPipeline'], diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index afd5e29f..33e06685 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -24,18 +24,27 @@ class ConversationalTextToSqlPipeline(Pipeline): def __init__(self, model: Union[StarForTextToSql, str], preprocessor: ConversationalTextToSqlPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """use `model` and `preprocessor` to create a conversational text-to-sql prediction pipeline Args: - model (StarForTextToSql): a model instance - preprocessor (ConversationalTextToSqlPreprocessor): - a preprocessor instance + model 
(StarForTextToSql): A model instance + preprocessor (ConversationalTextToSqlPreprocessor): A preprocessor instance + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = ConversationalTextToSqlPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py index c803663b..f53f186c 100644 --- a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py @@ -22,6 +22,9 @@ class DialogIntentPredictionPipeline(Pipeline): def __init__(self, model: Union[SpaceForDialogIntent, str], preprocessor: DialogIntentPredictionPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """Use `model` and `preprocessor` to create a dialog intent prediction pipeline @@ -29,11 +32,18 @@ class DialogIntentPredictionPipeline(Pipeline): model (str or SpaceForDialogIntent): Supply either a local model dir or a model id from the model hub, or a SpaceForDialogIntent instance. preprocessor (DialogIntentPredictionPreprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = DialogIntentPredictionPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) self.categories = self.preprocessor.categories def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py index c0cd52dd..c2cf2493 100644 --- a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py @@ -21,6 +21,9 @@ class DialogModelingPipeline(Pipeline): def __init__(self, model: Union[SpaceForDialogModeling, str], preprocessor: DialogModelingPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """Use `model` and `preprocessor` to create a dialog modeling pipeline for dialog response generation @@ -28,11 +31,18 @@ class DialogModelingPipeline(Pipeline): model (str or SpaceForDialogModeling): Supply either a local model dir or a model id from the model hub, or a SpaceForDialogModeling instance. preprocessor (DialogModelingPreprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
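# [Editor's illustrative note, not part of the patch] With this change the NLP pipelines
# expose config_file/device/auto_collate explicitly and forward remaining kwargs to their
# preprocessor; the pipeline() factory in builder.py likewise now forwards extra keys
# found under cfg.pipeline. A hedged sketch with a hypothetical model id:
from modelscope.pipelines.nlp.conversational_text_to_sql_pipeline import \
    ConversationalTextToSqlPipeline

pipe = ConversationalTextToSqlPipeline(
    model='damo/some_text-to-sql_model',  # hypothetical model id or local model dir
    device='cpu',                         # overrides the 'gpu' default
    auto_collate=True)                    # extra kwargs would reach the preprocessor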
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = DialogModelingPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index b7adf904..207b4f81 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -22,6 +22,9 @@ class DialogStateTrackingPipeline(Pipeline): def __init__(self, model: Union[SpaceForDST, str], preprocessor: DialogStateTrackingPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """use `model` and `preprocessor` to create a dialog state tracking pipeline for observation of dialog states tracking after many turns of open domain dialogue @@ -30,11 +33,20 @@ class DialogStateTrackingPipeline(Pipeline): model (str or SpaceForDialogStateTracking): Supply either a local model dir or a model id from the model hub, or a SpaceForDialogStateTracking instance. preprocessor (DialogStateTrackingPreprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: self.preprocessor = DialogStateTrackingPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) self.tokenizer = self.preprocessor.tokenizer self.config = self.preprocessor.config diff --git a/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py index 325d3303..216d5302 100644 --- a/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py @@ -21,8 +21,16 @@ class DistributedGPT3Pipeline(DistributedPipeline): model = None def __init__(self, model, preprocessor=None, **kwargs): + """ + + Args: + model: The model piece, str is not supported. + preprocessor: The preprocessor matched with the model. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
+ """ if preprocessor is None: - preprocessor = TextGenerationJiebaPreprocessor(model) + preprocessor = TextGenerationJiebaPreprocessor(model, **kwargs) super().__init__(model, preprocessor=preprocessor, **kwargs) assert hasattr(preprocessor, 'tokenizer') diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py index 8499f7ff..fe42e472 100644 --- a/modelscope/pipelines/nlp/distributed_plug_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -8,7 +8,7 @@ from modelscope.metainfo import Pipelines from modelscope.models.nlp.plug import DistributedPlug from modelscope.pipelines.base import DistributedPipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TextGenerationPreprocessor +from modelscope.preprocessors import TextGenerationTransformersPreprocessor from modelscope.utils.constant import Tasks @@ -24,11 +24,12 @@ class DistributedPlugPipeline(DistributedPipeline): model, preprocessor=None, first_sequence='sentence', + sequence_length=512, **kwargs): """Create a plug pipeline instance. Args: - model: The model_id of plug(damo/nlp_plug_text-generation_27B). + model: The model_id of plug(damo/nlp_plug_text-generation_27B). The default path to damo/nlp_plug_text-generation_27B can be obtained by function get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to this path before calling this class by model_id. @@ -53,17 +54,16 @@ class DistributedPlugPipeline(DistributedPipeline): |_ mp_rank_05_model_states.pt |_ mp_rank_06_model_states.pt |_ mp_rank_07_model_states.pt - preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will + preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will be used as default. - first_sequence: The first_sequence key name if the input format is a dict. - kwargs: - sequence_length: The input sequence_length. + kwargs (dict, `optional`): Extra kwargs passed into the preprocessor's constructor. 
""" if preprocessor is None: - preprocessor = TextGenerationPreprocessor( + preprocessor = TextGenerationTransformersPreprocessor( model, first_sequence=first_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) + sequence_length=sequence_length, + **kwargs) super().__init__(model, preprocessor=preprocessor, **kwargs) assert hasattr(preprocessor, 'tokenizer') self.cls_token_id = preprocessor.tokenizer.cls_token_id diff --git a/modelscope/pipelines/nlp/document_segmentation_pipeline.py b/modelscope/pipelines/nlp/document_segmentation_pipeline.py index b29dcca7..6e2121c3 100644 --- a/modelscope/pipelines/nlp/document_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/document_segmentation_pipeline.py @@ -14,7 +14,8 @@ from modelscope.models.nlp.ponet.configuration import PoNetConfig from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import DocumentSegmentationPreprocessor +from modelscope.preprocessors import \ + DocumentSegmentationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -27,26 +28,34 @@ __all__ = ['DocumentSegmentationPipeline'] Tasks.document_segmentation, module_name=Pipelines.document_segmentation) class DocumentSegmentationPipeline(Pipeline): - def __init__(self, - model: Union[Model, str], - preprocessor: DocumentSegmentationPreprocessor = None, - **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def __init__( + self, + model: Union[Model, str], + preprocessor: DocumentSegmentationTransformersPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): + """The document segmentation pipeline. - self.model_dir = self.model.model_dir - self.model_cfg = self.model.forward() - - if self.model_cfg['type'] == 'bert': - config = BertConfig.from_pretrained(self.model_dir, num_labels=2) - elif self.model_cfg['type'] == 'ponet': - config = PoNetConfig.from_pretrained(self.model_dir, num_labels=2) - - self.document_segmentation_model = self.model.build_with_config( - config=config) + Args: + model (str or Model): Supply either a local model dir or a model id from the model hub + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. 
+ """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + self.model_dir = self.model.model_dir + self.model_cfg = self.model.model_cfg if preprocessor is None: - self.preprocessor = DocumentSegmentationPreprocessor( - self.model.model_dir, config) + self.preprocessor = DocumentSegmentationTransformersPreprocessor( + self.model_dir, self.model.config.max_position_embeddings, + **kwargs) def __call__( self, documents: Union[List[List[str]], List[str], @@ -85,8 +94,7 @@ class DocumentSegmentationPipeline(Pipeline): key: torch.tensor(val) for key, val in predict_dataset.items() } - predictions = self.document_segmentation_model.forward( - **input).logits + predictions = self.model.forward(**input).logits predictions = np.argmax(predictions, axis=2) assert len(sentences) == len( diff --git a/modelscope/pipelines/nlp/extractive_summarization_pipeline.py b/modelscope/pipelines/nlp/extractive_summarization_pipeline.py index b35ecc78..1581690e 100644 --- a/modelscope/pipelines/nlp/extractive_summarization_pipeline.py +++ b/modelscope/pipelines/nlp/extractive_summarization_pipeline.py @@ -6,15 +6,14 @@ from typing import Any, Dict, List, Union import numpy as np import torch from datasets import Dataset -from transformers.models.bert.modeling_bert import BertConfig from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp.ponet.configuration import PoNetConfig from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import DocumentSegmentationPreprocessor +from modelscope.preprocessors import \ + DocumentSegmentationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -28,31 +27,29 @@ __all__ = ['ExtractiveSummarizationPipeline'] module_name=Pipelines.extractive_summarization) class ExtractiveSummarizationPipeline(Pipeline): - def __init__(self, - model: Union[Model, str], - preprocessor: DocumentSegmentationPreprocessor = None, - **kwargs): - - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - - self.model_dir = model.model_dir - self.model_cfg = model.forward() - - if self.model_cfg['type'] == 'bert': - config = BertConfig.from_pretrained(model.model_dir, num_labels=2) - elif self.model_cfg['type'] == 'ponet': - config = PoNetConfig.from_pretrained(model.model_dir, num_labels=2) - - self.extractive_summarization_model = model.build_with_config( - config=config) + def __init__( + self, + model: Union[Model, str], + preprocessor: DocumentSegmentationTransformersPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): + + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + + self.model_dir = self.model.model_dir + self.model_cfg = self.model.model_cfg if preprocessor is None: - preprocessor = DocumentSegmentationPreprocessor( - self.model_dir, config) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - - self.preprocessor = preprocessor + self.preprocessor = DocumentSegmentationTransformersPreprocessor( + self.model_dir, self.model.config.max_position_embeddings, + **kwargs) def __call__(self, documents: Union[List[str], str]) -> Dict[str, Any]: output = self.predict(documents) @@ -80,8 +77,7 @@ class 
ExtractiveSummarizationPipeline(Pipeline): key: torch.tensor(val) for key, val in predict_dataset.items() } - logits = self.extractive_summarization_model.forward( - **input).logits + logits = self.model.forward(**input).logits predictions = np.argmax(logits, axis=2) assert len(sentences) == len( diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 46d75f49..5675144a 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -20,8 +20,24 @@ class FaqQuestionAnsweringPipeline(Pipeline): def __init__(self, model: Union[str, Model], preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + """The faq question answering pipeline. + + Args: + model (str or Model): A model instance or a model local dir or a model id in the model hub. + preprocessor (Preprocessor, `optional`): a preprocessor instance + kwargs (dict, `optional`): + The preprocessor kwargs passed into the preprocessor's constructor. + """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, **kwargs) diff --git a/modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py b/modelscope/pipelines/nlp/fasttext_text_classification_pipeline.py similarity index 85% rename from modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py rename to modelscope/pipelines/nlp/fasttext_text_classification_pipeline.py index f10af88f..a3138490 100644 --- a/modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/fasttext_text_classification_pipeline.py @@ -9,11 +9,9 @@ from fasttext import load_model from fasttext.FastText import _FastText from modelscope.metainfo import Pipelines -from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import ModelFile, Tasks __all__ = ['FasttextSequenceClassificationPipeline'] @@ -36,8 +34,7 @@ class FasttextSequenceClassificationPipeline(Pipeline): """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction Args: - model: a model directory including model.bin and spm.model - preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + model: A model directory including model.bin and spm.model """ super().__init__(model=model) model_file = os.path.join(model, ModelFile.TORCH_MODEL_BIN_FILE) @@ -53,8 +50,11 @@ class FasttextSequenceClassificationPipeline(Pipeline): text_sp = sentencepiece_tokenize(self.spm, text) return {'text_sp': text_sp, 'text': text} - def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - topk = inputs.get('topk', -1) + def forward(self, + inputs: Dict[str, Any], + topk: int = None) -> Dict[str, Any]: + if topk is None: + topk = inputs.get('topk', -1) label, probs = self.model.predict(inputs['text_sp'], k=topk) label = [x.replace('__label__', '') for x in label] result = { diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py 
b/modelscope/pipelines/nlp/feature_extraction_pipeline.py index aed78868..2ea264f0 100644 --- a/modelscope/pipelines/nlp/feature_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLPPreprocessor, Preprocessor +from modelscope.preprocessors import (FillMaskTransformersPreprocessor, + Preprocessor) from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -23,7 +24,11 @@ class FeatureExtractionPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + padding=False, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp feature extraction pipeline for prediction @@ -32,11 +37,8 @@ class FeatureExtractionPipeline(Pipeline): no-head model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - first_sequence: The key to read the sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline @@ -46,19 +48,21 @@ class FeatureExtractionPipeline(Pipeline): """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - self.preprocessor = NLPPreprocessor( + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - padding=kwargs.pop('padding', False), - sequence_length=kwargs.pop('sequence_length', 128)) + padding=padding, + sequence_length=sequence_length, + **kwargs) self.model.eval() - self.config = Config.from_file( - os.path.join(self.model.model_dir, ModelFile.CONFIGURATION)) - self.tokenizer = self.preprocessor.tokenizer - def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index d7dc70f8..af731d00 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -23,7 +23,11 @@ class FillMaskPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence: str = 'sentence', + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + first_sequence='sentence', + sequence_length=128, **kwargs): """The inference pipeline for all the fill mask sub-tasks. @@ -31,11 +35,8 @@ class FillMaskPipeline(Pipeline): model (`str` or `Model` or module instance): A model instance or a model local dir or a model id in the model hub. preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. - first_sequence (`str`, `optional`): The key to read the sentence in. 
- sequence_length (`int`, `optional`): Max sequence length in the user's custom scenario, default 128. - - NOTE1: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example1: >>> from modelscope.pipelines import pipeline @@ -51,20 +52,25 @@ class FillMaskPipeline(Pipeline): NOTE2: Please pay attention to the model's special tokens. If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is '<mask>'. - To view other examples plese check the tests/pipelines/test_fill_mask.py. + To view other examples please check tests/pipelines/test_fill_mask.py. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 128)) - assert hasattr( - self.preprocessor, 'mask_id' - ), 'The input preprocessor should have the mask_id attribute.' - + sequence_length=sequence_length, + **kwargs) self.model.eval() + assert hasattr( + self.preprocessor, 'mask_id' + ), 'The input preprocessor should have the mask_id attribute.' def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index cf96fd36..0c726c9a 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -8,8 +8,7 @@ from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - RelationExtractionPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['InformationExtractionPipeline'] @@ -24,12 +23,33 @@ class InformationExtractionPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=512, **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = RelationExtractionPreprocessor( + """ + + Args: + model (str or Model): Supply either a local model dir which supported information extraction task, or a + model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor.
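# [Editor's illustrative note, not part of the patch] Several pipelines in this patch
# (fill-mask, information extraction, NER) now resolve their preprocessor generically via
# Preprocessor.from_pretrained instead of hard-coding a concrete class. A sketch with a
# hypothetical local directory; extra kwargs reach the concrete preprocessor's constructor.
from modelscope.preprocessors import Preprocessor

preprocessor = Preprocessor.from_pretrained(
    '/path/to/local/model_dir',  # hypothetical; inside a pipeline this is self.model.model_dir
    sequence_length=512)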
+ """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + + if self.preprocessor is None: + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) + sequence_length=sequence_length, + **kwargs) self.model.eval() def forward(self, inputs: Dict[str, Any], diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 74b380ec..9c5600fd 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -1,36 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Optional, Union - -import torch +from typing import Optional, Union from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.nlp import TokenClassificationPipeline -from modelscope.preprocessors import (NERPreprocessorThai, NERPreprocessorViet, - Preprocessor, - TokenClassificationPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) -__all__ = [ - 'NamedEntityRecognitionPipeline', 'NamedEntityRecognitionThaiPipeline', - 'NamedEntityRecognitionVietPipeline' -] +__all__ = ['NamedEntityRecognitionPipeline'] @PIPELINES.register_module( Tasks.named_entity_recognition, module_name=Pipelines.named_entity_recognition) +@PIPELINES.register_module( + Tasks.named_entity_recognition, + module_name=Pipelines.named_entity_recognition_thai) +@PIPELINES.register_module( + Tasks.named_entity_recognition, + module_name=Pipelines.named_entity_recognition_viet) class NamedEntityRecognitionPipeline(TokenClassificationPipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp NER pipeline for prediction @@ -39,8 +38,8 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline): model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline >>> pipeline_ins = pipeline(task='named-entity-recognition', @@ -50,44 +49,17 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline): To view other examples plese check the tests/pipelines/test_named_entity_recognition.py. 
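The three `register_module` decorators above replace the former Thai and Viet subclasses: one class is simply registered under several pipeline names, and the right preprocessor is resolved from each model's configuration. A stripped-down sketch of how stacked registrations behave (a toy registry, not the modelscope implementation):

registry = {}

def register(task, module_name):
    def decorator(cls):
        # Each stacked decorator adds one more (task, name) -> class entry.
        registry[(task, module_name)] = cls
        return cls
    return decorator

@register('named-entity-recognition', 'ner')
@register('named-entity-recognition', 'ner-thai')
@register('named-entity-recognition', 'ner-viet')
class NamedEntityRecognitionPipeline:
    pass

# All three module names resolve to the same class.
assert len({registry[key] for key in registry}) == 1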
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - self.preprocessor = TokenClassificationPreprocessor( + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) + sequence_length=sequence_length, + **kwargs) self.model.eval() - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - - -@PIPELINES.register_module( - Tasks.named_entity_recognition, - module_name=Pipelines.named_entity_recognition_thai) -class NamedEntityRecognitionThaiPipeline(NamedEntityRecognitionPipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = NERPreprocessorThai( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) - - -@PIPELINES.register_module( - Tasks.named_entity_recognition, - module_name=Pipelines.named_entity_recognition_viet) -class NamedEntityRecognitionVietPipeline(NamedEntityRecognitionPipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = NERPreprocessorViet( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) + assert hasattr(self.preprocessor, 'id2label') + self.id2label = self.preprocessor.id2label diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index adac7f1b..424a9abc 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -22,7 +22,10 @@ class SentenceEmbeddingPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence='first_sequence', + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp text dual encoder then generates the text representation. Args: @@ -30,15 +33,20 @@ class SentenceEmbeddingPipeline(Pipeline): or a model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( - self.model.model_dir - if isinstance(self.model, Model) else model, - first_sequence=first_sequence, - sequence_length=kwargs.pop('sequence_length', 128)) + self.model.model_dir, + sequence_length=sequence_length, + **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py index 6ea7cd5f..7c8355f9 100644 --- a/modelscope/pipelines/nlp/summarization_pipeline.py +++ b/modelscope/pipelines/nlp/summarization_pipeline.py @@ -1,12 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Optional, Union -from modelscope.metainfo import Pipelines -from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.metainfo import Pipelines, Preprocessors from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import OfaPreprocessor, Preprocessor -from modelscope.utils.constant import Tasks +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Fields, Tasks from modelscope.utils.logger import get_logger logger = get_logger() @@ -19,6 +18,9 @@ class SummarizationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """Use `model` and `preprocessor` to create a Summarization pipeline for prediction. @@ -26,11 +28,25 @@ class SummarizationPipeline(Pipeline): model (str or Model): Supply either a local model dir which supported the summarization task, or a model id from the model hub, or a model instance. preprocessor (Preprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) self.model.eval() - if preprocessor is None and isinstance(self.model, OfaForAllTasks): - self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) + if preprocessor is None: + if self.model.__class__.__name__ == 'OfaForAllTasks': + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, + type=Preprocessors.ofa_tasks_preprocessor, + field=Fields.multi_modal) + else: + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 36f4c08a..917a70d4 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -33,6 +33,9 @@ class TableQuestionAnsweringPipeline(Pipeline): model: Union[TableQuestionAnswering, str], preprocessor: TableQuestionAnsweringPreprocessor = None, db: Database = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """use `model` and `preprocessor` to create a table question answering prediction pipeline @@ -40,11 +43,19 @@ class TableQuestionAnsweringPipeline(Pipeline): model (TableQuestionAnswering): a model instance preprocessor (TableQuestionAnsweringPreprocessor): a preprocessor instance db (Database): a database to store tables in the database + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: self.preprocessor = TableQuestionAnsweringPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) # initilize tokenizer self.tokenizer = BertTokenizer( diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py deleted file mode 100644 index 9bf226b9..00000000 --- a/modelscope/pipelines/nlp/text2text_generation_pipeline.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, List, Optional, Union - -import torch -from numpy import isin - -from modelscope.metainfo import Pipelines -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline, Tensor -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import Text2TextGenerationPreprocessor -from modelscope.utils.config import use_task_specific_params -from modelscope.utils.constant import Tasks - -__all__ = ['Text2TextGenerationPipeline'] - -TRANSLATE_PIPELINES = [ - Pipelines.translation_en_to_de, - Pipelines.translation_en_to_ro, - Pipelines.translation_en_to_fr, -] - - -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.text2text_generation) -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.translation_en_to_de) -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.translation_en_to_ro) -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.translation_en_to_fr) -class Text2TextGenerationPipeline(Pipeline): - - def __init__( - self, - model: Union[Model, str], - preprocessor: Optional[Text2TextGenerationPreprocessor] = None, - first_sequence='sentence', - **kwargs): - """Use `model` and `preprocessor` to create a text to text generation pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the text generation task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='text2text-generation', - >>> model='damo/nlp_t5_text2text-generation_chinese-base') - >>> sentence1 = '中国的首都位于。' - >>> print(pipeline_ins(sentence1)) - >>> # Or use the dict input: - >>> print(pipeline_ins({'sentence': sentence1})) - >>> # 北京 - - To view other examples plese check the tests/pipelines/test_text_generation.py. 
- """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = Text2TextGenerationPreprocessor( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) - self.tokenizer = self.preprocessor.tokenizer - self.pipeline = self.model.pipeline.type - self.model.eval() - - def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: - """ Provide specific preprocess for text2text generation pipeline in order to handl multi tasks - """ - if not isinstance(inputs, str): - raise ValueError(f'Not supported input type: {type(inputs)}') - - if self.pipeline in TRANSLATE_PIPELINES: - use_task_specific_params(self.model, self.pipeline) - inputs = self.model.config.prefix + inputs - - return super().preprocess(inputs, **preprocess_params) - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - - forward_params['min_length'] = forward_params.get( - 'min_length', self.model.config.min_length) - forward_params['max_length'] = forward_params.get( - 'max_length', self.model.config.max_length) - - with torch.no_grad(): - output_ids = self.model.generate(**inputs, **forward_params) - return {'output_ids': output_ids} - - def postprocess(self, inputs: Dict[str, Tensor], - **postprocess_params) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - output = self.tokenizer.decode( - inputs['output_ids'][0], - skip_special_tokens=True, - ) - return {OutputKeys.TEXT: output} diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index fd223c76..24c07d69 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -5,11 +5,14 @@ import numpy as np from modelscope.metainfo import Pipelines, Preprocessors from modelscope.models.base import Model -from modelscope.outputs import OutputKeys +from modelscope.outputs import OutputKeys, TextClassificationModelOutput from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) @PIPELINES.register_module( @@ -31,6 +34,9 @@ class TextClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """The inference pipeline for all the text classification sub-tasks. @@ -38,10 +44,8 @@ class TextClassificationPipeline(Pipeline): model (`str` or `Model` or module instance): A model instance or a model local dir or a model id in the model hub. preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. - first_sequence (`str`, `optional`): The key of the first sentence. - second_sequence (`str`, `optional`): The key of the second sentence. - sequence_length (`int`, `optional`): The sequence length. - id2label (`dict`, `optional`): The id-label mapping. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
Example: >>> from modelscope.pipelines import pipeline @@ -49,31 +53,38 @@ class TextClassificationPipeline(Pipeline): model='damo/nlp_structbert_sentence-similarity_chinese-base') >>> input = ('这是个测试', '这也是个测试') >>> print(pipeline_ins(input)) - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' and 'second_sequence' - param will have no affection. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: if self.model.__class__.__name__ == 'OfaForAllTasks': self.preprocessor = Preprocessor.from_pretrained( model_name_or_path=self.model.model_dir, type=Preprocessors.ofa_tasks_preprocessor, - field=Fields.multi_modal) + field=Fields.multi_modal, + **kwargs) else: first_sequence = kwargs.pop('first_sequence', 'first_sequence') second_sequence = kwargs.pop('second_sequence', None) + sequence_length = kwargs.pop('sequence_length', 512) self.preprocessor = Preprocessor.from_pretrained( - self.model - if isinstance(self.model, str) else self.model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label + self.model.model_dir, **{ + 'first_sequence': first_sequence, + 'second_sequence': second_sequence, + 'sequence_length': sequence_length, + **kwargs + }) + assert hasattr(self.preprocessor, 'id2label') + self.id2label = self.preprocessor.id2label + if self.id2label is None: + logger.warn( + 'The id2label mapping is None, will return original ids.' + ) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: @@ -82,16 +93,17 @@ class TextClassificationPipeline(Pipeline): return self.model(**inputs, **forward_params) def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results + inputs: Union[Dict[str, Any], + TextClassificationModelOutput], + topk: int = None) -> Dict[str, Any]: + """Process the prediction results Args: inputs (`Dict[str, Any]` or `TextClassificationModelOutput`): The model output, please check the `TextClassificationModelOutput` class for details. topk (int): The topk probs to take Returns: - Dict[str, str]: the prediction results. + Dict[str, Any]: the prediction results. scores: The probabilities of each label. labels: The real labels. Label at index 0 is the smallest probability. @@ -99,8 +111,6 @@ class TextClassificationPipeline(Pipeline): if self.model.__class__.__name__ == 'OfaForAllTasks': return inputs else: - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' 
logits = inputs[OutputKeys.LOGITS].cpu().numpy() if logits.shape[0] == 1: logits = logits[0] @@ -111,20 +121,24 @@ class TextClassificationPipeline(Pipeline): probs = softmax(logits) num_classes = probs.shape[-1] - topk = min(topk, num_classes) + topk = min(topk, num_classes) if topk is not None else num_classes top_indices = np.argpartition(probs, -topk)[-topk:] probs = np.take_along_axis(probs, top_indices, axis=-1).tolist() def map_to_label(id): - if id in self.id2label: - return self.id2label[id] - elif str(id) in self.id2label: - return self.id2label[str(id)] + if self.id2label is not None: + if id in self.id2label: + return self.id2label[id] + elif str(id) in self.id2label: + return self.id2label[str(id)] + else: + raise Exception( + f'id {id} not found in id2label: {self.id2label}') else: - raise Exception('id not found in id2label') + return id v_func = np.vectorize(map_to_label) - return { - OutputKeys.SCORES: probs, - OutputKeys.LABELS: v_func(top_indices).tolist() - } + top_indices = v_func(top_indices).tolist() + probs = list(reversed(probs)) + top_indices = list(reversed(top_indices)) + return {OutputKeys.SCORES: probs, OutputKeys.LABELS: top_indices} diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index ee8cb711..1e6d525a 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -10,7 +10,7 @@ from modelscope.models.nlp import BartForTextErrorCorrection from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TextErrorCorrectionPreprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['TextErrorCorrectionPipeline'] @@ -20,17 +20,20 @@ __all__ = ['TextErrorCorrectionPipeline'] Tasks.text_error_correction, module_name=Pipelines.text_error_correction) class TextErrorCorrectionPipeline(Pipeline): - def __init__( - self, - model: Union[BartForTextErrorCorrection, str], - preprocessor: Optional[TextErrorCorrectionPreprocessor] = None, - **kwargs): + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): """use `model` and `preprocessor` to create a nlp text correction pipeline. Args: model (BartForTextErrorCorrection): A model instance, or a model local dir, or a model id in the model hub. preprocessor (TextErrorCorrectionPreprocessor): An optional preprocessor instance. - + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline >>> pipeline_ins = pipeline( @@ -38,13 +41,17 @@ class TextErrorCorrectionPipeline(Pipeline): >>> sentence1 = '随着中国经济突飞猛近,建造工业与日俱增' >>> print(pipeline_ins(sentence1)) - To view other examples plese check the tests/pipelines/test_text_error_correction.py. + To view other examples plese check tests/pipelines/test_text_error_correction.py. 
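Stepping back to the TextClassificationPipeline.postprocess change above: with `topk=None` every class is returned, ids are mapped through `id2label` when it exists, and the result lists are reversed so the highest score comes first. A self-contained sketch of that behaviour (simplified to a full sort rather than `argpartition`):

import numpy as np

def top_labels(logits, id2label=None, topk=None):
    # Softmax over the logits, then keep the top-k classes (all when topk is None).
    probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()
    topk = probs.shape[-1] if topk is None else min(topk, probs.shape[-1])
    order = np.argsort(probs)[::-1][:topk]  # best first
    labels = [id2label.get(int(i), int(i)) if id2label else int(i) for i in order]
    return {'scores': probs[order].tolist(), 'labels': labels}

print(top_labels(np.array([0.1, 2.0, -1.0]), {0: 'neg', 1: 'pos', 2: 'neu'}))
# labels come back best first: ['pos', 'neg', 'neu']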
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - self.preprocessor = TextErrorCorrectionPreprocessor( - self.model.model_dir) + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) self.vocab = self.preprocessor.vocab def forward(self, inputs: Dict[str, Any], diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index bf1162bf..566ca359 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -1,20 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os from typing import Any, Dict, Optional, Union import torch from modelscope.metainfo import Pipelines from modelscope.models.base import Model -from modelscope.outputs import OutputKeys +from modelscope.outputs import (ModelOutputBase, OutputKeys, + TokenGeneratorOutput) from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.chinese_utils import remove_space_between_chinese_chars -from modelscope.utils.constant import Fields, Tasks -from modelscope.utils.hub import read_config +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import Config, read_config -__all__ = ['TextGenerationPipeline'] +__all__ = ['TextGenerationPipeline', 'TextGenerationT5Pipeline'] @PIPELINES.register_module( @@ -24,7 +26,11 @@ class TextGenerationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, first_sequence='sentence', + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a generation pipeline for prediction. @@ -33,11 +39,8 @@ class TextGenerationPipeline(Pipeline): or a model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - first_sequence: The key to read the first sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline @@ -49,26 +52,29 @@ class TextGenerationPipeline(Pipeline): >>> # Or use the dict input: >>> print(pipeline_ins({'sentence': sentence1})) - To view other examples plese check the tests/pipelines/test_text_generation.py. + To view other examples plese check tests/pipelines/test_text_generation.py. 
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - cfg = read_config(self.model.model_dir) - self.postprocessor = cfg.pop('postprocessor', 'decode') + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: - preprocessor_cfg = cfg.preprocessor - preprocessor_cfg.update({ - 'model_dir': + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - 'first_sequence': - first_sequence, - 'second_sequence': - None, - 'sequence_length': - kwargs.pop('sequence_length', 128) - }) - self.preprocessor = build_preprocessor(preprocessor_cfg, - Fields.nlp) + first_sequence=first_sequence, + sequence_length=sequence_length, + **kwargs) self.model.eval() + self.postprocessor = kwargs.pop('postprocessor', None) + if self.postprocessor is None and hasattr(self.model, 'model_dir'): + # Compatible with old code + cfg = read_config(self.model.model_dir) + self.postprocessor = cfg.get('postprocessor') + if self.postprocessor is None: + self.postprocessor = 'decode' def _sanitize_parameters(self, **pipeline_parameters): return {}, pipeline_parameters, {} @@ -79,20 +85,19 @@ class TextGenerationPipeline(Pipeline): return self.model.generate(inputs, **forward_params) def decode(self, inputs) -> str: - tokenizer = self.preprocessor.tokenizer - return tokenizer.decode(inputs.tolist(), skip_special_tokens=True) + return self.preprocessor.decode( + inputs.tolist(), skip_special_tokens=True) def sentence_piece(self, inputs) -> str: - tokenizer = self.preprocessor.tokenizer - return tokenizer.decode(inputs.tolist()) + return self.preprocessor.decode(inputs.tolist()) def roberta(self, inputs) -> str: - tokenizer = self.preprocessor.tokenizer - decoded = tokenizer.decode(inputs.tolist()) + decoded = self.preprocessor.decode(inputs.tolist()) return decoded.replace('', '. ').replace('', '. 
').replace('', '') - def postprocess(self, inputs: Dict[str, Tensor], + def postprocess(self, inputs: Union[Dict[str, Tensor], + TokenGeneratorOutput], **postprocess_params) -> Dict[str, str]: """process the prediction results @@ -102,9 +107,72 @@ class TextGenerationPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - inputs = inputs['sequences'] + if isinstance(inputs, (dict, ModelOutputBase)): + inputs = inputs['sequences'] if isinstance(inputs, list) or len(inputs.shape) > 1: inputs = inputs[0] decoded = getattr(self, self.postprocessor)(inputs) text = remove_space_between_chinese_chars(decoded) return {OutputKeys.TEXT: text} + + +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_de) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_ro) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_fr) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.text2text_generation) +class TextGenerationT5Pipeline(TextGenerationPipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + sub_task=None, + **kwargs): + super().__init__(model, preprocessor, **kwargs) + self.sub_task = sub_task + self.task_specific_params = self._parse_specific_model_params( + getattr(self.model, 'model_dir', None), 'task_specific_params') + self.min_length = self._parse_specific_model_params( + getattr(self.model, 'model_dir', None), 'min_length') + self.max_length = self._parse_specific_model_params( + getattr(self.model, 'model_dir', None), 'max_length') + + def _parse_specific_model_params(self, model_dir, key): + if model_dir is None: + return + + cfg: Config = read_config(model_dir) + params = cfg.safe_get(f'model.{key}') + if params is None: + cfg: Config = read_config(os.path.join(model_dir, 'config.json')) + params = cfg.safe_get(key) + return params + + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: + if not isinstance(inputs, str): + raise ValueError(f'Not supported input type: {type(inputs)}') + + if self.task_specific_params is not None: + sub_task = self.sub_task or self.model.pipeline.type + if sub_task in self.task_specific_params: + self.model.config.update(self.task_specific_params[sub_task]) + if 'prefix' in self.task_specific_params[sub_task]: + inputs = self.task_specific_params[sub_task].prefix + inputs + + return super().preprocess(inputs, **preprocess_params) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + + min_length = forward_params.get('min_length', self.min_length) + max_length = forward_params.get('max_length', self.max_length) + if min_length is not None: + forward_params['min_length'] = min_length + if max_length is not None: + forward_params['max_length'] = max_length + + with torch.no_grad(): + return self.model.generate(**inputs, **forward_params) diff --git a/modelscope/pipelines/nlp/text_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py index fe627e5f..dfd0d433 100644 --- a/modelscope/pipelines/nlp/text_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import Preprocessor, TextRankingPreprocessor +from 
modelscope.preprocessors import (Preprocessor, + TextRankingTransformersPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['TextRankingPipeline'] @@ -22,6 +23,10 @@ class TextRankingPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp word segment pipeline for prediction. @@ -30,14 +35,21 @@ class TextRankingPipeline(Pipeline): or a model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) + sequence_length=sequence_length, + **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 86cc49b7..63f241a2 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -1,7 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union +import numpy as np import torch from modelscope.metainfo import Pipelines @@ -32,24 +33,35 @@ class TokenClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """use `model` and `preprocessor` to create a token classification pipeline for prediction Args: model (str or Model): A model instance or a model local dir or a model id in the model hub. preprocessor (Preprocessor): a preprocessor instance, must not be None. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
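Further down, `_chunk_process` now trims batched tensors itself: logits or predictions are reduced to a single sequence and, when a `label_mask` is present, both the predictions and the offset mapping are narrowed to the number of real (unpadded) tokens. Roughly, with made-up shapes:

import torch

predictions = torch.tensor([[1, 2, 2, 0, 0]])  # batch of 1, padded to length 5
offset_mapping = torch.tensor([[[0, 1], [1, 2], [2, 3], [0, 0], [0, 0]]])
label_mask = torch.tensor([[1, 1, 1, 0, 0]])

predictions = predictions[0]
offset_mapping = offset_mapping[0]
masked_length = int(label_mask.sum(-1).item())  # 3 real tokens
predictions = torch.narrow(predictions, 0, 0, masked_length)
offset_mapping = torch.narrow(offset_mapping, 0, 0, masked_length)
print(predictions.tolist(), offset_mapping.tolist())
# [1, 2, 2] [[0, 1], [1, 2], [2, 3]]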
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) + sequence_length=sequence_length, + **kwargs) self.model.eval() - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label + assert hasattr(self.preprocessor, 'id2label') + self.id2label = self.preprocessor.id2label def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: @@ -60,53 +72,59 @@ class TokenClassificationPipeline(Pipeline): } def postprocess(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: - """process the prediction results + **postprocess_params) -> Dict[str, Any]: + """Process the prediction results Args: inputs (Dict[str, Any]): should be tensors from model Returns: - Dict[str, str]: the prediction results + Dict[str, Any]: the prediction results """ chunks = self._chunk_process(inputs, **postprocess_params) - - # for cws outputs - if len(chunks) > 0 and chunks[0]['type'].lower() == 'cws': - spans = [ - chunk['span'] for chunk in chunks if chunk['span'].strip() - ] - seg_result = [span for span in spans] - outputs = {OutputKeys.OUTPUT: seg_result} - - # for ner outputs - else: - outputs = {OutputKeys.OUTPUT: chunks} - return outputs + return {OutputKeys.OUTPUT: chunks} def _chunk_process(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: + **postprocess_params) -> List: """process the prediction results and output as chunks Args: inputs (Dict[str, Any]): should be tensors from model Returns: - Dict[str, str]: the prediction results + List: The output chunks """ text = inputs['text'] + # TODO post_process does not support batch for now. 
if OutputKeys.PREDICTIONS not in inputs: logits = inputs[OutputKeys.LOGITS] - predictions = torch.argmax(logits[0], dim=-1) + if len(logits.shape) == 3: + logits = logits[0] + predictions = torch.argmax(logits, dim=-1) else: - predictions = inputs[OutputKeys.PREDICTIONS].squeeze( - 0).cpu().numpy() + predictions = inputs[OutputKeys.PREDICTIONS] + if len(predictions.shape) == 2: + predictions = predictions[0] + + offset_mapping = inputs['offset_mapping'] + if len(offset_mapping.shape) == 3: + offset_mapping = offset_mapping[0] + + label_mask = inputs.get('label_mask') + if label_mask is not None: + masked_lengths = label_mask.sum(-1).long().cpu().item() + offset_mapping = torch.narrow( + offset_mapping, 0, 0, + masked_lengths) # index_select only move loc, not resize + predictions = torch.narrow( + predictions, 0, 0, + masked_lengths) # index_select only move loc, not resize + + offset_mapping = torch_nested_numpify( + torch_nested_detach(offset_mapping)) predictions = torch_nested_numpify(torch_nested_detach(predictions)) - offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] - labels = [self.id2label[x] for x in predictions] - if len(labels) > len(offset_mapping): - labels = labels[1:-1] + chunks = [] chunk = {} for label, offsets in zip(labels, offset_mapping): diff --git a/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py index 57fc646a..41f833dc 100644 --- a/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py @@ -2,19 +2,15 @@ import io import os -from typing import Any, Dict, Union +from typing import Any, Dict -import numpy as np import torch from transformers import XLMRobertaTokenizer from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import BertForSequenceClassification from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import ModelFile, Tasks __all__ = ['TranslationQualityEstimationPipeline'] diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 9fe2ad93..ee49d9a5 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -10,9 +10,9 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.nlp import TokenClassificationPipeline -from modelscope.preprocessors import (Preprocessor, - TokenClassificationPreprocessor, - WordSegmentationPreprocessorThai) +from modelscope.preprocessors import ( + Preprocessor, TokenClassificationTransformersPreprocessor, + WordSegmentationPreprocessorThai) from modelscope.utils.constant import Tasks from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) @@ -23,42 +23,49 @@ __all__ = ['WordSegmentationPipeline', 'WordSegmentationThaiPipeline'] @PIPELINES.register_module( Tasks.word_segmentation, module_name=Pipelines.word_segmentation) class WordSegmentationPipeline(TokenClassificationPipeline): + """Use `model` and `preprocessor` to create a nlp word segment pipeline for 
prediction. - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - **kwargs): - """Use `model` and `preprocessor` to create a nlp word segment pipeline for prediction. + NOTE: The preprocessor will first split the sentence into single characters, + then feed them into the tokenizer with the parameter is_split_into_words=True. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='word-segmentation', + >>> model='damo/nlp_structbert_word-segmentation_chinese-base') + >>> sentence1 = '今天天气不错,适合出去游玩' + >>> print(pipeline_ins(sentence1)) + + To view other examples plese check tests/pipelines/test_word_segmentation.py. + """ + + def postprocess(self, + inputs: Dict[str, Any], + output_final_sentence=True, + **postprocess_params) -> Dict[str, Any]: + """Process the prediction results Args: - model (str or Model): Supply either a local model dir which supported the WS task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: The preprocessor will first split the sentence into single characters, - then feed them into the tokenizer with the parameter is_split_into_words=True. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='word-segmentation', - >>> model='damo/nlp_structbert_word-segmentation_chinese-base') - >>> sentence1 = '今天天气不错,适合出去游玩' - >>> print(pipeline_ins(sentence1)) - - To view other examples plese check the tests/pipelines/test_word_segmentation.py. + inputs (Dict[str, Any]): should be tensors from model + output_final_sentence (bool): Output the cut sentence splitted by blanks or not. + If False, the pipeline will output the original token-label information. + + Returns: + Dict[str, Any]: The prediction results. 
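In other words, the word-segmentation specific logic now lives entirely in `postprocess`: with `output_final_sentence=True` (the default) the chunk spans are flattened into a token list, otherwise the raw chunks are returned. A toy illustration of the two modes (chunk dicts simplified to just their spans):

chunks = [{'span': '今天'}, {'span': '天气'}, {'span': '不错'}, {'span': ' '}]

def ws_output(chunks, output_final_sentence=True):
    if output_final_sentence:
        # Keep only non-blank spans, i.e. the segmented tokens.
        return [c['span'] for c in chunks if c['span'].strip()]
    return chunks

print(ws_output(chunks))         # ['今天', '天气', '不错']
print(ws_output(chunks, False))  # the unmodified chunk dicts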
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = TokenClassificationPreprocessor( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) - self.model.eval() + chunks = self._chunk_process(inputs, **postprocess_params) + + # for cws outputs + if output_final_sentence: + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = [span for span in spans] + outputs = {OutputKeys.OUTPUT: seg_result} - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label + # for ner outputs + else: + outputs = {OutputKeys.OUTPUT: chunks} + return outputs @PIPELINES.register_module( @@ -66,8 +73,10 @@ class WordSegmentationPipeline(TokenClassificationPipeline): module_name=Pipelines.multilingual_word_segmentation) class MultilingualWordSegmentationPipeline(WordSegmentationPipeline): - def postprocess(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: + def postprocess(self, + inputs: Dict[str, Any], + output_final_sentence=True, + **postprocess_params) -> Dict[str, Any]: chunks = self._chunk_process(inputs, **postprocess_params) word_segments = [entity['span'] for entity in chunks] return {OutputKeys.OUTPUT: word_segments} @@ -80,14 +89,22 @@ class WordSegmentationThaiPipeline(MultilingualWordSegmentationPipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=512, **kwargs): - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - preprocessor = WordSegmentationPreprocessorThai( - model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.preprocessor = WordSegmentationPreprocessorThai( + self.model.model_dir, + sequence_length=sequence_length, + **kwargs) def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 31b556d7..3db73d8b 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -10,8 +10,7 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - ZeroShotClassificationPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['ZeroShotClassificationPipeline'] @@ -25,6 +24,10 @@ class ZeroShotClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=512, **kwargs): """Use `model` and `preprocessor` to create a nlp zero shot classifiction for prediction. @@ -44,7 +47,8 @@ class ZeroShotClassificationPipeline(Pipeline): or a model id from the model hub, or a torch model instance. 
preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline @@ -55,17 +59,22 @@ class ZeroShotClassificationPipeline(Pipeline): >>> template = '这篇文章的标题是{}' >>> print(pipeline_ins(sentence1, candidate_labels=labels, hypothesis_template=template)) - To view other examples plese check the tests/pipelines/test_zero_shot_classification.py. + To view other examples plese check tests/pipelines/test_zero_shot_classification.py. """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) self.entailment_id = 0 self.contradiction_id = 2 if preprocessor is None: - self.preprocessor = ZeroShotClassificationPreprocessor( + sequence_length = kwargs.pop('sequence_length', 512) + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) + sequence_length=sequence_length, + **kwargs) self.model.eval() def _sanitize_parameters(self, **kwargs): diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index ce053459..b4adf935 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,15 +16,19 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, NLPPreprocessor, - NLPTokenizerPreprocessorBase, PassageRankingPreprocessor, - TextRankingPreprocessor, RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor, - TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize, + DocumentSegmentationTransformersPreprocessor, + FaqQuestionAnsweringTransformersPreprocessor, + FillMaskPoNetPreprocessor, FillMaskTransformersPreprocessor, + TextRankingTransformersPreprocessor, + RelationExtractionTransformersPreprocessor, + SentenceEmbeddingTransformersPreprocessor, + TextClassificationTransformersPreprocessor, + TokenClassificationTransformersPreprocessor, + TextErrorCorrectionPreprocessor, TextGenerationT5Preprocessor, + TextGenerationTransformersPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, CodeGeeXPreprocessor, - MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor, + MGLMSummarizationPreprocessor, + ZeroShotClassificationTransformersPreprocessor, TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor, @@ -47,18 +51,21 @@ else: 'kws': ['WavToLists'], 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', - 
'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', - 'TokenClassificationPreprocessor', - 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', - 'Tokenize', 'Text2TextGenerationPreprocessor', + 'DocumentSegmentationTransformersPreprocessor', + 'FaqQuestionAnsweringTransformersPreprocessor', + 'FillMaskPoNetPreprocessor', 'FillMaskTransformersPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'TextRankingTransformersPreprocessor', + 'RelationExtractionTransformersPreprocessor', + 'SentenceEmbeddingTransformersPreprocessor', + 'TextClassificationTransformersPreprocessor', + 'TokenClassificationTransformersPreprocessor', + 'TextErrorCorrectionPreprocessor', + 'TextGenerationTransformersPreprocessor', 'Tokenize', + 'TextGenerationT5Preprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'MGLMSummarizationPreprocessor', 'CodeGeeXPreprocessor', - 'ZeroShotClassificationPreprocessor', + 'ZeroShotClassificationTransformersPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'NERPreprocessorViet', 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index e9b85424..277c26cc 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -2,9 +2,10 @@ import os from abc import ABC, abstractmethod from copy import deepcopy -from typing import Any, Dict, Optional, Sequence +from typing import Any, Callable, Dict, Optional, Sequence, Union from modelscope.metainfo import Models, Preprocessors +from modelscope.utils.checkpoint import save_configuration from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, ModeKeys, Tasks) @@ -98,6 +99,8 @@ PREPROCESSOR_MAP = { Preprocessors.sen_cls_tokenizer, (Models.structbert, Tasks.part_of_speech): Preprocessors.token_cls_tokenizer, + (Models.token_classification_for_ner, Tasks.named_entity_recognition): + Preprocessors.token_cls_tokenizer, (Models.structbert, Tasks.token_classification): Preprocessors.token_cls_tokenizer, (Models.structbert, Tasks.word_segmentation): @@ -117,7 +120,15 @@ PREPROCESSOR_MAP = { (Models.veco, Tasks.sentence_similarity): Preprocessors.sen_cls_tokenizer, - # space + # taskmodels + (Models.lcrf, Tasks.named_entity_recognition): + Preprocessors.sequence_labeling_tokenizer, + (Models.lcrf_wseg, Tasks.word_segmentation): + Preprocessors.sequence_labeling_tokenizer, + (Models.tcrf_wseg, Tasks.word_segmentation): + Preprocessors.sequence_labeling_tokenizer, + (Models.tcrf, Tasks.named_entity_recognition): + Preprocessors.sequence_labeling_tokenizer, } @@ -125,6 +136,8 @@ class Preprocessor(ABC): def __init__(self, mode=ModeKeys.INFERENCE, *args, **kwargs): self._mode = mode + assert self._mode in (ModeKeys.INFERENCE, ModeKeys.TRAIN, + ModeKeys.EVAL) self.device = int( os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @@ -264,4 +277,41 @@ class Preprocessor(ABC): }) preprocessor = build_preprocessor(sub_cfg, field_name) preprocessor.mode = preprocessor_mode + sub_cfg.pop('model_dir', None) + if not hasattr(preprocessor, 'cfg'): + preprocessor.cfg = cfg return preprocessor + + def save_pretrained(self, + target_folder: Union[str, os.PathLike], + config: Optional[dict] = None, + save_config_function: Callable = save_configuration): + """Save the preprocessor, its configuration and other related files to a directory, + so that it can be re-loaded + + By default, this method will save 
the preprocessor's config with mode `inference`. + + Args: + target_folder (Union[str, os.PathLike]): + Directory to which to save. Will be created if it doesn't exist. + + config (Optional[dict], optional): + The config for the configuration.json + + save_config_function (Callable): The function used to save the configuration, call this function + after the config is updated. + + """ + if config is None and hasattr(self, 'cfg'): + config = self.cfg + + if config is not None: + # Update the mode to `inference` in the preprocessor field. + if 'preprocessor' in config and config['preprocessor'] is not None: + if 'mode' in config['preprocessor']: + config['preprocessor']['mode'] = 'inference' + elif 'val' in config['preprocessor'] and 'mode' in config[ + 'preprocessor']['val']: + config['preprocessor']['val']['mode'] = 'inference' + + save_config_function(target_folder, config) diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 7c48fb3c..5f23fb27 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -5,24 +5,22 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor - from .nlp_base import (NLPTokenizerPreprocessorBase, NLPBasePreprocessor) - from .text_generation_jieba_preprocessor import TextGenerationJiebaPreprocessor + from .text_generation_preprocessor import TextGenerationJiebaPreprocessor from .sentence_piece_preprocessor import SentencePiecePreprocessor from .bert_seq_cls_tokenizer import Tokenize - from .document_segmentation_preprocessor import DocumentSegmentationPreprocessor - from .faq_question_answering_preprocessor import FaqQuestionAnsweringPreprocessor - from .fill_mask_preprocessor import FillMaskPoNetPreprocessor, NLPPreprocessor - from .text_ranking_preprocessor import TextRankingPreprocessor - from .relation_extraction_preprocessor import RelationExtractionPreprocessor - from .sentence_classification_preprocessor import SequenceClassificationPreprocessor - from .sentence_embedding_preprocessor import SentenceEmbeddingPreprocessor - from .text_generation_preprocessor import TextGenerationPreprocessor - from .text2text_generation_preprocessor import Text2TextGenerationPreprocessor - from .token_classification_preprocessor import TokenClassificationPreprocessor, \ + from .document_segmentation_preprocessor import DocumentSegmentationTransformersPreprocessor + from .faq_question_answering_preprocessor import FaqQuestionAnsweringTransformersPreprocessor + from .fill_mask_preprocessor import FillMaskPoNetPreprocessor, FillMaskTransformersPreprocessor + from .text_ranking_preprocessor import TextRankingTransformersPreprocessor + from .relation_extraction_preprocessor import RelationExtractionTransformersPreprocessor + from .text_classification_preprocessor import TextClassificationTransformersPreprocessor + from .sentence_embedding_preprocessor import SentenceEmbeddingTransformersPreprocessor + from .text_generation_preprocessor import TextGenerationTransformersPreprocessor, TextGenerationT5Preprocessor + from .token_classification_preprocessor import TokenClassificationTransformersPreprocessor, \ WordSegmentationBlankSetToLabelPreprocessor from .token_classification_thai_preprocessor import WordSegmentationPreprocessorThai, NERPreprocessorThai from .token_classification_viet_preprocessor import NERPreprocessorViet - from .zero_shot_classification_reprocessor import 
ZeroShotClassificationPreprocessor + from .zero_shot_classification_preprocessor import ZeroShotClassificationTransformersPreprocessor from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor, InputFeatures, @@ -36,30 +34,31 @@ else: 'NLPTokenizerPreprocessorBase', 'NLPBasePreprocessor', ], - 'text_generation_jieba_preprocessor': - ['TextGenerationJiebaPreprocessor'], 'sentence_piece_preprocessor': ['SentencePiecePreprocessor'], 'bert_seq_cls_tokenizer': ['Tokenize'], 'document_segmentation_preprocessor': - ['DocumentSegmentationPreprocessor'], + ['DocumentSegmentationTransformersPreprocessor'], 'faq_question_answering_preprocessor': - ['FaqQuestionAnsweringPreprocessor'], + ['FaqQuestionAnsweringTransformersPreprocessor'], 'fill_mask_preprocessor': - ['FillMaskPoNetPreprocessor', 'NLPPreprocessor'], - 'text_ranking_preprocessor': ['TextRankingPreprocessor'], - 'relation_extraction_preprocessor': ['RelationExtractionPreprocessor'], - 'sentence_classification_preprocessor': - ['SequenceClassificationPreprocessor'], - 'sentence_embedding_preprocessor': ['SentenceEmbeddingPreprocessor'], - 'text_generation_preprocessor': ['TextGenerationPreprocessor'], - 'text2text_generation_preprocessor': - ['Text2TextGenerationPreprocessor'], + ['FillMaskPoNetPreprocessor', 'FillMaskTransformersPreprocessor'], + 'text_ranking_preprocessor': ['TextRankingTransformersPreprocessor'], + 'relation_extraction_preprocessor': + ['RelationExtractionTransformersPreprocessor'], + 'text_classification_preprocessor': + ['TextClassificationTransformersPreprocessor'], + 'sentence_embedding_preprocessor': + ['SentenceEmbeddingTransformersPreprocessor'], + 'text_generation_preprocessor': [ + 'TextGenerationTransformersPreprocessor', + 'TextGenerationJiebaPreprocessor', 'TextGenerationT5Preprocessor' + ], 'token_classification_preprocessor': [ - 'TokenClassificationPreprocessor', + 'TokenClassificationTransformersPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor' ], - 'zero_shot_classification_reprocessor': - ['ZeroShotClassificationPreprocessor'], + 'zero_shot_classification_preprocessor': + ['ZeroShotClassificationTransformersPreprocessor'], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], diff --git a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py index 02249ea1..be922bf7 100644 --- a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py +++ b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py @@ -3,39 +3,52 @@ from typing import Any, Dict from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.logger import get_logger -from .nlp_base import NLPBasePreprocessor logger = get_logger() @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.document_segmentation) -class DocumentSegmentationPreprocessor(NLPBasePreprocessor): - - def __init__(self, model_dir: str, config, *args, **kwargs): - """preprocess the data +class DocumentSegmentationTransformersPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + model_max_length: int, + mode: str = ModeKeys.INFERENCE, + question_column_name='labels', + context_column_name='sentences', + example_id_column_name='example_id', + 
label_list=['B-EOP', 'O']): + """The preprocessor for document segmentation task, based on transformers' tokenizer. Args: - model_dir (str): model path + model_dir: The model dir containing the essential files to build the tokenizer. + model_max_length: The max length the model supported. + mode: The mode for this preprocessor. + question_column_name: The key for the question column, default `labels`. + context_column_name: The key for the context column, default `sentences`. + example_id_column_name: The key for the example id column, default `example_id`. + label_list: The label list, default `['B-EOP', 'O']` """ - super().__init__(model_dir, *args, **kwargs) + super().__init__(mode) from transformers import BertTokenizerFast - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, - use_fast=True, - ) - self.question_column_name = 'labels' - self.context_column_name = 'sentences' - self.example_id_column_name = 'example_id' - self.label_to_id = {'B-EOP': 0, 'O': 1} + self.tokenizer = BertTokenizerFast.from_pretrained(model_dir, ) + self.question_column_name = question_column_name + self.context_column_name = context_column_name + self.example_id_column_name = example_id_column_name + self.label_list = label_list + self.label_to_id = { + label: id + for id, label in enumerate(self.label_list) + } self.target_specical_ids = set() self.target_specical_ids.add(self.tokenizer.eos_token_id) - self.max_seq_length = config.max_position_embeddings - self.label_list = ['B-EOP', 'O'] + self.max_seq_length = model_max_length def __call__(self, examples, model_cfg=None) -> Dict[str, Any]: questions = examples[self.question_column_name] diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py index 873a8448..bfff3885 100644 --- a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py +++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py @@ -1,38 +1,58 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os from typing import Any, Dict from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields -from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) -class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, mode=ModeKeys.INFERENCE, **kwargs) - - from transformers import BertTokenizer - - preprocessor_config = Config.from_file( - os.path.join(model_dir, ModelFile.CONFIGURATION)).get( - ConfigFields.preprocessor, {}) - if preprocessor_config.get('tokenizer', - 'BertTokenizer') == 'XLMRoberta': +class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + tokenizer='BertTokenizer', + query_set='query_set', + support_set='support_set', + label_in_support_set='label', + text_in_support_set='text', + sequence_length=None, + **kwargs): + """The preprocessor for Faq QA task, based on transformers' tokenizer. 
+ + Args: + model_dir: The model dir containing the essential files to build the tokenizer. + mode: The mode for this preprocessor. + tokenizer: The tokenizer type used, supported types are `BertTokenizer` + and `XLMRobertaTokenizer`, default `BertTokenizer`. + query_set: The key for the query_set. + support_set: The key for the support_set. + label_in_support_set: The key for the label_in_support_set. + text_in_support_set: The key for the text_in_support_set. + sequence_length: The sequence length for the preprocessor. + """ + super().__init__(mode) + if tokenizer == 'XLMRoberta': from transformers import XLMRobertaTokenizer self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir) else: + from transformers import BertTokenizer self.tokenizer = BertTokenizer.from_pretrained(model_dir) - self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) + if sequence_length is not None: + self.max_len = sequence_length + else: + self.max_len = kwargs.get('max_seq_length', 50) self.label_dict = None + self.query_set = query_set + self.support_set = support_set + self.label_in_support_set = label_in_support_set + self.text_in_support_set = text_in_support_set def pad(self, samples, max_len): result = [] @@ -58,25 +78,31 @@ class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): @type_assert(object, Dict) def __call__(self, data: Dict[str, Any], **preprocessor_param) -> Dict[str, Any]: - TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) - queryset = data['query_set'] + tmp_max_len = preprocessor_param.get( + 'sequence_length', + preprocessor_param.get('max_seq_length', self.max_len)) + queryset = data[self.query_set] if not isinstance(queryset, list): queryset = [queryset] - supportset = data['support_set'] - supportset = sorted(supportset, key=lambda d: d['label']) + supportset = data[self.support_set] + supportset = sorted( + supportset, key=lambda d: d[self.label_in_support_set]) queryset_tokenized = [self.encode_plus(text) for text in queryset] supportset_tokenized = [ - self.encode_plus(item['text']) for item in supportset + self.encode_plus(item[self.text_in_support_set]) + for item in supportset ] max_len = max( [len(seq) for seq in queryset_tokenized + supportset_tokenized]) - max_len = min(TMP_MAX_LEN, max_len) + max_len = min(tmp_max_len, max_len) queryset_padded = self.pad(queryset_tokenized, max_len) supportset_padded = self.pad(supportset_tokenized, max_len) - supportset_labels_ori = [item['label'] for item in supportset] + supportset_labels_ori = [ + item[self.label_in_support_set] for item in supportset + ] label_dict = [] for label in supportset_labels_ori: if label not in label_dict: diff --git a/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py new file mode 100644 index 00000000..249aa24c --- /dev/null +++ b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py @@ -0,0 +1,78 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
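For orientation, here is a minimal usage sketch of the FaqQuestionAnsweringTransformersPreprocessor refactor above. It is a sketch only: 'path/to/model' is a placeholder for a real model directory, and the input layout simply mirrors the default query_set/support_set/text/label keys introduced by this change.

```python
from modelscope.preprocessors.nlp import \
    FaqQuestionAnsweringTransformersPreprocessor

# 'path/to/model' is a placeholder for a directory containing tokenizer files.
preprocessor = FaqQuestionAnsweringTransformersPreprocessor(
    model_dir='path/to/model',
    tokenizer='BertTokenizer',   # or 'XLMRoberta'
    sequence_length=50)          # replaces the old max_seq_length config key

data = {
    'query_set': ['how can I reset my password'],
    'support_set': [
        {'text': 'reset the account password', 'label': 'account'},
        {'text': 'ask for a refund', 'label': 'billing'},
    ],
}
# Returns padded query/support token ids plus the label mapping built from
# the support set; sequence_length can also be overridden per call.
features = preprocessor(data, sequence_length=32)
```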
+ +from typing import Any, Dict, Tuple, Union + +import numpy as np + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class FeatureExtractionTransformersPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str = None, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 128, + use_fast: bool = None, + **kwargs): + """The preprocessor for feature extraction task, based on transformers' tokenizer. + + Args: + model_dir: The model dir used to initialize the tokenizer. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ + self.first_sequence = first_sequence + self.second_sequence = second_sequence + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = sequence_length + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(mode) + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, _ = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence) + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + return output + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + return self.nlp_tokenizer(sequence1, sequence2, **kwargs) diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py index b0638dbc..80ac441f 100644 --- a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py +++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py @@ -2,60 +2,207 @@ import os.path as osp import re +from abc import abstractmethod from typing import Any, Dict, Tuple, Union import numpy as np import torch from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.hub import get_model_type from modelscope.utils.nlp import import_external_nltk_data -from .nlp_base import NLPTokenizerPreprocessorBase +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + + +class FillMaskPreprocessorBase(Preprocessor): + + def __init__(self, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE): + """The base constructor for all the fill-mask preprocessors. + + Args: + first_sequence: The key of the first sequence. + second_sequence: The key of the second sequence. + mode: The mode for the preprocessor. + """ + super().__init__(mode) + self.first_sequence = first_sequence + self.second_sequence = second_sequence + + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, _ = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence) + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + return output + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + @property + def mask_id(self): + """Return the id of the mask token. + + Returns: + The id of mask token. + """ + return None + + @abstractmethod + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. 
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. + """ + pass @PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.feature_extraction) -class NLPPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ +class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase): + + def __init__(self, + model_dir: str = None, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 128, + use_fast: bool = None, + **kwargs): + """The preprocessor for fill mask task, based on transformers' tokenizer. - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + Args: + model_dir: The model dir used to initialize the tokenizer. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['max_length'] = sequence_length kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) - super().__init__(model_dir, mode=mode, **kwargs) + super().__init__(first_sequence, second_sequence, mode) + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + return self.nlp_tokenizer(sequence1, sequence2, **kwargs) @property def mask_id(self): - return self.tokenizer.mask_token_id + """Return the id of the mask token. + + Returns: + The id of mask token. + """ + return self.nlp_tokenizer.tokenizer.mask_token_id def decode(self, token_ids, skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs): - return self.tokenizer.decode(token_ids, skip_special_tokens, - clean_up_tokenization_spaces, **kwargs) + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. 
+ """ + return self.nlp_tokenizer.tokenizer.decode( + token_ids, skip_special_tokens, clean_up_tokenization_spaces, + **kwargs) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.fill_mask_ponet) -class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in PoNet model's MLM task. - """ +class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): + + def __init__(self, + model_dir, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 512, + use_fast: bool = None, + **kwargs): + """The tokenizer preprocessor used in PoNet model's MLM task. - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + Args: + model_dir: The model dir used to initialize the tokenizer. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 512) + kwargs['max_length'] = sequence_length kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) - super().__init__(model_dir, mode=mode, **kwargs) + super().__init__(first_sequence, second_sequence, mode) + self.nlp_tokenizer = NLPTokenizer( + model_dir, use_fast=use_fast, tokenize_kwargs=kwargs) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) @@ -80,27 +227,15 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): self.sent_tokenize = sent_tokenize self.max_length = kwargs['max_length'] - def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b, labels = self.parse_text_and_label(data) - output = self.tokenizer( - text_a, - text_b, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + text_a, text_b, _ = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence) + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + output = self.nlp_tokenizer(text_a, text_b, **kwargs) max_seq_length = self.max_length if text_b is None: @@ -108,7 +243,7 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): seg_lens = list( map( len, - self.tokenizer( + self.nlp_tokenizer.tokenizer( self.sent_tokenize(text_a), add_special_tokens=False, truncation=True)['input_ids'])) @@ -125,18 +260,36 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): k: np.array(v) if isinstance(v, list) else v for k, v in output.items() } - - self.labels_to_id(labels, output) return output @property def mask_id(self): - return self.tokenizer.mask_token_id + """Return the id of the mask token. + + Returns: + The id of mask token. 
+ """ + return self.nlp_tokenizer.tokenizer.mask_token_id def decode(self, token_ids, skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs): - return self.tokenizer.decode(token_ids, skip_special_tokens, - clean_up_tokenization_spaces, **kwargs) + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. + """ + return self.nlp_tokenizer.tokenizer.decode( + token_ids, skip_special_tokens, clean_up_tokenization_spaces, + **kwargs) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py deleted file mode 100644 index 7fe28eb5..00000000 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import os -from abc import ABC -from collections.abc import Mapping -from typing import Any, Dict, List, Tuple, Union - -import json -import numpy as np -import torch -from transformers import AutoTokenizer - -from modelscope.metainfo import Models -from modelscope.outputs import OutputKeys -from modelscope.preprocessors.base import Preprocessor -from modelscope.utils.constant import ModeKeys -from modelscope.utils.hub import get_model_type, parse_label_mapping -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = [ - 'NLPBasePreprocessor', - 'NLPTokenizerPreprocessorBase', -] - - -class NLPBasePreprocessor(Preprocessor, ABC): - - def __init__(self, - model_dir: str, - first_sequence=None, - second_sequence=None, - label=None, - label2id=None, - mode=ModeKeys.INFERENCE, - use_fast=None, - **kwargs): - """The NLP preprocessor base class. - - Args: - model_dir (str): The local model path - first_sequence: The key for the first sequence - second_sequence: The key for the second sequence - label: The label key - label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping - if this mapping is not supplied. - mode: Run this preprocessor in either 'train'/'eval'/'inference' mode - use_fast: use the fast version of tokenizer - - """ - self.model_dir = model_dir - self.first_sequence = first_sequence - self.second_sequence = second_sequence - self.label = label - - self.use_fast = use_fast - if self.use_fast is None and model_dir is None: - self.use_fast = False - elif self.use_fast is None and os.path.isfile( - os.path.join(model_dir, 'tokenizer_config.json')): - with open( - os.path.join(model_dir, 'tokenizer_config.json'), - 'r', - encoding='utf-8') as f: - json_config = json.load(f) - self.use_fast = json_config.get('use_fast') - self.use_fast = False if self.use_fast is None else self.use_fast - - self.label2id = label2id - if self.label2id is None and model_dir is not None: - self.label2id = parse_label_mapping(model_dir) - super().__init__(mode, **kwargs) - - @property - def mask_id(self): - """Child preprocessor can override this property to return the id of mask token. - - Returns: - The id of mask token, default None. 
- """ - return None - - def decode(self, - token_ids: Union[int, List[int], 'np.ndarray', 'torch.Tensor', - 'tf.Tensor'], - skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, - **kwargs): - """Turn the token_ids to real sentence. - - Args: - token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): - List of tokenized input ids. Can be obtained using the `__call__` method. - skip_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. - kwargs (additional keyword arguments, *optional*): - Will be passed to the underlying model specific decode method. - Returns: - The real sentence decoded by the preprocessor. - """ - raise NotImplementedError() - - -class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): - - def __init__(self, - model_dir: str, - first_sequence: str = None, - second_sequence: str = None, - label: str = 'label', - label2id: dict = None, - mode: str = ModeKeys.INFERENCE, - use_fast: bool = None, - **kwargs): - """The NLP tokenizer preprocessor base class. - - Any nlp preprocessor which uses the hf tokenizer can inherit from this class. - - Args: - model_dir (str): The local model path - first_sequence: The key for the first sequence - second_sequence: The key for the second sequence - label: The key for the label - label2id: An optional label2id dict. - If label2id is None, the preprocessor will try to parse label-id mapping from: - - configuration.json model.label2id/model.id2label - - config.json label2id/id2label - - label_mapping.json - mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. - use_fast: use the fast version of tokenizer - kwargs: These kwargs will be directly fed into the tokenizer. - """ - - super().__init__(model_dir, first_sequence, second_sequence, label, - label2id, mode, use_fast, **kwargs) - self.model_dir = model_dir - self.tokenize_kwargs = kwargs - self.tokenizer = self.build_tokenizer(model_dir) - logger.info(f'The key of sentence1: {self.first_sequence}, ' - f'The key of sentence2: {self.second_sequence}, ' - f'The key of label: {self.label}') - if self.first_sequence is None: - logger.warning('[Important] first_sequence attribute is not set, ' - 'this will cause an error if your input is a dict.') - - @property - def id2label(self): - """Return the id2label mapping according to the label2id mapping. - - @return: The id2label mapping if exists. - """ - if self.label2id is not None: - return {id: label for label, id in self.label2id.items()} - return None - - def build_tokenizer(self, model_dir): - """Build a tokenizer by the model type. - - NOTE: This default implementation only returns slow tokenizer, because the fast tokenizers have a - multi-thread problem. - - Args: - model_dir: The local model dir. - - Returns: - The initialized tokenizer. 
- """ - self.is_transformer_based_model = 'lstm' not in model_dir - # fast version lead to parallel inference failed - model_type = get_model_type(model_dir) - if model_type in (Models.structbert, Models.gpt3, Models.palm, - Models.plug): - from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast - tokenizer = SbertTokenizerFast if self.use_fast else SbertTokenizer - return tokenizer.from_pretrained(model_dir) - elif model_type == Models.veco: - from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast - tokenizer = VecoTokenizerFast if self.use_fast else VecoTokenizer - return tokenizer.from_pretrained(model_dir) - elif model_type == Models.deberta_v2: - from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast - tokenizer = DebertaV2TokenizerFast if self.use_fast else DebertaV2Tokenizer - return tokenizer.from_pretrained(model_dir) - elif not self.is_transformer_based_model: - from transformers import BertTokenizer, BertTokenizerFast - tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer - return tokenizer.from_pretrained(model_dir) - else: - return AutoTokenizer.from_pretrained( - model_dir, use_fast=self.use_fast) - - def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b, labels = self.parse_text_and_label(data) - output = self.tokenizer( - text_a, - text_b, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - output = { - k: np.array(v) if isinstance(v, list) else v - for k, v in output.items() - } - self.labels_to_id(labels, output) - return output - - def parse_text_and_label(self, data): - """Parse the input and return the sentences and labels. - - When input type is tuple or list and its size is 2: - If the pair param is False, data will be parsed as the first_sentence and the label, - else it will be parsed as the first_sentence and the second_sentence. - - Args: - data: The input data. - - Returns: - The sentences and labels tuple. - """ - text_a, text_b, labels = None, None, None - if isinstance(data, str): - text_a = data - elif isinstance(data, tuple) or isinstance(data, list): - if len(data) == 3: - text_a, text_b, labels = data - elif len(data) == 2: - if self._mode == ModeKeys.INFERENCE: - text_a, text_b = data - else: - text_a, labels = data - elif isinstance(data, Mapping): - text_a = data.get(self.first_sequence) - text_b = data.get(self.second_sequence) - labels = data.get(self.label) - - return text_a, text_b, labels - - def labels_to_id(self, labels, output): - """Turn the labels to id with the type int or float. - - If the original label's type is str or int, the label2id mapping will try to convert it to the final label. - If the original label's type is float, or the label2id mapping does not exist, - the original label will be returned. - - Args: - labels: The input labels. - output: The label id. - - Returns: - The final labels. 
- """ - - def label_can_be_mapped(label): - return isinstance(label, str) or isinstance(label, int) - - try: - if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ - and self.label2id is not None: - output[OutputKeys.LABELS] = [ - self.label2id[label] - if label in self.label2id else self.label2id[str(label)] - for label in labels - ] - elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABELS] = self.label2id[ - labels] if labels in self.label2id else self.label2id[str( - labels)] - elif labels is not None: - output[OutputKeys.LABELS] = labels - except KeyError as e: - logger.error( - f'Label {labels} cannot be found in the label mapping {self.label2id},' - f'which comes from the user input or the configuration files. ' - f'Please consider matching your labels with this mapping.') - raise e diff --git a/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py index 9a426ab7..58aa000d 100644 --- a/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py +++ b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py @@ -5,34 +5,36 @@ from typing import Any, Dict from transformers import AutoTokenizer from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.re_tokenizer) -class RelationExtractionPreprocessor(NLPBasePreprocessor): - """The relation extraction preprocessor used in normal RE task. - """ +class RelationExtractionTransformersPreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data + def __init__( + self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + **kwargs, + ): + """The preprocessor for relation Extraction task, based on transformers' tokenizer. Args: - model_dir (str): model path + model_dir: The model dir used to initialize the tokenizer. + mode: The mode for the preprocessor. """ - super().__init__(model_dir, *args, **kwargs) - + super().__init__(mode) self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) self.tokenizer = AutoTokenizer.from_pretrained( model_dir, use_fast=True) @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + def __call__(self, data: str, **kwargs) -> Dict[str, Any]: """process the raw input data Args: @@ -46,7 +48,9 @@ class RelationExtractionPreprocessor(NLPBasePreprocessor): # preprocess the data for the model input text = data - output = self.tokenizer([text], return_tensors='pt') + if 'return_tensors' not in kwargs: + kwargs['return_tensors'] = 'pt' + output = self.tokenizer([text], **kwargs) return { 'text': text, 'input_ids': output['input_ids'], diff --git a/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py deleted file mode 100644 index f1295c50..00000000 --- a/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
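As a quick illustration of the kwargs pass-through added to RelationExtractionTransformersPreprocessor above, a hedged sketch; the model path and sentences are placeholders, and the per-call options shown are ordinary tokenizer kwargs.

```python
from modelscope.preprocessors.nlp import \
    RelationExtractionTransformersPreprocessor

# 'path/to/model' is a placeholder; the tokenizer is built via AutoTokenizer.
preprocessor = RelationExtractionTransformersPreprocessor(
    model_dir='path/to/model')

# return_tensors now defaults to 'pt' but can be overridden per call,
# along with any other tokenizer kwargs such as truncation.
features = preprocessor('John Smith was born in London.')
plain_lists = preprocessor('John Smith was born in London.',
                           return_tensors=None, truncation=True)
```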
- -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sequence classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py index 519de60c..ccbf3ef2 100644 --- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -1,31 +1,61 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union +from typing import Any, Dict from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase +from modelscope.utils.hub import get_model_type +from .transformers_tokenizer import NLPTokenizer @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sentence_embedding) -class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): +class SentenceEmbeddingTransformersPreprocessor(Preprocessor): """The tokenizer preprocessor used in sentence embedding. """ - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) + def __init__(self, + model_dir: str, + first_sequence='source_sentence', + second_sequence='sentences_to_compare', + mode=ModeKeys.INFERENCE, + use_fast: bool = None, + sequence_length: int = 128, + **kwargs): + """The preprocessor for sentence embedding task, based on transformers' tokenizer. - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + Args: + model_dir: The model dir used to initialize the tokenizer. + first_sequence: The key of the first sequence. + second_sequence: The key of the second sequence. + mode: The mode for the preprocessor. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. 
+ """ + self.first_sequence = first_sequence + self.second_sequence = second_sequence + kwargs['max_length'] = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + super().__init__(mode=mode) + + def __call__(self, + data: Dict, + padding=True, + truncation=True, + **kwargs) -> Dict[str, Any]: """process the raw input data Args: data Dict: - keys: "source_sentence" && "sentences_to_compare" + keys: the source sentence and the sentences to compare values: list of sentences Example: {"source_sentence": ["how long it take to get a master's degree"], @@ -37,16 +67,16 @@ class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): Returns: Dict[str, Any]: the preprocessed data """ - source_sentence = data['source_sentence'] - compare_sentences = data['sentences_to_compare'] - sentences = [] - sentences.append(source_sentence[0]) + source_sentence = data[self.first_sequence] + compare_sentences = data[self.second_sequence] + sentences = [source_sentence[0]] for sent in compare_sentences: sentences.append(sent) - tokenized_inputs = self.tokenizer( - sentences, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - padding=True, - truncation=True) + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + + tokenized_inputs = self.nlp_tokenizer( + sentences, padding=padding, truncation=truncation, **kwargs) return tokenized_inputs diff --git a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py index 1d1ef19d..6b0b76e1 100644 --- a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py @@ -1,7 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - +import os import os.path as osp -from typing import Any, Dict import sentencepiece as spm import torch @@ -9,17 +8,26 @@ import torch from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Fields, ModeKeys @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sentence_piece) class SentencePiecePreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): - import os + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """The preprocessor for the sentence piece tokenizer. + + Args: + model_dir: The model dir contains the essential files used by the `SentencePieceProcessor`. + mode: The mode for the preprocessor. 
+ """ - super().__init__(*args, **kwargs) + super().__init__(mode) self.tokenizer = None for file_name in os.listdir(model_dir): if file_name.endswith('.model'): @@ -28,5 +36,5 @@ class SentencePiecePreprocessor(Preprocessor): break assert self.tokenizer is not None, 'Can not find .model file' - def __call__(self, data: str) -> Dict[str, Any]: + def __call__(self, data: str) -> torch.Tensor: return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py deleted file mode 100644 index 5693d36e..00000000 --- a/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict, Union - -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) -class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. - """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') - kwargs['padding'] = kwargs.get('padding', False) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - text_a, _, _ = self.parse_text_and_label(data) - - inputs = self.tokenizer( - text_a, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - - # This is produced by tokenizers but is an invalid generate kwargs - if 'token_type_ids' in inputs: - del inputs['token_type_ids'] - return inputs diff --git a/modelscope/preprocessors/nlp/text_classification_preprocessor.py b/modelscope/preprocessors/nlp/text_classification_preprocessor.py new file mode 100644 index 00000000..06820e6c --- /dev/null +++ b/modelscope/preprocessors/nlp/text_classification_preprocessor.py @@ -0,0 +1,152 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from abc import abstractmethod +from typing import Any, Dict, List, Tuple, Union + +import numpy as np + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger +from .transformers_tokenizer import NLPTokenizer +from .utils import labels_to_id, parse_text_and_label + +logger = get_logger(__name__) + + +class TextClassificationPreprocessorBase(Preprocessor): + + def __init__( + self, + model_dir=None, + first_sequence: str = None, + second_sequence: str = None, + label: str = 'label', + label2id: Dict = None, + mode: str = ModeKeys.INFERENCE, + ): + """The base class for the text classification preprocessor. + + Args: + model_dir(str, `optional`): The model dir used to parse the label mapping, can be None. + first_sequence(str, `optional`): The key of the first sequence. 
+ second_sequence(str, `optional`): The key of the second sequence. + label(str, `optional`): The keys of the label columns, default is `label` + label2id: (dict, `optional`): The optional label2id mapping + mode: The mode for the preprocessor + """ + super().__init__(mode) + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.label = label + self.label2id = label2id + if self.label2id is None and self.model_dir is not None: + self.label2id = parse_label_mapping(self.model_dir) + + logger.info(f'The key of sentence1: {self.first_sequence}, ' + f'The key of sentence2: {self.second_sequence}, ' + f'The key of label: {self.label}') + if self.first_sequence is None: + logger.warning('[Important] first_sequence attribute is not set, ' + 'this will cause an error if your input is a dict.') + + @property + def id2label(self): + """Return the id2label mapping according to the label2id mapping. + + @return: The id2label mapping if exists. + """ + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence, + self.label) + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + labels_to_id(labels, output, self.label2id) + return output + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class TextClassificationTransformersPreprocessor( + TextClassificationPreprocessorBase): + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + return self.nlp_tokenizer(sequence1, sequence2, **kwargs) + + def __init__(self, + model_dir=None, + first_sequence: str = None, + second_sequence: str = None, + label: Union[str, List] = 'label', + label2id: Dict = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 128, + use_fast: bool = None, + **kwargs): + """The tokenizer preprocessor used in sequence classification. + + Args: + use_fast: Whether to use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. 
+ """ + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + super().__init__(model_dir, first_sequence, second_sequence, label, + label2id, mode) diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py index 4e5ba3bd..357a946f 100644 --- a/modelscope/preprocessors/nlp/text_error_correction.py +++ b/modelscope/preprocessors/nlp/text_error_correction.py @@ -7,12 +7,11 @@ from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields -from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_error_correction) -class TextErrorCorrectionPreprocessor(NLPBasePreprocessor): +class TextErrorCorrectionPreprocessor(Preprocessor): """The preprocessor used in text correction task. """ @@ -23,7 +22,7 @@ class TextErrorCorrectionPreprocessor(NLPBasePreprocessor): Args: model_dir (str): model path """ - super().__init__(model_dir, *args, **kwargs) + super().__init__(*args, **kwargs) self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) def __call__(self, data: str) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py deleted file mode 100644 index 1e972d64..00000000 --- a/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import os.path as osp -from typing import Any, Dict - -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) -class TextGenerationJiebaPreprocessor(Preprocessor): - """The jieba tokenizer preprocessor used in text generation. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - from modelscope.models.nlp.gpt3 import JiebaBPETokenizer - super().__init__(*args, **kwargs) - self.tokenizer = JiebaBPETokenizer( - osp.join(model_dir, 'tokenizer.json')) - - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' - Returns: - Dict[str, Any]: the preprocessed data - Example: - {'net_input': - {'src_tokens':tensor([1,2,3,4]), - 'src_lengths': tensor([4])} - } - """ - import torch - - return { - 'input_ids': - torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) - } diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py index 238e2972..7ce04a38 100644 --- a/modelscope/preprocessors/nlp/text_generation_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -1,62 +1,257 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
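To make the behaviour of the new TextClassificationTransformersPreprocessor above concrete, a minimal sketch under assumed inputs; the model path, column names and label2id mapping are placeholders, not part of this change.

```python
from modelscope.preprocessors.nlp import \
    TextClassificationTransformersPreprocessor
from modelscope.utils.constant import ModeKeys

# Inference: a plain string or a (sentence1, sentence2) tuple is accepted.
preprocessor = TextClassificationTransformersPreprocessor(
    model_dir='path/to/model', sequence_length=128)
features = preprocessor(('how long does shipping take',
                         'when will my order arrive'))

# Training/evaluation: dict input with explicit column keys; string or int
# labels are mapped through label2id (or the mapping parsed from model_dir).
train_preprocessor = TextClassificationTransformersPreprocessor(
    model_dir='path/to/model',
    first_sequence='sentence1',
    second_sequence='sentence2',
    label='label',
    label2id={'not_similar': 0, 'similar': 1},
    mode=ModeKeys.TRAIN)
example = train_preprocessor({
    'sentence1': 'how long does shipping take',
    'sentence2': 'when will my order arrive',
    'label': 'similar',
})
```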
+import os.path as osp from typing import Any, Dict, Optional, Union +import numpy as np +import torch + from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase +from modelscope.utils.hub import get_model_type +from modelscope.utils.logger import get_logger +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + +logger = get_logger(__name__) + + +class TextGenerationPreprocessorBase(Preprocessor): + + def __init__(self, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt='tgt_txt'): + """The base class for all the text generation task's preprocessors. + + Args: + mode: The preprocessor mode. + src_txt: The key for the src text. + tgt_txt: The key for the tgt text. + """ + super().__init__(mode) + self.src_txt = src_txt + self.tgt_txt = tgt_txt + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + def __call__(self, data: Union[Dict, str], **kwargs) -> Dict[str, Any]: + text_a, text_b = parse_text_and_label(data, self.mode, self.src_txt, + self.tgt_txt)[0:2] + + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + return output + + def decode(self, tokens, **kwargs): + """Decode the tokens to real text. + + Args: + tokens: The output tokens from model's `forward` and `generate` + + Returns: + The actual text. + """ + raise NotImplementedError() + + +class NLPTokenizerForRoberta(NLPTokenizer): + + def build_tokenizer(self): + + def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: + import os + for name in os.listdir(model_dir): + full_name = os.path.join(model_dir, name) + if 'roberta' in name and os.path.isdir(full_name): + return full_name + + roberta_tokenizer_dir = get_roberta_tokenizer_dir(self.model_dir) + if roberta_tokenizer_dir: + from transformers import RobertaTokenizer + return RobertaTokenizer.from_pretrained( + roberta_tokenizer_dir, do_lower_case=False) + return super().build_tokenizer() @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. - """ +class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): def __init__(self, model_dir: str, tokenizer=None, - mode=ModeKeys.INFERENCE, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt='tgt_txt', + sequence_length: int = 128, + use_fast: bool = None, **kwargs): + """The tokenizer preprocessor used in text generation. + + Args: + model_dir: The model dir used to initialize the tokenizer. + mode: The mode for the preprocessor. + src_txt: The key of the source sentence. + tgt_txt: The key of the generated sentence. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + use_fast: Whether to use the fast tokenizer or not. + **kwargs: Extra args input into the tokenizer's __call__ method. 
+ """ + if 'first_sequence' in kwargs: + src_txt = kwargs.pop('first_sequence') + super().__init__(mode, src_txt, tgt_txt) kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - @staticmethod - def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: - import os - for name in os.listdir(model_dir): - full_name = os.path.join(model_dir, name) - if 'roberta' in name and os.path.isdir(full_name): - return full_name - - def build_tokenizer(self, model_dir: str): - roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) - if roberta_tokenizer_dir: - from transformers import RobertaTokenizer - return RobertaTokenizer.from_pretrained( - roberta_tokenizer_dir, do_lower_case=False) - return super().build_tokenizer(model_dir) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - if self._mode == ModeKeys.INFERENCE: - return super().__call__(data) - src_rst = super().__call__(data['src_txt']) - src_input_ids = src_rst['input_ids'] - src_attention_mask = src_rst['attention_mask'] - if 'tgt_txt' in data: - labels = super().__call__(data['tgt_txt'])['input_ids'] - else: - labels = src_input_ids[1:] - src_input_ids = src_input_ids[:-1] - src_attention_mask = src_attention_mask[:-1] + kwargs['max_length'] = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizerForRoberta( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + + def decode(self, tokens, **kwargs): + """Decode the tokens to real text. + + Args: + tokens: The output tokens from model's `forward` and `generate` + + Returns: + The actual text. + """ + return self.nlp_tokenizer.tokenizer.decode(tokens, **kwargs) + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + + output = self.nlp_tokenizer(sequence1, **kwargs) + + if self.mode != ModeKeys.INFERENCE: + if sequence2 is not None: + labels = self.nlp_tokenizer(sequence2)['input_ids'] + src_input_ids = output['input_ids'] + src_attention_mask = output['attention_mask'] + else: + labels = output['input_ids'][1:] + src_input_ids = output['input_ids'][:-1] + src_attention_mask = output['attention_mask'][:-1] + + output = { + 'input_ids': src_input_ids, + 'attention_mask': src_attention_mask, + 'labels': labels, + } + return output + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(TextGenerationPreprocessorBase): + """The jieba tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt=None): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(mode, src_txt, tgt_txt) + if self.tgt_txt is not None: + logger.warn( + f'TextGenerationJiebaPreprocessor currently does not support training, ' + f'the {self.tgt_txt} of the tgt_txt field will be ignored.') + self.src_txt = src_txt + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def decode(self, tokens, **kwargs): + """Decode the tokens to real text. + + Args: + tokens: The output tokens from model's `forward` and `generate` + Returns: + The actual text. + """ + return self.tokenizer.detokenize(tokens) + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ return { - 'input_ids': src_input_ids, - 'attention_mask': src_attention_mask, - 'labels': labels, + 'input_ids': + torch.tensor(self.tokenizer.tokenize(sequence1)).unsqueeze_(0) } + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) +class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt='tgt_txt', + use_fast: bool = None, + sequence_length: int = 128, + **kwargs): + """The preprocessor for text to text generation task, based on transformers' tokenizer. + + Args: + model_dir: The model dir used to initialize the tokenizer. + src_txt: The key of the first sequence. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + mode: The mode for the preprocessor. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ + super().__init__( + model_dir, + mode=mode, + src_txt=src_txt, + tgt_txt=tgt_txt, + sequence_length=sequence_length, + use_fast=use_fast, + truncation=kwargs.pop('truncation', True), + padding=kwargs.pop('padding', 'max_length'), + return_token_type_ids=kwargs.pop('return_token_type_ids', False), + **kwargs) diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py index 2ada6892..574b94ae 100644 --- a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py @@ -1,67 +1,78 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union +from typing import Any, Dict from transformers import AutoTokenizer from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPTokenizerPreprocessorBase @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_ranking) -class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in passage ranking model. 
- """ +class TextRankingTransformersPreprocessor(Preprocessor): def __init__(self, model_dir: str, - mode=ModeKeys.INFERENCE, - *args, + mode: str = ModeKeys.INFERENCE, + first_sequence='source_sentence', + second_sequence='sentences_to_compare', + label='labels', + qid='qid', + sequence_length=128, **kwargs): - """preprocess the data + """The tokenizer preprocessor class for the text ranking preprocessor. Args: - model_dir (str): model path + model_dir(str, `optional`): The model dir used to parse the label mapping, can be None. + first_sequence(str, `optional`): The key of the first sequence. + second_sequence(str, `optional`): The key of the second sequence. + label(str, `optional`): The keys of the label columns, default `labels`. + qid(str, `optional`): The qid info. + mode: The mode for the preprocessor. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. """ - super().__init__(model_dir, mode=mode, *args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'source_sentence') - self.second_sequence = kwargs.pop('second_sequence', - 'sentences_to_compare') - self.sequence_length = kwargs.pop('sequence_length', 128) - + super().__init__(mode) + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.label = label + self.qid = qid + self.sequence_length = sequence_length self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: - if isinstance(data, tuple): - sentence1, sentence2 = data - elif isinstance(data, dict): - sentence1 = data.get(self.first_sequence) - sentence2 = data.get(self.second_sequence) + @type_assert(object, dict) + def __call__(self, + data: Dict, + padding='max_length', + truncation=True, + **kwargs) -> Dict[str, Any]: + sentence1 = data.get(self.first_sequence) + sentence2 = data.get(self.second_sequence) + labels = data.get(self.label) + qid = data.get(self.qid) + if isinstance(sentence2, str): sentence2 = [sentence2] if isinstance(sentence1, str): sentence1 = [sentence1] sentence1 = sentence1 * len(sentence2) - - max_seq_length = self.sequence_length + kwargs['max_length'] = kwargs.get( + 'max_length', kwargs.pop('sequence_length', self.sequence_length)) + if 'return_tensors' not in kwargs: + kwargs['return_tensors'] = 'pt' feature = self.tokenizer( sentence1, sentence2, - padding='max_length', - truncation=True, - max_length=max_seq_length, - return_tensors='pt') - if 'labels' in data: - labels = data['labels'] + padding=padding, + truncation=truncation, + **kwargs) + if labels is not None: feature['labels'] = labels - if 'qid' in data: - qid = data['qid'] + if qid is not None: feature['qid'] = qid return feature diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index a7616736..1d42324d 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -1,28 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
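As a quick, hedged illustration of the refactored text-ranking preprocessor above: the sketch below constructs it directly and feeds a dict keyed by its new `first_sequence`/`second_sequence` defaults. The local model path is a placeholder for illustration, not something shipped in this change.

    from modelscope.preprocessors.nlp.text_ranking_preprocessor import \
        TextRankingTransformersPreprocessor

    # Hypothetical local checkpoint directory with a transformers tokenizer.
    preprocessor = TextRankingTransformersPreprocessor(
        model_dir='/path/to/text-ranking-model', sequence_length=128)

    features = preprocessor({
        'source_sentence': 'how long does shipping take?',
        'sentences_to_compare': [
            'Shipping usually takes 3-5 business days.',
            'Our store opens at 9 am.',
        ],
    })
    # 'input_ids' and 'attention_mask' come back as padded torch tensors,
    # one row per candidate sentence; 'labels' and 'qid' are only attached
    # when those keys are present in the input dict.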
-from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import torch from modelscope.metainfo import Preprocessors from modelscope.outputs import OutputKeys +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPBasePreprocessor, NLPTokenizerPreprocessorBase +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + +logger = get_logger(__name__) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) -class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): +class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): """The preprocessor used to turn a single sentence to a labeled token-classification dict. """ - def __init__(self, **kwargs): - self.first_sequence: str = kwargs.pop('first_sequence', 'tokens') - self.label = kwargs.pop('label', OutputKeys.LABELS) + def __init__(self, generated_sentence='tokens', generated_label='labels'): + super().__init__() + self.generated_sentence = generated_sentence + self.generated_label = generated_label def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: data = data.split(' ') @@ -43,9 +50,134 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): chars, labels = produce_train_sample(data) return { - self.first_sequence: chars, - self.label: labels, + self.generated_sentence: chars, + self.generated_label: labels, + } + + +class TokenClassificationPreprocessorBase(Preprocessor): + + def __init__( + self, + model_dir: str = None, + first_sequence: str = None, + label: str = 'label', + label2id: Dict = None, + label_all_tokens: bool = False, + mode: str = ModeKeys.INFERENCE, + ): + """The base class for all the token-classification tasks. + + Args: + model_dir: The model dir to build the the label2id mapping. + If None, user need to pass in the `label2id` param. + first_sequence: The key for the text(token) column if input type is a dict. + label: The key for the label column if input type is a dict and the mode is `training` or `evaluation`. + label2id: The label2id mapping, if not provided, you need to specify the model_dir to search the mapping + from config files. + label_all_tokens: If label exists in the dataset, the preprocessor will try to label the tokens. + If label_all_tokens is true, all non-initial sub-tokens will get labels like `I-xxx`, + or else the labels will be filled with -100, default False. + mode: The preprocessor mode. + """ + super().__init__(mode) + self.model_dir = model_dir + self.first_sequence = first_sequence + self.label = label + self.label2id = label2id + self.label_all_tokens = label_all_tokens + if self.label2id is None and self.model_dir is not None: + self.label2id = parse_label_mapping(self.model_dir) + + @property + def id2label(self): + """Return the id2label mapping according to the label2id mapping. + + @return: The id2label mapping if exists. 
+ """ + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def labels_to_id(self, labels_list, word_ids): + # align the labels with tokenized text + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index(label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + return label_ids + + def _tokenize_text(self, sequence1, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + @type_assert(object, (str, tuple, dict)) + def __call__(self, data: Union[dict, tuple, str], + **kwargs) -> Dict[str, Any]: + text, _, label = parse_text_and_label( + data, self.mode, self.first_sequence, label=self.label) + outputs, word_ids = self._tokenize_text(text, **kwargs) + if label is not None: + label_ids = self.labels_to_id(label, word_ids) + outputs[OutputKeys.LABELS] = label_ids + outputs = { + k: np.array(v) if isinstance(v, list) else v + for k, v in outputs.items() } + if self.mode == ModeKeys.INFERENCE: + outputs['text'] = text + return outputs + + +class NLPTokenizerForLSTM(NLPTokenizer): + + def build_tokenizer(self): + if self.model_type == 'lstm': + from transformers import AutoTokenizer + return AutoTokenizer.from_pretrained( + self.model_dir, use_fast=self.use_fast, tokenizer_type='bert') + else: + return super().build_tokenizer() + + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class @PREPROCESSORS.register_module( @@ -54,227 +186,238 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): +class TokenClassificationTransformersPreprocessor( + TokenClassificationPreprocessorBase): """The tokenizer preprocessor used in normal NER task. """ - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data + def __init__(self, + model_dir: str = None, + first_sequence: str = None, + label: str = 'label', + label2id: Dict = None, + label_all_tokens: bool = False, + mode: str = ModeKeys.INFERENCE, + sequence_length=128, + use_fast=None, + **kwargs): + """ Args: - model_dir (str): model path + use_fast: Whether to use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. 
+ **kwargs: Extra args input into the tokenizer's __call__ method. """ + super().__init__(model_dir, first_sequence, label, label2id, + label_all_tokens, mode) + self.is_lstm_model = 'lstm' in model_dir + model_type = None + if self.is_lstm_model: + model_type = 'lstm' + elif model_dir is not None: + model_type = get_model_type(model_dir) kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - self.sequence_length = kwargs['max_length'] - self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, mode=mode, **kwargs) - - if 'is_split_into_words' in kwargs: - self.tokenize_kwargs['is_split_into_words'] = kwargs.pop( - 'is_split_into_words') - else: - self.tokenize_kwargs[ - 'is_split_into_words'] = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - if 'label2id' in kwargs: - kwargs.pop('label2id') + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = sequence_length + kwargs['add_special_tokens'] = model_type != 'lstm' + self.nlp_tokenizer = NLPTokenizerForLSTM( + model_dir=model_dir, + model_type=model_type, + use_fast=use_fast, + tokenize_kwargs=kwargs) - @type_assert(object, (str, dict)) - def __call__(self, data: Union[dict, str]) -> Dict[str, Any]: - """process the raw input data + def _tokenize_text(self, text: Union[str, List[str]], **kwargs): + tokens = text + if self.mode != ModeKeys.INFERENCE: + assert isinstance(tokens, list), 'Input needs to be lists in training and evaluating,' \ + 'because the length of the words and the labels need to be equal.' + is_split_into_words = self.nlp_tokenizer.get_tokenizer_kwarg( + 'is_split_into_words', False) + if is_split_into_words: + tokens = list(tokens) - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ + if is_split_into_words and self.mode == ModeKeys.INFERENCE: + encodings, word_ids = self._tokenize_text_by_words( + tokens, **kwargs) + elif self.nlp_tokenizer.tokenizer.is_fast: + encodings, word_ids = self._tokenize_text_with_fast_tokenizer( + tokens, **kwargs) + else: + encodings, word_ids = self._tokenize_text_with_slow_tokenizer( + tokens, **kwargs) - # preprocess the data for the model input - text = None - labels_list = None - if isinstance(data, str): - # for inference inputs without label - text = data - elif isinstance(data, dict): - # for finetune inputs with label - text = data.get(self.first_sequence) - labels_list = data.get(self.label) - if isinstance(text, list): - self.tokenize_kwargs['is_split_into_words'] = True - - if self._mode == ModeKeys.INFERENCE: - self.tokenize_kwargs['add_special_tokens'] = False + if self.mode == ModeKeys.INFERENCE: + for key in encodings.keys(): + encodings[key] = torch.tensor(encodings[key]).unsqueeze(0) + else: + encodings.pop('offset_mapping', None) + return encodings, word_ids + def _tokenize_text_by_words(self, tokens, **kwargs): input_ids = [] label_mask = [] offset_mapping = [] - token_type_ids = [] - if self.tokenize_kwargs[ - 'is_split_into_words'] and self._mode == ModeKeys.INFERENCE: - for offset, token in enumerate(list(text)): - subtoken_ids = self.tokenizer.encode(token, - **self.tokenize_kwargs) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)]) + attention_mask = [] + for offset, token in enumerate(tokens): + subtoken_ids = self.nlp_tokenizer.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.nlp_tokenizer.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + attention_mask.extend([1] * len(subtoken_ids)) + label_mask.extend([True] + [False] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + + padding = kwargs.get('padding', + self.nlp_tokenizer.get_tokenizer_kwarg('padding')) + max_length = kwargs.get( + 'max_length', + kwargs.get('sequence_length', + self.nlp_tokenizer.get_tokenizer_kwarg('max_length'))) + special_token = 1 if self.nlp_tokenizer.get_tokenizer_kwarg( + 'add_special_tokens') else 0 + if len(label_mask) > max_length - 2 * special_token: + label_mask = label_mask[:(max_length - 2 * special_token)] + input_ids = input_ids[:(max_length - 2 * special_token)] + offset_mapping = offset_mapping[:sum(label_mask)] + if padding == 'max_length': + label_mask = [False] * special_token + label_mask + \ + [False] * (max_length - len(label_mask) - special_token) + offset_mapping = offset_mapping + [(0, 0)] * ( + max_length - len(offset_mapping)) + input_ids = [self.nlp_tokenizer.tokenizer.cls_token_id] * special_token + input_ids + \ + [self.nlp_tokenizer.tokenizer.sep_token_id] * special_token + \ + [self.nlp_tokenizer.tokenizer.pad_token_id] * (max_length - len(input_ids) - 2 * special_token) + attention_mask = attention_mask + [1] * ( + special_token * 2) + [0] * ( + max_length - len(attention_mask) - 2 * special_token) else: - if self.tokenizer.is_fast: - encodings = self.tokenizer( - text, return_offsets_mapping=True, **self.tokenize_kwargs) - attention_mask = encodings['attention_mask'] - if 'token_type_ids' in encodings: - token_type_ids = encodings['token_type_ids'] - input_ids = encodings['input_ids'] - word_ids = 
encodings.word_ids() - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = ( - offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) - else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) + label_mask = [False] * special_token + label_mask + \ + [False] * special_token + input_ids = [self.nlp_tokenizer.tokenizer.cls_token_id] * special_token + input_ids + \ + [self.nlp_tokenizer.tokenizer.sep_token_id] * special_token + attention_mask = attention_mask + [1] * (special_token * 2) + + encodings = { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping, + } + return encodings, None + + def _tokenize_text_with_fast_tokenizer(self, tokens, **kwargs): + is_split_into_words = isinstance(tokens, list) + encodings = self.nlp_tokenizer( + tokens, + return_offsets_mapping=True, + is_split_into_words=is_split_into_words, + **kwargs) + label_mask = [] + word_ids = encodings.word_ids() + offset_mapping = [] + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(False) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(False) + if not is_split_into_words: + offset_mapping[-1] = (offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) else: - encodings = self.tokenizer(text, **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( - text) - - if self._mode == ModeKeys.INFERENCE: - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - - input_ids = torch.tensor(input_ids).unsqueeze(0) - attention_mask = torch.tensor(attention_mask).unsqueeze(0) - label_mask = torch.tensor( - label_mask, dtype=torch.bool).unsqueeze(0) - - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } + label_mask.append(True) + if is_split_into_words: + offset_mapping.append((word_ids[i], word_ids[i] + 1)) + else: + offset_mapping.append(encodings['offset_mapping'][i]) + + padding = self.nlp_tokenizer.get_tokenizer_kwarg('padding') + if padding == 'max_length': + offset_mapping = offset_mapping + [(0, 0)] * ( + len(label_mask) - len(offset_mapping)) + encodings['offset_mapping'] = offset_mapping + encodings['label_mask'] = label_mask + return encodings, word_ids + + def _tokenize_text_with_slow_tokenizer(self, tokens, **kwargs): + assert self.mode == ModeKeys.INFERENCE and isinstance(tokens, str), \ + 'Slow tokenizer now only support str input in inference mode. If you are training models, ' \ + 'please consider using the fast tokenizer.' 
+ word_ids = None + encodings = self.nlp_tokenizer( + tokens, is_split_into_words=False, **kwargs) + tokenizer_name = self.nlp_tokenizer.get_tokenizer_class() + method = 'get_label_mask_and_offset_mapping_' + tokenizer_name + if not hasattr(self, method): + raise RuntimeError( + f'No `{method}` method defined for ' + f'tokenizer {tokenizer_name}, please use a fast tokenizer instead, or ' + f'try to implement a `{method}` method') + label_mask, offset_mapping = getattr(self, method)(tokens) + padding = self.nlp_tokenizer.get_tokenizer_kwarg('padding') + max_length = self.nlp_tokenizer.get_tokenizer_kwarg('max_length') + special_token = 1 if self.nlp_tokenizer.get_tokenizer_kwarg( + 'add_special_tokens') else 0 + if len(label_mask) > max_length - 2 * special_token: + label_mask = label_mask[:(max_length - 2 * special_token)] + offset_mapping = offset_mapping[:sum(label_mask)] + if padding == 'max_length': + label_mask = [False] * special_token + label_mask + \ + [False] * (max_length - len(label_mask) - special_token) + offset_mapping = offset_mapping + [(0, 0)] * ( + max_length - len(offset_mapping)) else: - output = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) - - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) - else: - label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels - output = { - k: np.array(v) if isinstance(v, list) else v - for k, v in output.items() - } - return output + label_mask = [False] * special_token + label_mask + \ + [False] * special_token + encodings['offset_mapping'] = offset_mapping + encodings['label_mask'] = label_mask + return encodings, word_ids - def get_tokenizer_class(self): - tokenizer_class = self.tokenizer.__class__.__name__ - if tokenizer_class.endswith( - 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': - tokenizer_class = tokenizer_class[:-4] - return tokenizer_class + def get_label_mask_and_offset_mapping_BertTokenizer(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.nlp_tokenizer.tokenizer.tokenize(text) + offset = 0 + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) + else: + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + + return label_mask, offset_mapping - def get_label_mask_and_offset_mapping(self, text): + def get_label_mask_and_offset_mapping_XLMRobertaTokenizer(self, text): label_mask = [] offset_mapping = 
[] - tokens = self.tokenizer.tokenize(text) + tokens = self.nlp_tokenizer.tokenizer.tokenize(text) offset = 0 - if self.get_tokenizer_class() == 'BertTokenizer': - for token in tokens: - is_start = (token[:2] != '##') - if is_start: - label_mask.append(True) - else: - token = token[2:] - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end last_is_blank = False - for token in tokens: - is_start = (token[0] == '▁') - if is_start: - token = token[1:] - label_mask.append(True) - if len(token) == 0: - last_is_blank = True - continue - else: - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if last_is_blank or is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - last_is_blank = False - else: - raise NotImplementedError - return label_mask, offset_mapping diff --git a/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py index a356cea7..f2ea73f6 100644 --- a/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py @@ -9,19 +9,23 @@ from modelscope.outputs import OutputKeys from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .token_classification_preprocessor import TokenClassificationPreprocessor +from .token_classification_preprocessor import \ + TokenClassificationTransformersPreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.thai_ner_tokenizer) -class NERPreprocessorThai(TokenClassificationPreprocessor): +class NERPreprocessorThai(TokenClassificationTransformersPreprocessor): - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: from pythainlp import word_tokenize - + if isinstance(data, str): + text = data + else: + text = data[self.first_sequence] segmented_data = ' '.join([ - w.strip(' ') for w in word_tokenize(text=data, engine='newmm') + w.strip(' ') for w in word_tokenize(text=text, engine='newmm') if w.strip(' ') != '' ]) output = super().__call__(segmented_data) @@ -31,12 +35,17 @@ class NERPreprocessorThai(TokenClassificationPreprocessor): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.thai_wseg_tokenizer) -class WordSegmentationPreprocessorThai(TokenClassificationPreprocessor): +class WordSegmentationPreprocessorThai( + TokenClassificationTransformersPreprocessor): - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[Dict, 
str]) -> Dict[str, Any]: import regex - data = regex.findall(r'\X', data) + if isinstance(data, str): + text = data + else: + text = data[self.first_sequence] + data = regex.findall(r'\X', text) data = ' '.join([char for char in data]) output = super().__call__(data) diff --git a/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py index f8970d1a..c68d6c3b 100644 --- a/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py @@ -9,19 +9,23 @@ from modelscope.outputs import OutputKeys from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .token_classification_preprocessor import TokenClassificationPreprocessor +from .token_classification_preprocessor import \ + TokenClassificationTransformersPreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.viet_ner_tokenizer) -class NERPreprocessorViet(TokenClassificationPreprocessor): +class NERPreprocessorViet(TokenClassificationTransformersPreprocessor): - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: from pyvi import ViTokenizer - + if isinstance(data, str): + text = data + else: + text = data[self.first_sequence] seg_words = [ - t.strip(' ') for t in ViTokenizer.tokenize(data).split(' ') + t.strip(' ') for t in ViTokenizer.tokenize(text).split(' ') if t.strip(' ') != '' ] raw_words = [] diff --git a/modelscope/preprocessors/nlp/transformers_tokenizer.py b/modelscope/preprocessors/nlp/transformers_tokenizer.py new file mode 100644 index 00000000..2cec4b93 --- /dev/null +++ b/modelscope/preprocessors/nlp/transformers_tokenizer.py @@ -0,0 +1,112 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from collections.abc import Mapping + +import json +from transformers import AutoTokenizer + +from modelscope.metainfo import Models +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModeKeys +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = [ + 'NLPTokenizer', +] + + +class NLPTokenizer: + + def __init__(self, + model_dir: str = None, + model_type=None, + use_fast: bool = None, + tokenize_kwargs=None): + """The transformers tokenizer preprocessor base class. + + Any nlp preprocessor which uses the huggingface tokenizer can inherit from this class. + + Args: + model_dir (str, `optional`): The local path containing the files used to create a preprocessor. + use_fast (str, `optional`): Use the fast version of tokenizer + tokenize_kwargs (dict, `optional`): These args will be directly fed into the tokenizer. 
+ """ + self.model_dir = model_dir + self.model_type = model_type + self.tokenize_kwargs = tokenize_kwargs + if self.tokenize_kwargs is None: + self.tokenize_kwargs = {} + self._use_fast = use_fast + self._tokenizer = None + + @property + def tokenizer(self): + if self._tokenizer is None: + self._tokenizer = self.build_tokenizer() + return self._tokenizer + + @property + def use_fast(self): + if self._use_fast is None: + if self._use_fast is None and self.model_dir is None: + self._use_fast = False + elif self._use_fast is None and os.path.isfile( + os.path.join(self.model_dir, 'tokenizer_config.json')): + with open( + os.path.join(self.model_dir, 'tokenizer_config.json'), + 'r', + encoding='utf-8') as f: + json_config = json.load(f) + self._use_fast = json_config.get('use_fast') + self._use_fast = False if self._use_fast is None else self._use_fast + return self._use_fast + + def build_tokenizer(self): + """Build a tokenizer by the model type. + + NOTE: The fast tokenizers have a multi-thread problem, use it carefully. + + Returns: + The initialized tokenizer. + """ + # fast version lead to parallel inference failed + model_type = self.model_type + model_dir = self.model_dir + if model_type == Models.deberta_v2: + from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast + tokenizer = DebertaV2TokenizerFast if self.use_fast else DebertaV2Tokenizer + return tokenizer.from_pretrained( + model_dir) if model_dir is not None else tokenizer() + + if model_type in (Models.structbert, Models.gpt3, Models.palm, + Models.plug): + from transformers import BertTokenizer, BertTokenizerFast + tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer + return tokenizer.from_pretrained( + model_dir) if model_dir is not None else tokenizer() + elif model_type == Models.veco: + from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast + tokenizer = XLMRobertaTokenizerFast if self.use_fast else XLMRobertaTokenizer + return tokenizer.from_pretrained( + model_dir) if model_dir is not None else tokenizer() + + assert model_dir is not None + return AutoTokenizer.from_pretrained(model_dir, use_fast=self.use_fast) + + def __call__(self, text, text_pair=None, **kwargs): + kwargs['max_length'] = kwargs.get('max_length', + kwargs.pop('sequence_length', None)) + if kwargs['max_length'] is None: + kwargs.pop('max_length') + tokenize_kwargs = {k: v for k, v in self.tokenize_kwargs.items()} + tokenize_kwargs.update(kwargs) + kwargs.update(self.tokenize_kwargs) + return self.tokenizer(text, text_pair, **tokenize_kwargs) + + def get_tokenizer_kwarg(self, key, default_value=None): + if key in self.tokenize_kwargs: + return self.tokenize_kwargs[key] + return self.tokenizer.init_kwargs.get(key, default_value) diff --git a/modelscope/preprocessors/nlp/utils.py b/modelscope/preprocessors/nlp/utils.py new file mode 100644 index 00000000..bc097f3e --- /dev/null +++ b/modelscope/preprocessors/nlp/utils.py @@ -0,0 +1,100 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
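Before the helper functions below, a minimal sketch of how the new NLPTokenizer wrapper is intended to be driven; the checkpoint path is assumed for illustration, and the get_model_type lookup mirrors what the preprocessors above do.

    from modelscope.preprocessors.nlp.transformers_tokenizer import NLPTokenizer
    from modelscope.utils.hub import get_model_type

    model_dir = '/path/to/structbert-model'  # hypothetical local checkpoint
    tokenizer = NLPTokenizer(
        model_dir=model_dir,
        model_type=get_model_type(model_dir),  # same lookup the preprocessors use
        use_fast=False,
        tokenize_kwargs={'padding': 'max_length', 'max_length': 32})

    encoded = tokenizer('This is a test', return_tensors='pt')
    # tokenize_kwargs act as call-time defaults; per-call kwargs such as
    # return_tensors are merged on top of them inside __call__.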
+ +import os +from collections.abc import Mapping +from typing import Any, Dict, List, Tuple, Union + +import json +import numpy as np +from transformers import AutoTokenizer + +from modelscope.metainfo import Models +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.base import Preprocessor +from modelscope.utils.constant import ModeKeys +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['parse_text_and_label', 'labels_to_id'] + + +def parse_text_and_label(data, + mode, + first_sequence=None, + second_sequence=None, + label=None): + """Parse the input and return the sentences and labels. + + When input type is tuple or list and its size is 2: + If the pair param is False, data will be parsed as the first_sentence and the label, + else it will be parsed as the first_sentence and the second_sentence. + + Args: + data: The input data. + mode: The mode of the preprocessor + first_sequence: The key of the first sequence + second_sequence: The key of the second sequence + label: The key of the label + Returns: + The sentences and labels tuple. + """ + text_a, text_b, labels = None, None, None + if isinstance(data, str): + text_a = data + elif isinstance(data, tuple) or isinstance(data, list): + if len(data) == 3: + text_a, text_b, labels = data + elif len(data) == 2: + if mode == ModeKeys.INFERENCE: + text_a, text_b = data + else: + text_a, labels = data + elif isinstance(data, Mapping): + text_a = data.get(first_sequence) + text_b = data.get(second_sequence) + if label is None or isinstance(label, str): + labels = data.get(label) + else: + labels = [data.get(lb) for lb in label] + return text_a, text_b, labels + + +def labels_to_id(labels, output, label2id=None): + """Turn the labels to id with the type int or float. + + If the original label's type is str or int, the label2id mapping will try to convert it to the final label. + If the original label's type is float, or the label2id mapping does not exist, + the original label will be returned. + + Args: + label2id: An extra label2id mapping. If not provided, the label will not be translated to ids. + labels: The input labels. + output: The label id. + + Returns: + The final labels. + """ + + def label_can_be_mapped(label): + return isinstance(label, str) or isinstance(label, int) + + try: + if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ + and label2id is not None: + output[OutputKeys.LABELS] = [ + label2id[label] if label in label2id else label2id[str(label)] + for label in labels + ] + elif label_can_be_mapped(labels) and label2id is not None: + output[OutputKeys.LABELS] = label2id[ + labels] if labels in label2id else label2id[str(labels)] + elif labels is not None: + output[OutputKeys.LABELS] = labels + except KeyError as e: + logger.error( + f'Label {labels} cannot be found in the label mapping {label2id},' + f'which comes from the user input or the configuration files. ' + f'Please consider matching your labels with this mapping.') + raise e diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py new file mode 100644 index 00000000..a7d87674 --- /dev/null +++ b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py @@ -0,0 +1,74 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
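The two helpers above are small and pure, so their behaviour is easiest to show with a short sketch; the column names and label mapping here are made up for illustration.

    from modelscope.outputs import OutputKeys
    from modelscope.preprocessors.nlp.utils import labels_to_id, parse_text_and_label
    from modelscope.utils.constant import ModeKeys

    sample = {
        'premise': 'A dog runs.',
        'hypothesis': 'An animal moves.',
        'label': 'entailment',
    }
    text_a, text_b, label = parse_text_and_label(
        sample,
        ModeKeys.TRAIN,
        first_sequence='premise',
        second_sequence='hypothesis',
        label='label')
    # -> ('A dog runs.', 'An animal moves.', 'entailment')

    output = {}
    labels_to_id(
        label, output,
        label2id={'entailment': 0, 'neutral': 1, 'contradiction': 2})
    # output[OutputKeys.LABELS] == 0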
+ +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type +from .transformers_tokenizer import NLPTokenizer + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationTransformersPreprocessor(Preprocessor): + """The tokenizer preprocessor used in zero shot classification. + """ + + def __init__(self, + model_dir: str, + first_sequence=None, + mode=ModeKeys.INFERENCE, + sequence_length=512, + use_fast=None, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + self.sequence_length = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + self.first_sequence = first_sequence + super().__init__(mode=mode) + + def __call__(self, + data: Union[str, Dict], + hypothesis_template: str, + candidate_labels: list, + padding=True, + truncation=True, + truncation_strategy='only_first', + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str or dict): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self._mode == ModeKeys.INFERENCE else None + + features = self.nlp_tokenizer( + pairs, + padding=padding, + truncation=truncation, + truncation_strategy=truncation_strategy, + **kwargs) + return features diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py deleted file mode 100644 index eb3c4b37..00000000 --- a/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict, Union - -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in zero shot classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict], hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str or dict): a sentence - Example: - 'you are so handsome.' 
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-        if isinstance(data, dict):
-            data = data.get(self.first_sequence)
-
-        pairs = [[data, hypothesis_template.format(label)]
-                 for label in candidate_labels]
-
-        features = self.tokenizer(
-            pairs,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            truncation_strategy='only_first',
-            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None)
-        return features
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index 89aa39ba..91b4ef8b 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -6,11 +6,12 @@ import numpy as np
 import torch
 
 from modelscope import __version__
-from modelscope.metainfo import Hooks
-from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint
+from modelscope.metainfo import Hooks, Pipelines
+from modelscope.utils.checkpoint import (load_checkpoint, save_checkpoint,
+                                         save_configuration)
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import get_dist_info, is_master
+from modelscope.utils.torch_utils import is_master
 from .builder import HOOKS
 from .hook import Hook
 from .priority import Priority
@@ -28,17 +29,25 @@ class CheckpointHook(Hook):
         save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
         save_last (bool): Whether to save the last checkpoint. Default: True.
         checkpoint_file (str): The checkpoint file to be loaded.
+        load_all_state (bool): Whether to load all states (optimizer, epoch, lr_scheduler, random_state, etc.)
+            when loading an old training state file. If False, only the model's state dict will be loaded.
+        max_checkpoint_num (int): The max number of checkpoint files, default None, which means never delete
+            anything. If the number exceeds the limit, earlier checkpoints will be deleted first.
""" PRIORITY = Priority.LOW - def __init__(self, - interval=0, - by_epoch=True, - save_optimizer=True, - save_dir=None, - save_last=True, - checkpoint_file=None): + def __init__( + self, + interval=0, + by_epoch=True, + save_optimizer=True, + save_dir=None, + save_last=True, + checkpoint_file=None, + load_all_state=True, + max_checkpoint_num=None, + ): self.interval = interval self.by_epoch = by_epoch self.save_optimizer = save_optimizer @@ -47,6 +56,11 @@ class CheckpointHook(Hook): self.save_last = save_last self.rng_state = None self.need_load_rng_state = False + self.load_all_state = load_all_state + self.max_checkpoint_num = None + if max_checkpoint_num is not None: + self.max_checkpoint_num = max(int(max_checkpoint_num), 1) + self.history_checkpoints = [] def before_run(self, trainer): if not self.save_dir: @@ -65,9 +79,10 @@ class CheckpointHook(Hook): if self.checkpoint_file is not None and os.path.isfile( self.checkpoint_file): - meta = self.load_checkpoint(self.checkpoint_file, trainer) + meta = self.load_checkpoint(self.checkpoint_file, trainer, + self.load_all_state) self.rng_state = meta.get('rng_state') - self.need_load_rng_state = True + self.need_load_rng_state = self.load_all_state def before_train_iter(self, trainer): if self.need_load_rng_state: @@ -95,28 +110,30 @@ class CheckpointHook(Hook): self._save_checkpoint(trainer) @classmethod - def load_checkpoint(cls, filename, trainer): + def load_checkpoint(cls, filename, trainer, load_all_state=True): from modelscope.trainers.parallel.utils import is_parallel if is_parallel(trainer.model): model = trainer.model.module else: model = trainer.model - meta = load_checkpoint(filename, model, - getattr(trainer, 'optimizer', None), - getattr(trainer, 'lr_scheduler', None)) - trainer._epoch = meta.get('epoch', trainer._epoch) - trainer._iter = meta.get('iter', trainer._iter) - trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter) - - for i, hook in enumerate(trainer.hooks): - # hook: Hook - key = f'{hook.__class__}-{i}' - if key in meta and hasattr(hook, 'load_state_dict'): - hook.load_state_dict(meta.get(key, {})) - else: - trainer.logger.warn( - f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.' - ) + meta = load_checkpoint( + filename, model, + getattr(trainer, 'optimizer', None) if load_all_state else None, + getattr(trainer, 'lr_scheduler', None) if load_all_state else None) + if load_all_state: + trainer._epoch = meta.get('epoch', trainer._epoch) + trainer._iter = meta.get('iter', trainer._iter) + trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter) + + for i, hook in enumerate(trainer.hooks): + # hook: Hook + key = f'{hook.__class__}-{i}' + if key in meta and hasattr(hook, 'load_state_dict'): + hook.load_state_dict(meta.get(key, {})) + else: + trainer.logger.warn( + f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.' 
+ ) version = meta.get('modelscope') if version != __version__: @@ -163,6 +180,21 @@ class CheckpointHook(Hook): and not self.by_epoch): self._save_pretrained(trainer) + self.history_checkpoints.append(cur_save_name) + self.remove_obsolete_checkpoints() + + def remove_obsolete_checkpoints(self): + if self.max_checkpoint_num is not None and \ + len(self.history_checkpoints) > self.max_checkpoint_num: + history_checkpoints = [ckpt for ckpt in self.history_checkpoints] + self.history_checkpoints.clear() + for i, ckpt_file in enumerate(history_checkpoints): + if i < len(history_checkpoints) - self.max_checkpoint_num: + if os.path.isfile(ckpt_file): + os.remove(ckpt_file) + else: + self.history_checkpoints.append(ckpt_file) + def _save_pretrained(self, trainer): output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR) from modelscope.trainers.parallel.utils import is_parallel @@ -175,15 +207,53 @@ class CheckpointHook(Hook): config = trainer.cfg.to_dict() # override pipeline by tasks name after finetune done, # avoid case like fill mask pipeline with a text cls task - config['pipeline'] = {'type': config['task']} + if config['task'] in [ + getattr(Pipelines, attr) for attr in dir(Pipelines) + if not attr.startswith('__') + ]: + # TODO a temp fix to avoid pipeline_name and task mismatch + config['pipeline'] = {'type': config['task']} + + class SaveConfig: + + def __init__(self, output_dir, config): + self.output_dir = output_dir + self.config = config + + def __call__(self, _output_dir, _config): + self.config = _config + + def save_config(self): + save_configuration(self.output_dir, self.config) + + save_config_fn = SaveConfig(output_dir, config) if hasattr(model, 'save_pretrained'): + # Now support two binary files: pytorch_model.bin and pytorch_model.pt + default_bin_file = ModelFile.TORCH_MODEL_BIN_FILE + if hasattr( + model, + 'model_dir') and ModelFile.TORCH_MODEL_FILE in os.listdir( + model.model_dir): + default_bin_file = ModelFile.TORCH_MODEL_FILE model.save_pretrained( output_dir, - ModelFile.TORCH_MODEL_BIN_FILE, + default_bin_file, save_function=save_checkpoint, - config=config, + config=save_config_fn.config, + save_config_function=save_config_fn, with_meta=False) + if trainer.train_preprocessor is not None: + trainer.train_preprocessor.save_pretrained( + output_dir, + save_config_fn.config, + save_config_function=save_config_fn) + if trainer.eval_preprocessor is not None: + trainer.eval_preprocessor.save_pretrained( + output_dir, + save_config_fn.config, + save_config_function=save_config_fn) + save_config_fn.save_config() def after_train_iter(self, trainer): if self.by_epoch: @@ -222,6 +292,9 @@ class BestCkptSaverHook(CheckpointHook): save_optimizer (bool): Whether to save optimizer state dict. Default: True. save_dir (str): Output directory to save best checkpoint. restore_best (bool): Whether to restore the best checkpoint after training. + max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything. + If the number exceeding the limit, checkpoints with worse metric will be deleted, which is judged by the + `rule` and `metric_key` arguments. """ PRIORITY = Priority.LOW @@ -235,13 +308,17 @@ class BestCkptSaverHook(CheckpointHook): save_dir=None, save_file_name=None, restore_best=False, - interval=0): + max_checkpoint_num=1, + interval=0, + **kwargs): assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.' 
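For context, the new checkpoint-retention switches above are normally driven from the trainer configuration rather than built by hand; a hedged sketch of such a hook entry follows (the surrounding cfg keys follow the usual modelscope configuration layout and are not part of this diff).

    # Fragment of a trainer configuration dict (sketch only).
    cfg_fragment = {
        'train': {
            'hooks': [{
                'type': 'CheckpointHook',
                'by_epoch': True,
                'interval': 1,
                'max_checkpoint_num': 2,   # keep only the two newest checkpoints
                'load_all_state': False,   # resume weights only, skip optimizer/epoch state
            }]
        }
    }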
super().__init__( interval=interval, by_epoch=by_epoch, save_optimizer=save_optimizer, save_dir=save_dir, + max_checkpoint_num=max_checkpoint_num, + **kwargs, ) self.metric_key = metric_key self.rule = rule @@ -249,6 +326,7 @@ class BestCkptSaverHook(CheckpointHook): self._best_ckpt_file = None self.save_file_name = save_file_name self.restore_best = restore_best + self.history_checkpoints = set() def _should_save(self, trainer): return self._is_best_metric(trainer.metric_values) @@ -284,6 +362,10 @@ class BestCkptSaverHook(CheckpointHook): self.save_dir, f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth' ) + else: + if '.' not in cur_save_name: + cur_save_name = f'{cur_save_name}.pth' + cur_save_name = os.path.join(self.save_dir, cur_save_name) meta = { 'epoch': trainer.epoch, @@ -300,6 +382,28 @@ class BestCkptSaverHook(CheckpointHook): trainer.lr_scheduler, meta) self._best_ckpt_file = cur_save_name self._save_pretrained(trainer) + self.history_checkpoints.add(cur_save_name) + self.remove_obsolete_checkpoints() + + def remove_obsolete_checkpoints(self): + + def extract_metric_from_filename(name1): + metric1 = float(name1.split(self.metric_key)[1].split('.')[0]) + if self.rule == 'max': + return -metric1 + else: + return metric1 + + if self.max_checkpoint_num is not None and \ + len(self.history_checkpoints) > self.max_checkpoint_num: + history_checkpoints = sorted( + self.history_checkpoints, key=extract_metric_from_filename) + self.history_checkpoints.clear() + for i, ckpt_file in enumerate(history_checkpoints): + if i < self.max_checkpoint_num: + self.history_checkpoints.add(ckpt_file) + elif os.path.isfile(ckpt_file): + os.remove(ckpt_file) def state_dict(self): return { diff --git a/modelscope/trainers/nlp/text_generation_trainer.py b/modelscope/trainers/nlp/text_generation_trainer.py index f02faf71..fa6a448f 100644 --- a/modelscope/trainers/nlp/text_generation_trainer.py +++ b/modelscope/trainers/nlp/text_generation_trainer.py @@ -14,8 +14,8 @@ from modelscope.utils.file_utils import func_receive_dict_inputs class TextGenerationTrainer(NlpEpochBasedTrainer): def _decode(self, tokens): - tokenizer = self.eval_preprocessor.tokenizer - return tokenizer.decode(tokens.tolist(), skip_special_tokens=True) + return self.eval_preprocessor.decode( + tokens.tolist(), skip_special_tokens=True) def evaluation_step(self, data): model = self.model.module if self._dist else self.model diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 65e56f9e..5ce7c2f5 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -426,77 +426,51 @@ class NlpTrainerArguments: @TRAINERS.register_module(module_name=Trainers.nlp_base_trainer) class NlpEpochBasedTrainer(EpochBasedTrainer): + """Add code to adapt with nlp models. + + This trainer will accept the information of labels&text keys in the cfg, and then initialize + the nlp models/preprocessors with this information. + + Labels&text key information may be carried in the cfg like this: + + >>> cfg = { + >>> ... 
+ >>> "dataset": { + >>> "train": { + >>> "first_sequence": "text1", + >>> "second_sequence": "text2", + >>> "label": "label", + >>> "labels": [1, 2, 3, 4], + >>> }, + >>> "val": { + >>> "first_sequence": "text3", + >>> "second_sequence": "text4", + >>> "label": "label2", + >>> }, + >>> } + >>> } + + To view some actual finetune examples, please check the test files listed below: + tests/trainers/test_finetune_sequence_classification.py + tests/trainers/test_finetune_token_classification.py + """ - def __init__( - self, - model: Optional[Union[TorchModel, nn.Module, str]] = None, - cfg_file: Optional[str] = None, - cfg_modify_fn: Optional[Callable] = None, - arg_parse_fn: Optional[Callable] = None, - data_collator: Optional[Callable] = None, - train_dataset: Optional[Union[MsDataset, Dataset]] = None, - eval_dataset: Optional[Union[MsDataset, Dataset]] = None, - preprocessor: Optional[Preprocessor] = None, - optimizers: Tuple[torch.optim.Optimizer, - torch.optim.lr_scheduler._LRScheduler] = (None, - None), - model_revision: Optional[str] = DEFAULT_MODEL_REVISION, - **kwargs): - """Add code to adapt with nlp models. - - This trainer will accept the information of labels&text keys in the cfg, and then initialize - the nlp models/preprocessors with this information. - - Labels&text key information may be carried in the cfg like this: - - >>> cfg = { - >>> ... - >>> "dataset": { - >>> "train": { - >>> "first_sequence": "text1", - >>> "second_sequence": "text2", - >>> "label": "label", - >>> "labels": [1, 2, 3, 4] - >>> } - >>> } - >>> } - - - Args: - cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. - - Example: - >>> def cfg_modify_fn(cfg): - >>> cfg.preprocessor.first_sequence= 'text1' - >>> cfg.preprocessor.second_sequence='text2' - >>> return cfg - - To view some actual finetune examples, please check the test files listed below: - tests/trainers/test_finetune_sequence_classification.py - tests/trainers/test_finetune_token_classification.py - """ - - if isinstance(model, str): - model_dir = self.get_or_download_model_dir(model, model_revision) - if cfg_file is None: - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) - else: - assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' 
- model_dir = os.path.dirname(cfg_file) - + def __init__(self, *args, **kwargs): self.label2id = None self.id2label = None self.num_labels = None - self.cfg_modify_fn = cfg_modify_fn - self.cfg = self.rebuild_config(Config.from_file(cfg_file)) + self.train_keys = None + self.eval_keys = None + super().__init__(*args, **kwargs) + def prepare_labels(self, cfg): try: - labels = self.cfg.dataset.train.labels + labels = cfg.dataset.train.labels self.label2id = {label: idx for idx, label in enumerate(labels)} self.id2label = {idx: label for idx, label in enumerate(labels)} self.num_labels = len(labels) except AttributeError: - label2id = parse_label_mapping(model_dir) + label2id = parse_label_mapping(self.model_dir) if label2id is not None: self.label2id = label2id self.id2label = {id: label for label, id in label2id.items()} @@ -514,30 +488,15 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): return {k: v for k, v in input_keys.items() if v is not None} - self.train_keys = build_dataset_keys( - self.cfg.dataset.train if hasattr(self.cfg, 'dataset') - and hasattr(self.cfg.dataset, 'train') else None) - self.eval_keys = build_dataset_keys( - self.cfg.dataset.val if hasattr(self.cfg, 'dataset') - and hasattr(self.cfg.dataset, 'val') else None) + self.train_keys = build_dataset_keys(cfg.safe_get('dataset.train')) + self.eval_keys = build_dataset_keys(cfg.safe_get('dataset.val')) if len(self.eval_keys) == 0: self.eval_keys = self.train_keys - super().__init__( - model=model_dir, - cfg_file=cfg_file, - arg_parse_fn=arg_parse_fn, - data_collator=data_collator, - preprocessor=preprocessor, - optimizers=optimizers, - model_revision=model_revision, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - **kwargs) - def rebuild_config(self, cfg: Config): if self.cfg_modify_fn is not None: cfg = self.cfg_modify_fn(cfg) + self.prepare_labels(cfg) if not hasattr(cfg.model, 'label2id') and not hasattr( cfg.model, 'id2label'): if self.id2label is not None: @@ -571,6 +530,8 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): Returns: The preprocessor instance. """ + + # Compatible with old logic model_args = {} if self.label2id is None else { 'label2id': self.label2id } diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index db5f6a9c..172cd6a8 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -74,12 +74,20 @@ class EpochBasedTrainer(BaseTrainer): containing the optimizer and the scheduler to use. seed (int): The optional random seed for torch, cuda, numpy and random. max_epochs: (int, optional): Total training epochs. + cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. 
+ + Examples of cfg_modify_fn: + >>> def cfg_modify_fn(cfg): + >>> cfg.preprocessor.first_sequence= 'text1' + >>> cfg.preprocessor.second_sequence='text2' + >>> return cfg """ def __init__( self, model: Optional[Union[TorchModel, nn.Module, str]] = None, cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, arg_parse_fn: Optional[Callable] = None, data_collator: Optional[Union[Callable, Dict[str, Callable]]] = None, @@ -96,6 +104,14 @@ class EpochBasedTrainer(BaseTrainer): self._seed = seed set_random_seed(self._seed) + self._metric_values = None + self.optimizers = optimizers + self._mode = ModeKeys.TRAIN + self._hooks: List[Hook] = [] + self._epoch = 0 + self._iter = 0 + self._inner_iter = 0 + if isinstance(model, str): self.model_dir = self.get_or_download_model_dir( model, model_revision) @@ -107,11 +123,11 @@ class EpochBasedTrainer(BaseTrainer): self.model_dir = os.path.dirname(cfg_file) super().__init__(cfg_file, arg_parse_fn) - + self.cfg_modify_fn = cfg_modify_fn # add default config merge_cfg(self.cfg) self.cfg = self.rebuild_config(self.cfg) - + self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) if 'cfg_options' in kwargs: self.cfg.merge_from_dict(kwargs['cfg_options']) @@ -125,110 +141,136 @@ class EpochBasedTrainer(BaseTrainer): else: self.work_dir = self.cfg.train.get('work_dir', './work_dir') - self.train_preprocessor, self.eval_preprocessor = None, None - if isinstance(preprocessor, Preprocessor): - self.train_preprocessor = preprocessor - self.eval_preprocessor = preprocessor - elif isinstance(preprocessor, Mapping): - if not (ConfigKeys.train in preprocessor - or ConfigKeys.val in preprocessor): - raise ValueError( - f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' 
- ) - if ConfigKeys.train in preprocessor: - assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) - self.train_preprocessor = preprocessor[ConfigKeys.train] - if ConfigKeys.val in preprocessor: - assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) - self.eval_preprocessor = preprocessor[ConfigKeys.val] - elif hasattr(self.cfg, ConfigFields.preprocessor - ) and self.cfg.preprocessor is not None: - self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( - ) - - if self.train_preprocessor is not None: - self.train_preprocessor.mode = ModeKeys.TRAIN - if self.eval_preprocessor is not None: - self.eval_preprocessor.mode = ModeKeys.EVAL + self.train_preprocessor, self.eval_preprocessor = self.get_preprocessors( + preprocessor) - if kwargs.get('launcher', None) is not None: - init_dist(kwargs['launcher']) - - _, world_size = get_dist_info() - self._dist = world_size > 1 + self._dist = self.init_dist(kwargs.get('launcher')) + self.device = self.get_device(kwargs.get('device')) - device_name = kwargs.get('device', 'gpu') - if self._dist: - local_rank = get_local_rank() - device_name = f'cuda:{local_rank}' - - self.device = create_device(device_name) self.train_dataset = self.to_task_dataset( train_dataset, mode=ModeKeys.TRAIN, - task_data_config=self.cfg.dataset.get('train', None) if hasattr( - self.cfg, 'dataset') else None, + task_data_config=self.cfg.safe_get('dataset.train'), preprocessor=self.train_preprocessor, **kwargs) self.eval_dataset = self.to_task_dataset( eval_dataset, mode=ModeKeys.EVAL, - task_data_config=self.cfg.dataset.get('val', None) if hasattr( - self.cfg, 'dataset') else None, + task_data_config=self.cfg.safe_get('dataset.val'), preprocessor=self.eval_preprocessor, **kwargs) - self.train_data_collator, self.eval_data_collator = None, None + self.train_data_collator, self.eval_data_collator = self.get_data_collator( + data_collator) + self.metrics = self.get_metrics() + self._max_epochs = kwargs.get('max_epochs', + self.cfg.safe_get('train.max_epochs')) + assert self._max_epochs is not None, 'max_epochs should be provided by the init arguments or configured ' \ + 'in the `train.max_epochs` key in the configuration file.' + self._train_iters_per_epoch = kwargs.get( + 'train_iters_per_epoch', + self.cfg.safe_get('train.train_iters_per_epoch')) + self._eval_iters_per_epoch = kwargs.get( + 'val_iters_per_epoch', + self.cfg.safe_get('evaluation.val_iters_per_epoch')) + self.use_fp16 = kwargs.get('use_fp16', False) + # model placement + self.place_model() + + def place_model(self): + """Place model to device, or to DDP + """ + if self.device.type == 'cuda': + self.model.to(self.device) + if not is_parallel(self.model) and self._dist: + self.model = self.to_parallel(self.model) + + def get_data_collator(self, data_collator): + """Get the data collator for both training and evaluating. + + Args: + data_collator: The input data_collator param. + + Returns: + The train_data_collator and eval_data_collator, can be None. + """ + + train_data_collator, eval_data_collator = None, None if isinstance(data_collator, Mapping): - if not (ConfigKeys.train in data_collator - or ConfigKeys.val in data_collator): - raise ValueError( - f'data_collator must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' 
- ) if ConfigKeys.train in data_collator: assert isinstance(data_collator[ConfigKeys.train], Callable) - self.train_data_collator = data_collator[ConfigKeys.train] + train_data_collator = data_collator[ConfigKeys.train] if ConfigKeys.val in data_collator: assert isinstance(data_collator[ConfigKeys.val], Callable) - self.eval_data_collator = data_collator[ConfigKeys.val] + eval_data_collator = data_collator[ConfigKeys.val] else: collate_fn = default_collate if data_collator is None else data_collator - self.train_data_collator = collate_fn - self.eval_data_collator = collate_fn + train_data_collator = collate_fn + eval_data_collator = collate_fn + return train_data_collator, eval_data_collator - self.metrics = self.get_metrics() - self._metric_values = None - self.optimizers = optimizers - self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) - self._mode = ModeKeys.TRAIN - self._hooks: List[Hook] = [] - self._epoch = 0 - self._iter = 0 - self._inner_iter = 0 - if 'max_epochs' not in kwargs: - assert hasattr( - self.cfg.train, - 'max_epochs'), 'max_epochs is missing in configuration file' - self._max_epochs = self.cfg.train.max_epochs - else: - self._max_epochs = kwargs['max_epochs'] - self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) - self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) - if self._train_iters_per_epoch is None and hasattr( - self.cfg.train, 'train_iters_per_epoch'): - self._train_iters_per_epoch = self.cfg.train.train_iters_per_epoch - if self._eval_iters_per_epoch is None and hasattr( - self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, - 'val_iters_per_epoch'): - self._eval_iters_per_epoch = self.cfg.evaluation.val_iters_per_epoch + def init_dist(self, launcher=None): + """Init dist and returns the dist information. - self.use_fp16 = kwargs.get('use_fp16', False) + Args: + launcher: The launcher info. - # model placement - if self.device.type == 'cuda': - self.model.to(self.device) - if not is_parallel(self.model) and self._dist: - self.model = self.to_parallel(self.model) + Returns: + _dist: If world_size is greater than 1. + """ + if launcher is not None: + init_dist(launcher) + + _, world_size = get_dist_info() + _dist = world_size > 1 + return _dist + + def get_device(self, device=None): + """Get the device information. + + Args: + device: The input device info. + + Returns: + device_name: The final device name. + """ + device_name = device if device is not None else 'gpu' + if self._dist: + local_rank = get_local_rank() + device_name = f'cuda:{local_rank}' + + return create_device(device_name) + + def get_preprocessors(self, preprocessor): + """Get the preprocessors information. + + Args: + preprocessor: The input preprocessor info. + + Returns: + The train_preprocessor and eval_preprocessor, can be None. 
+ """ + train_preprocessor = None + eval_preprocessor = None + if isinstance(preprocessor, Preprocessor): + train_preprocessor = preprocessor + eval_preprocessor = preprocessor + elif isinstance(preprocessor, Mapping): + if ConfigKeys.train in preprocessor: + assert isinstance(preprocessor[ConfigKeys.train], Callable) + train_preprocessor = preprocessor[ConfigKeys.train] + if ConfigKeys.val in preprocessor: + assert isinstance(preprocessor[ConfigKeys.val], Callable) + eval_preprocessor = preprocessor[ConfigKeys.val] + elif hasattr(self.cfg, ConfigFields.preprocessor + ) and self.cfg.preprocessor is not None: + train_preprocessor, eval_preprocessor = self.build_preprocessor() + + if train_preprocessor is not None: + train_preprocessor.mode = ModeKeys.TRAIN + if eval_preprocessor is not None: + eval_preprocessor.mode = ModeKeys.EVAL + return train_preprocessor, eval_preprocessor def rebuild_config(self, cfg: Config): """A method used to rebuild the config, any subclass can override this method. @@ -236,6 +278,8 @@ class EpochBasedTrainer(BaseTrainer): Returns: The rebuilt config """ + if self.cfg_modify_fn is not None: + cfg = self.cfg_modify_fn(cfg) return cfg @property diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index 6e4e7a19..87e0abc7 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -4,6 +4,7 @@ import logging import os import pickle import shutil +from collections.abc import Mapping import torch from torch import distributed as dist @@ -58,7 +59,7 @@ def single_gpu_test(trainer, if progress_with_iters: batch_size = 1 # iteration count else: - if isinstance(data, dict): + if isinstance(data, Mapping): if 'nsentences' in data: batch_size = data['nsentences'] else: @@ -138,7 +139,7 @@ def multi_gpu_test(trainer, result = trainer.evaluation_step(data) results.append(result) - if isinstance(data, dict): + if isinstance(data, Mapping): if 'nsentences' in data: batch_size = data['nsentences'] else: diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 5acaa411..e21c3dcc 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -5,7 +5,7 @@ import os import time from collections import OrderedDict from shutil import copytree, ignore_patterns, rmtree -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, Optional, Union import json import torch @@ -137,11 +137,18 @@ def load_checkpoint(filename, return checkpoint.get('meta', {}) +def save_configuration(target_folder, config: Dict): + if ConfigFields.pipeline not in config: + config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} + cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) + config_file = os.path.join(target_folder, ModelFile.CONFIGURATION) + storage.write(cfg_str.encode(), config_file) + + def save_pretrained(model, target_folder: Union[str, os.PathLike], save_checkpoint_name: str = None, save_function: Callable = None, - config: Optional[dict] = None, **kwargs): """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded @@ -154,11 +161,8 @@ def save_pretrained(model, save_checkpoint_name (str): The checkpoint name to be saved in the target_folder - save_function (Callable, optional): + save_function (Callable): The function to use to save the state dictionary. 
- - config (Optional[dict], optional): - The config for the configuration.json, might not be identical with model.config """ if save_function is None or not isinstance(save_function, Callable): @@ -173,9 +177,6 @@ def save_pretrained(model, raise Exception( 'At least pass in one checkpoint name for saving method') - if config is None: - raise ValueError('Configuration is not valid') - # Clean the folder from a previous save if os.path.exists(target_folder): rmtree(target_folder) @@ -201,10 +202,3 @@ def save_pretrained(model, raise Exception( f'During saving checkpoints, the error of "{type(e).__name__} ' f'with msg {e} throwed') - - # Dump the config to the configuration.json - if ConfigFields.pipeline not in config: - config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} - cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) - config_file = os.path.join(target_folder, ModelFile.CONFIGURATION) - storage.write(cfg_str.encode(), config_file) diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index b3512251..71d820e5 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -3,6 +3,7 @@ # https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py import copy +import dataclasses import os import os.path as osp import platform @@ -10,6 +11,7 @@ import shutil import sys import tempfile import types +from dataclasses import fields from pathlib import Path from types import FunctionType from typing import Dict, Union @@ -337,6 +339,37 @@ class Config: super(Config, self).__setattr__('_filename', _filename) super(Config, self).__setattr__('_text', _text) + def safe_get(self, key_chain: str, default=None): + """Get a value with a key-chain in str format, if key does not exist, the default value will be returned. + + This method is safe to call, and will not edit any value. + + Args: + key_chain: The input key chain, for example: 'train.hooks[0].type' + default: The default value returned when any key does not exist, default None. + + Returns: + The value, or the default value. + """ + try: + keys = key_chain.split('.') + _cfg_dict = self._cfg_dict + for key in keys: + val = None + if '[' in key: + key, val = key.split('[') + val, _ = val.split(']') + _cfg_dict = getattr(_cfg_dict, key) + if val is not None: + _cfg_dict = _cfg_dict[int(val)] + return _cfg_dict + except Exception as e: + logger.debug( + f'Key not valid in Config: {key_chain}, return the default value: {default}' + ) + logger.debug(e) + return default + def dump(self, file: str = None): """Dumps config into a file or returns a string representation of the config. 
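The `safe_get` helper added above walks a dotted key chain (optionally with `[index]` segments such as `train.hooks[0].type`) through the underlying config dict and returns `default` instead of raising when any key along the chain is missing. A minimal usage sketch, assuming the dict-style `Config` constructor inherited from the mmcv-style config this module is adapted from:

    from modelscope.utils.config import Config

    # Hypothetical configuration content, for illustration only.
    cfg = Config({
        'train': {
            'max_epochs': 3,
            'hooks': [{'type': 'CheckpointHook'}, {'type': 'EvaluationHook'}],
        }
    })

    assert cfg.safe_get('train.max_epochs') == 3
    assert cfg.safe_get('train.hooks[1].type') == 'EvaluationHook'
    # A missing key chain does not raise; the supplied default comes back instead.
    assert cfg.safe_get('evaluation.metrics', default=['accuracy']) == ['accuracy']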
@@ -635,16 +668,6 @@ def check_config(cfg: Union[str, ConfigDict], is_training=False): check_attr(ConfigFields.evaluation) -def use_task_specific_params(model, task): - """Update config with summarization specific params.""" - task_specific_params = model.config.task_specific_params - - if task_specific_params is not None: - pars = task_specific_params.get(task, {}) - logger.info(f'using task specific params for {task}: {pars}') - model.config.update(pars) - - class JSONIteratorEncoder(json.JSONEncoder): """Implement this method in order that supporting arbitrary iterators, it returns a serializable object for ``obj``, or calls the base implementation diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 87a6eaff..7841e1fa 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -56,8 +56,10 @@ def read_config(model_id_or_path: str, if not os.path.exists(model_id_or_path): local_path = model_file_download( model_id_or_path, ModelFile.CONFIGURATION, revision=revision) - else: + elif os.path.isdir(model_id_or_path): local_path = os.path.join(model_id_or_path, ModelFile.CONFIGURATION) + elif os.path.isfile(model_id_or_path): + local_path = model_id_or_path return Config.from_file(local_path) diff --git a/modelscope/utils/nlp/utils.py b/modelscope/utils/nlp/utils.py index 13a21480..3295b5d5 100644 --- a/modelscope/utils/nlp/utils.py +++ b/modelscope/utils/nlp/utils.py @@ -1,5 +1,7 @@ import os.path as osp +from modelscope.utils.hub import parse_label_mapping + def import_external_nltk_data(nltk_data_dir, package_name): """import external nltk_data, and extract nltk zip package. @@ -18,3 +20,49 @@ def import_external_nltk_data(nltk_data_dir, package_name): import zipfile with zipfile.ZipFile(filepath) as zf: zf.extractall(osp.join(packagepath)) + + +def parse_labels_in_order(model_dir=None, cfg=None, **kwargs): + """Parse labels information in order. + + This is a helper function, used to get labels information in the correct order. + 1. The kw arguments listed in the method will in the first priority. + 2. Information in the cfg.dataset.train.labels will be used in the second priority (Compatible with old logic). + 3. Information in other files will be used then. + + Args: + model_dir: The model_dir used to call `parse_label_mapping`. + cfg: An optional cfg parsed and modified from the configuration.json. + **kwargs: The user inputs into the method. + + Returns: + The modified kwargs. + """ + label2id = kwargs.pop('label2id', None) + id2label = kwargs.pop('id2label', None) + num_labels = kwargs.pop('num_labels', None) + if label2id is None and id2label is not None: + label2id = {label: id for id, label in id2label.items()} + if label2id is None: + if cfg is not None and cfg.safe_get( + 'dataset.train.labels') is not None: + # An extra logic to parse labels from the dataset area. 
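+            # Label ids follow the order in which the labels are listed in the configuration.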
+ label2id = { + label: idx + for idx, label in enumerate( + cfg.safe_get('dataset.train.labels')) + } + elif model_dir is not None: + label2id = parse_label_mapping(model_dir) + + if num_labels is None and label2id is not None: + num_labels = len(label2id) + if id2label is None and label2id is not None: + id2label = {id: label for label, id in label2id.items()} + if num_labels is not None: + kwargs['num_labels'] = num_labels + if label2id is not None: + kwargs['label2id'] = label2id + if id2label is not None: + kwargs['id2label'] = id2label + return kwargs diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 5284aa43..38071bb8 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -64,8 +64,9 @@ class Registry(object): if group_key not in self._modules: self._modules[group_key] = dict() - if not inspect.isclass(module_cls): - raise TypeError(f'module is not a class type: {type(module_cls)}') + # Some registered module_cls can be function type. + # if not inspect.isclass(module_cls): + # raise TypeError(f'module is not a class type: {type(module_cls)}') if module_name is None: module_name = module_cls.__name__ diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 58b5b1a3..e7a47214 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -770,8 +770,6 @@ class IgnoreKeyFn: self.keys = keys if isinstance(keys, list) else [] def __call__(self, v1output, v2output, key, type): - if key == 'encoder.encoder.layer.0.intermediate.intermediate_act_fn': - print() for _key in self.keys: pattern = re.compile(_key) if key is not None and pattern.fullmatch(key): diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index dff411f6..81a87398 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -4,7 +4,7 @@ import unittest from modelscope.models import Model from modelscope.msdatasets import MsDataset -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.preprocessors.base import Preprocessor from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode from modelscope.utils.test_utils import require_tf, require_torch, test_level @@ -73,7 +73,7 @@ class MsDatasetTest(unittest.TestCase): def test_to_torch_dataset_text(self): model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny' nlp_model = Model.from_pretrained(model_id) - preprocessor = SequenceClassificationPreprocessor( + preprocessor = TextClassificationTransformersPreprocessor( nlp_model.model_dir, first_sequence='premise', second_sequence=None, @@ -95,7 +95,7 @@ class MsDatasetTest(unittest.TestCase): tf.compat.v1.enable_eager_execution() model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny' nlp_model = Model.from_pretrained(model_id) - preprocessor = SequenceClassificationPreprocessor( + preprocessor = TextClassificationTransformersPreprocessor( nlp_model.model_dir, first_sequence='premise', second_sequence=None) diff --git a/tests/pipelines/test_addr_similarity.py b/tests/pipelines/test_addr_similarity.py index 57c47b09..8c1f93c9 100644 --- a/tests/pipelines/test_addr_similarity.py +++ b/tests/pipelines/test_addr_similarity.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from 
modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -22,7 +22,8 @@ class AddrSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - preprocessor = SequenceClassificationPreprocessor(model.model_dir) + preprocessor = TextClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.text_classification, diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py index 549d2cb3..9ed5cd2b 100644 --- a/tests/pipelines/test_deberta_tasks.py +++ b/tests/pipelines/test_deberta_tasks.py @@ -8,7 +8,7 @@ from modelscope.models import Model from modelscope.models.nlp import DebertaV2ForMaskedLM from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import NLPPreprocessor +from modelscope.preprocessors import FillMaskTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -22,7 +22,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id_deberta) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = DebertaV2ForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -38,7 +38,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): # sbert print(self.model_id_deberta) model = Model.from_pretrained(self.model_id_deberta) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( task=Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py index 2f66f516..20c21755 100644 --- a/tests/pipelines/test_faq_question_answering.py +++ b/tests/pipelines/test_faq_question_answering.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForFaqQuestionAnswering from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FaqQuestionAnsweringPipeline -from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor +from modelscope.preprocessors import \ + FaqQuestionAnsweringTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -47,7 +48,7 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained( + preprocessor = FaqQuestionAnsweringTransformersPreprocessor.from_pretrained( 
cache_path) model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) pipeline_ins = FaqQuestionAnsweringPipeline( @@ -58,7 +59,8 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - preprocessor = FaqQuestionAnsweringPreprocessor(model.model_dir) + preprocessor = FaqQuestionAnsweringTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.faq_question_answering, model=model, diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py index 39291e76..6bad602a 100644 --- a/tests/pipelines/test_feature_extraction.py +++ b/tests/pipelines/test_feature_extraction.py @@ -9,7 +9,7 @@ from modelscope.models.nlp import FeatureExtractionModel from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FeatureExtractionPipeline -from modelscope.preprocessors import NLPPreprocessor +from modelscope.preprocessors import FillMaskTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -27,7 +27,7 @@ class FeatureExtractionTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = NLPPreprocessor(cache_path, padding=False) + tokenizer = FillMaskTransformersPreprocessor(cache_path, padding=False) model = FeatureExtractionModel.from_pretrained(self.model_id) pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -43,7 +43,8 @@ class FeatureExtractionTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = NLPPreprocessor(model.model_dir, padding=False) + tokenizer = FillMaskTransformersPreprocessor( + model.model_dir, padding=False) pipeline_ins = pipeline( task=Tasks.feature_extraction, model=model, preprocessor=tokenizer) result = pipeline_ins(input=self.sentence1) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 64833026..bc244826 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -8,7 +8,7 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForMaskedLM, VecoForMaskedLM from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import NLPPreprocessor +from modelscope.preprocessors import FillMaskTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -52,7 +52,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # sbert for language in ['zh']: model_dir = snapshot_download(self.model_id_sbert[language]) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = SbertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -67,7 +67,7 @@ class 
FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model_dir = snapshot_download(self.model_id_veco) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -84,7 +84,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # bert language = 'zh' model_dir = snapshot_download(self.model_id_bert) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = Model.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -102,7 +102,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): for language in ['zh']: print(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language]) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) @@ -118,7 +118,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model = Model.from_pretrained(self.model_id_veco) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_multilingual_named_entity_recognition.py b/tests/pipelines/test_multilingual_named_entity_recognition.py index cb2b32d6..5ed019d9 100644 --- a/tests/pipelines/test_multilingual_named_entity_recognition.py +++ b/tests/pipelines/test_multilingual_named_entity_recognition.py @@ -6,8 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import (NamedEntityRecognitionThaiPipeline, - NamedEntityRecognitionVietPipeline) +from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline from modelscope.preprocessors import NERPreprocessorThai, NERPreprocessorViet from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -36,7 +35,7 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, tokenizer = NERPreprocessorThai(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) - pipeline1 = NamedEntityRecognitionThaiPipeline( + pipeline1 = NamedEntityRecognitionPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.named_entity_recognition, @@ -76,7 +75,7 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, tokenizer = NERPreprocessorViet(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) - pipeline1 = NamedEntityRecognitionVietPipeline( + pipeline1 = NamedEntityRecognitionPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.named_entity_recognition, @@ -103,6 +102,30 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, task=Tasks.named_entity_recognition, model=self.viet_tcrf_model_id) print(pipeline_ins(input=self.viet_sentence)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_tcrf_with_model_name_viet_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, 
model=self.viet_tcrf_model_id) + print( + pipeline_ins( + input=[ + self.viet_sentence, self.viet_sentence[:10], + self.viet_sentence[5:] + ], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_tcrf_with_model_name_viet_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.viet_tcrf_model_id, + padding=False) + print( + pipeline_ins(input=[ + self.viet_sentence, self.viet_sentence[:10], + self.viet_sentence[5:] + ])) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_multilingual_word_segmentation.py b/tests/pipelines/test_multilingual_word_segmentation.py index 25b4b241..da54fe02 100644 --- a/tests/pipelines/test_multilingual_word_segmentation.py +++ b/tests/pipelines/test_multilingual_word_segmentation.py @@ -48,6 +48,23 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.word_segmentation, model=self.model_id) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=self.model_id) + print( + pipeline_ins( + input=[self.sentence, self.sentence[:10], self.sentence[6:]], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=self.model_id, padding=False) + print( + pipeline_ins( + input=[self.sentence, self.sentence[:10], self.sentence[6:]])) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 3317c604..c4bcdfec 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -7,7 +7,8 @@ from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline -from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.preprocessors import \ + TokenClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -24,15 +25,19 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' addr_model_id = 'damo/nlp_structbert_address-parsing_chinese_base' + lstm_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-generic' sentence = '这与温岭市新河镇的一个神秘的传说有关。' sentence_en = 'pizza shovel' sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。' addr = '浙江省杭州市余杭区文一西路969号亲橙里' + addr1 = '浙江省西湖区灵隐隧道' + addr2 = '内蒙古自治区巴彦淖尔市' + ecom = '欧美单 秋季女装时尚百搭休闲修身 亚麻混纺短款 外套西装' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): cache_path = snapshot_download(self.tcrf_model_id) - tokenizer = TokenClassificationPreprocessor(cache_path) + 
tokenizer = TokenClassificationTransformersPreprocessor(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -49,7 +54,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_by_direct_model_download(self): cache_path = snapshot_download(self.lcrf_model_id) - tokenizer = TokenClassificationPreprocessor(cache_path) + tokenizer = TokenClassificationTransformersPreprocessor(cache_path) model = LSTMCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -66,7 +71,8 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_tcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.tcrf_model_id) - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -77,7 +83,8 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): def test_run_addrst_with_model_from_modelhub(self): model = Model.from_pretrained( 'damo/nlp_structbert_address-parsing_chinese_base') - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -90,10 +97,27 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.addr_model_id) print(pipeline_ins(input=self.addr)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_addrst_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.addr_model_id) + print( + pipeline_ins( + input=[self.addr, self.addr1, self.addr2], batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_addrst_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.addr_model_id, + padding=False) + print(pipeline_ins(input=[self.addr, self.addr1, self.addr2])) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.lcrf_model_id) - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -112,18 +136,87 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_lcrf_with_chinese_model_name(self): pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=self.chinese_model_id) print(pipeline_ins(input=self.sentence_zh)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lcrf_with_chinese_model_name_batch_iter(self): + pipeline_ins = 
pipeline( + task=Tasks.named_entity_recognition, + model=self.chinese_model_id, + padding=False) + print( + pipeline_ins(input=[ + self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:] + ])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lcrf_with_chinese_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.chinese_model_id) + print( + pipeline_ins( + input=[ + self.sentence_zh, self.sentence_zh[:20], + self.sentence_zh[10:] + ], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lstm_with_chinese_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.lstm_model_id) + print(pipeline_ins(input=self.sentence_zh)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lstm_with_chinese_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.lstm_model_id, + padding=False) + print( + pipeline_ins(input=[ + self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:] + ])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lstm_with_chinese_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.lstm_model_id) + print( + pipeline_ins( + input=[ + self.sentence_zh, self.sentence_zh[:20], + self.sentence_zh[10:] + ], + batch_size=2)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_english_with_model_name(self): pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=self.english_model_id) print(pipeline_ins(input=self.sentence_en)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_english_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.english_model_id) + print( + pipeline_ins( + input=[self.ecom, self.sentence_zh, self.sentence], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_english_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.english_model_id, + padding=False) + print(pipeline_ins(input=[self.ecom, self.sentence_zh, self.sentence])) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.named_entity_recognition) diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 9e9fefea..94689e96 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -25,7 +25,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = 
snapshot_download(self.model_id) - tokenizer = SequenceClassificationPreprocessor(cache_path) + tokenizer = TextClassificationTransformersPreprocessor(cache_path) model = Model.from_pretrained(cache_path) pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) @@ -38,7 +38,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SequenceClassificationPreprocessor(model.model_dir) + tokenizer = TextClassificationTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py index 038a90f0..5e4b20dc 100644 --- a/tests/pipelines/test_part_of_speech.py +++ b/tests/pipelines/test_part_of_speech.py @@ -7,7 +7,8 @@ from modelscope.models import Model from modelscope.models.nlp import TokenClassificationModel from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TokenClassificationPipeline -from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.preprocessors import \ + TokenClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,7 +20,7 @@ class PartOfSpeechTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = TokenClassificationPreprocessor(cache_path) + tokenizer = TokenClassificationTransformersPreprocessor(cache_path) model = TokenClassificationModel.from_pretrained(cache_path) pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -32,7 +33,8 @@ class PartOfSpeechTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.part_of_speech, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence)) diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py index 561eaf21..b7bbe131 100644 --- a/tests/pipelines/test_relation_extraction.py +++ b/tests/pipelines/test_relation_extraction.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import InformationExtractionModel from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import InformationExtractionPipeline -from modelscope.preprocessors import RelationExtractionPreprocessor +from modelscope.preprocessors import RelationExtractionTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -23,7 +23,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = 
snapshot_download(self.model_id) - tokenizer = RelationExtractionPreprocessor(cache_path) + tokenizer = RelationExtractionTransformersPreprocessor(cache_path) model = InformationExtractionModel.from_pretrained(cache_path) pipeline1 = InformationExtractionPipeline( model, preprocessor=tokenizer) @@ -37,7 +37,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = RelationExtractionPreprocessor(model.model_dir) + tokenizer = RelationExtractionTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.relation_extraction, model=model, diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index e96724a8..4132f965 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -7,7 +7,7 @@ from modelscope.models import Model from modelscope.models.nlp import BertForSentenceEmbedding from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import SentenceEmbeddingPipeline -from modelscope.preprocessors import SentenceEmbeddingPreprocessor +from modelscope.preprocessors import SentenceEmbeddingTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -39,7 +39,7 @@ class SentenceEmbeddingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentenceEmbeddingPreprocessor(cache_path) + tokenizer = SentenceEmbeddingTransformersPreprocessor(cache_path) model = BertForSentenceEmbedding.from_pretrained(cache_path) pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -61,7 +61,7 @@ class SentenceEmbeddingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentenceEmbeddingPreprocessor(model.model_dir) + tokenizer = SentenceEmbeddingTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_embedding, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.inputs)) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 904caea3..486fadfa 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -26,7 +26,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = SequenceClassificationPreprocessor(cache_path) + tokenizer = 
TextClassificationTransformersPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -42,13 +42,35 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SequenceClassificationPreprocessor(model.model_dir) + tokenizer = TextClassificationTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.sentence_similarity, model=self.model_id) + print( + pipeline_ins( + input=[(self.sentence1, self.sentence2), + (self.sentence1[:4], self.sentence2[5:]), + (self.sentence1[2:], self.sentence2[:8])], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.sentence_similarity, model=self.model_id, padding=False) + print( + pipeline_ins(input=[( + self.sentence1, + self.sentence2), (self.sentence1[:4], self.sentence2[5:] + ), (self.sentence1[2:], + self.sentence2[:8])])) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 5c8d4e93..e0f823be 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -7,7 +7,7 @@ from modelscope.models.nlp.task_models.sequence_classification import \ SequenceClassificationModel from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -25,7 +25,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = SequenceClassificationPreprocessor(cache_path) + tokenizer = TextClassificationTransformersPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( self.model_id, num_labels=2) pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) @@ -39,7 +39,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SequenceClassificationPreprocessor(model.model_dir) + tokenizer = TextClassificationTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.text_classification, model=model, diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index 
d90263c4..6ce6a9b3 100644
--- a/tests/pipelines/test_text2text_generation.py
+++ b/tests/pipelines/test_text2text_generation.py
@@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import T5ForConditionalGeneration
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import Text2TextGenerationPipeline
-from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.pipelines.nlp import TextGenerationT5Pipeline
+from modelscope.preprocessors import TextGenerationT5Preprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -24,8 +24,8 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_T5(self):
         cache_path = snapshot_download(self.model_id_generate)
         model = T5ForConditionalGeneration.from_pretrained(cache_path)
-        preprocessor = Text2TextGenerationPreprocessor(cache_path)
-        pipeline1 = Text2TextGenerationPipeline(model, preprocessor)
+        preprocessor = TextGenerationT5Preprocessor(cache_path)
+        pipeline1 = TextGenerationT5Pipeline(model, preprocessor)
         pipeline2 = pipeline(
             Tasks.text2text_generation, model=model, preprocessor=preprocessor)
         print(
@@ -35,7 +35,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_pipeline_with_model_instance(self):
         model = Model.from_pretrained(self.model_id_translate)
-        preprocessor = Text2TextGenerationPreprocessor(model.model_dir)
+        preprocessor = TextGenerationT5Preprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.text2text_generation,
             model=model,
@@ -48,6 +48,28 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
             task=Tasks.text2text_generation, model=self.model_id_translate)
         print(pipeline_ins(self.input_translate))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline_with_model_id_batch(self):
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation, model=self.model_id_translate)
+        inputs = [
+            self.input_translate, self.input_translate[:8],
+            self.input_translate[8:]
+        ]
+        print(pipeline_ins(inputs, batch_size=2))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline_with_model_id_batch_iter(self):
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation,
+            model=self.model_id_translate,
+            padding=False)
+        print(
+            pipeline_ins([
+                self.input_translate, self.input_translate[:8],
+                self.input_translate[8:]
+            ]))
+
     @unittest.skip(
         'only for test cases, there is no default official model yet')
     def test_run_pipeline_without_model_id(self):
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index 5b38e116..d07ddbb8 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -5,7 +5,7 @@ from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextClassificationPipeline
-from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.preprocessors import TextClassificationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -41,7 +41,7 @@ class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skip('nlp model does not support tensor input, skipped')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        preprocessor = SequenceClassificationPreprocessor(
+        preprocessor = TextClassificationTransformersPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         pipeline_ins = pipeline(
             task=Tasks.text_classification,
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index ddb77eeb..1ce6695f 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -6,7 +6,7 @@ from modelscope.models import Model
 from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextGenerationPipeline
-from modelscope.preprocessors import TextGenerationPreprocessor
+from modelscope.preprocessors import TextGenerationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -44,7 +44,7 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def run_pipeline_with_model_instance(self, model_id, input):
         model = Model.from_pretrained(model_id)
-        preprocessor = TextGenerationPreprocessor(
+        preprocessor = TextGenerationTransformersPreprocessor(
             model.model_dir,
             model.tokenizer,
             first_sequence='sentence',
@@ -53,15 +53,38 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
             task=Tasks.text_generation, model=model, preprocessor=preprocessor)
         print(pipeline_ins(input))
 
-    def run_pipeline_with_model_id(self, model_id, input):
-        pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
-        print(pipeline_ins(input))
+    def run_pipeline_with_model_id(self,
+                                   model_id,
+                                   input,
+                                   init_kwargs={},
+                                   run_kwargs={}):
+        pipeline_ins = pipeline(
+            task=Tasks.text_generation, model=model_id, **init_kwargs)
+        print(pipeline_ins(input, **run_kwargs))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_palm_zh_base_with_model_name(self):
         self.run_pipeline_with_model_id(self.palm_model_id_zh_base,
                                         self.palm_input_zh)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_palm_zh_base_with_model_name_batch(self):
+        self.run_pipeline_with_model_id(
+            self.palm_model_id_zh_base, [
+                self.palm_input_zh, self.palm_input_zh[:10],
+                self.palm_input_zh[10:]
+            ],
+            run_kwargs={'batch_size': 2})
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_palm_zh_base_with_model_name_batch_iter(self):
+        self.run_pipeline_with_model_id(
+            self.palm_model_id_zh_base, [
+                self.palm_input_zh, self.palm_input_zh[:10],
+                self.palm_input_zh[10:]
+            ],
+            init_kwargs={'padding': False})
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_palm_en_with_model_name(self):
         self.run_pipeline_with_model_id(self.palm_model_id_en,
@@ -144,11 +167,8 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
                                      self.palm_input_en)):
             cache_path = snapshot_download(model_id)
             model = PalmForTextGeneration.from_pretrained(cache_path)
-            preprocessor = TextGenerationPreprocessor(
-                cache_path,
-                model.tokenizer,
-                first_sequence='sentence',
-                second_sequence=None)
+            preprocessor = TextGenerationTransformersPreprocessor(
+                cache_path, first_sequence='sentence', second_sequence=None)
             pipeline1 = TextGenerationPipeline(model, preprocessor)
             pipeline2 = pipeline(
                 Tasks.text_generation, model=model, preprocessor=preprocessor)
@@ -160,7 +180,7 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_gpt3(self):
         cache_path = snapshot_download(self.gpt3_base_model_id)
         model = GPT3ForTextGeneration(cache_path)
-        preprocessor = TextGenerationPreprocessor(
+        preprocessor = TextGenerationTransformersPreprocessor(
             cache_path,
             model.tokenizer,
             first_sequence='sentence',
@@ -175,7 +195,10 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.text_generation)
-        print(pipeline_ins(self.palm_input_zh))
+        print(
+            pipeline_ins(
+                [self.palm_input_zh, self.palm_input_zh, self.palm_input_zh],
+                batch_size=2))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_bloom(self):
diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py
index 0b43e8b4..01f1887f 100644
--- a/tests/pipelines/test_text_ranking.py
+++ b/tests/pipelines/test_text_ranking.py
@@ -7,7 +7,7 @@ from modelscope.models import Model
 from modelscope.models.nlp import BertForTextRanking
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextRankingPipeline
-from modelscope.preprocessors import TextRankingPreprocessor
+from modelscope.preprocessors import TextRankingTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
 
@@ -32,7 +32,7 @@ class TextRankingTest(unittest.TestCase):
     def test_run_by_direct_model_download(self):
         for model_id in self.models:
             cache_path = snapshot_download(model_id)
-            tokenizer = TextRankingPreprocessor(cache_path)
+            tokenizer = TextRankingTransformersPreprocessor(cache_path)
             model = BertForTextRanking.from_pretrained(cache_path)
             pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer)
             pipeline2 = pipeline(
@@ -46,7 +46,7 @@ class TextRankingTest(unittest.TestCase):
     def test_run_with_model_from_modelhub(self):
         for model_id in self.models:
             model = Model.from_pretrained(model_id)
-            tokenizer = TextRankingPreprocessor(model.model_dir)
+            tokenizer = TextRankingTransformersPreprocessor(model.model_dir)
             pipeline_ins = pipeline(
                 task=Tasks.text_ranking, model=model, preprocessor=tokenizer)
             print(pipeline_ins(input=self.inputs))
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index 6969c0e6..ffaf0155 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -6,7 +6,8 @@ from modelscope.models import Model
 from modelscope.models.nlp import SbertForTokenClassification
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationPipeline
-from modelscope.preprocessors import TokenClassificationPreprocessor
+from modelscope.preprocessors import \
+    TokenClassificationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
@@ -26,7 +27,7 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = TokenClassificationPreprocessor(cache_path)
+        tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
         model = SbertForTokenClassification.from_pretrained(cache_path)
         pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
@@ -38,7 +39,8 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = TokenClassificationPreprocessor(model.model_dir)
+        tokenizer = TokenClassificationTransformersPreprocessor(
+            model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence))
@@ -52,11 +54,24 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
                 'sbert_ws_zh',
                 compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(pipeline_ins(input=self.sentence))
-        with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model,
-                'sbert_ws_en',
-                compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
-            print(pipeline_ins(input=self.sentence_eng))
+        print(pipeline_ins(input=self.sentence_eng))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_batch(self):
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=self.model_id)
+        print(
+            pipeline_ins(
+                input=[self.sentence, self.sentence[:5], self.sentence[5:]],
+                batch_size=2))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_batch_iter(self):
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=self.model_id, padding=False)
+        print(
+            pipeline_ins(
+                input=[self.sentence, self.sentence[:5], self.sentence[5:]]))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index 00789707..f9a52b42 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -6,7 +6,8 @@ from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ZeroShotClassificationPipeline
-from modelscope.preprocessors import ZeroShotClassificationPreprocessor
+from modelscope.preprocessors import \
+    ZeroShotClassificationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
@@ -28,7 +29,7 @@ class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = ZeroShotClassificationPreprocessor(cache_path)
+        tokenizer = ZeroShotClassificationTransformersPreprocessor(cache_path)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
         pipeline1 = ZeroShotClassificationPipeline(
             model, preprocessor=tokenizer)
@@ -53,7 +54,8 @@
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = ZeroShotClassificationPreprocessor(model.model_dir)
+        tokenizer = ZeroShotClassificationTransformersPreprocessor(
+            model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.zero_shot_classification,
             model=model,
diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py
index f9f4d93f..9a31cc91 100644
--- a/tests/preprocessors/test_nlp.py
+++ b/tests/preprocessors/test_nlp.py
@@ -32,81 +32,74 @@ class NLPPreprocessorTest(unittest.TestCase):
             output['attention_mask'],
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 
-    def test_token_classification_tokenize(self):
-        with self.subTest(tokenizer_type='bert'):
-            cfg = dict(
-                type='token-cls-tokenizer',
-                model_dir='bert-base-cased',
-                label2id={
-                    'O': 0,
-                    'B': 1,
-                    'I': 2
-                })
-            preprocessor = build_preprocessor(cfg, Fields.nlp)
-            input = 'Do not meddle in the affairs of wizards, ' \
-                    'for they are subtle and quick to anger.'
-            output = preprocessor(input)
-            self.assertTrue(InputFields.text in output)
-            self.assertEqual(output['input_ids'].tolist()[0], [
-                101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678,
-                1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470,
-                119, 102
-            ])
-            self.assertEqual(output['attention_mask'].tolist()[0], [
-                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                1
-            ])
-            self.assertEqual(output['label_mask'].tolist()[0], [
-                False, True, True, True, False, True, True, True, True, True,
-                False, True, True, True, True, True, True, True, True, True,
-                True, False
-            ])
-            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
-                                                        (7, 13), (14, 16),
-                                                        (17, 20), (21, 28),
-                                                        (29, 31), (32, 39),
-                                                        (39, 40), (41, 44),
-                                                        (45, 49), (50, 53),
-                                                        (54, 60), (61, 64),
-                                                        (65, 70), (71, 73),
-                                                        (74, 79), (79, 80)])
+    def test_token_classification_tokenize_bert(self):
+        cfg = dict(
+            type='token-cls-tokenizer',
+            padding=False,
+            label_all_tokens=False,
+            model_dir='bert-base-cased',
+            label2id={
+                'O': 0,
+                'B': 1,
+                'I': 2
+            })
+        preprocessor = build_preprocessor(cfg, Fields.nlp)
+        input = 'Do not meddle in the affairs of wizards, ' \
+                'for they are subtle and quick to anger.'
+        output = preprocessor(input)
+        self.assertTrue(InputFields.text in output)
+        self.assertEqual(output['input_ids'].tolist()[0], [
+            101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116,
+            117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102
+        ])
+        self.assertEqual(
+            output['attention_mask'].tolist()[0],
+            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+        self.assertEqual(output['label_mask'].tolist()[0], [
+            False, True, True, True, False, True, True, True, True, True,
+            False, True, True, True, True, True, True, True, True, True, True,
+            False
+        ])
+        self.assertEqual(
+            output['offset_mapping'].tolist()[0],
+            [[0, 2], [3, 6], [7, 13], [14, 16], [17, 20], [21, 28], [29, 31],
+             [32, 39], [39, 40], [41, 44], [45, 49], [50, 53], [54, 60],
+             [61, 64], [65, 70], [71, 73], [74, 79], [79, 80]])
 
-        with self.subTest(tokenizer_type='roberta'):
-            cfg = dict(
-                type='token-cls-tokenizer',
-                model_dir='xlm-roberta-base',
-                label2id={
-                    'O': 0,
-                    'B': 1,
-                    'I': 2
-                })
-            preprocessor = build_preprocessor(cfg, Fields.nlp)
-            input = 'Do not meddle in the affairs of wizards, ' \
-                    'for they are subtle and quick to anger.'
-            output = preprocessor(input)
-            self.assertTrue(InputFields.text in output)
-            self.assertEqual(output['input_ids'].tolist()[0], [
-                0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239,
-                99397, 4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56,
-                5, 2
-            ])
-            self.assertEqual(output['attention_mask'].tolist()[0], [
-                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                1, 1, 1, 1, 1
-            ])
-            self.assertEqual(output['label_mask'].tolist()[0], [
-                False, True, True, True, False, True, True, True, False, True,
-                True, False, False, False, True, True, True, True, False, True,
-                True, True, True, False, False, False
-            ])
-            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
-                                                        (7, 13), (14, 16),
-                                                        (17, 20), (21, 28),
-                                                        (29, 31), (32, 40),
-                                                        (41, 44), (45, 49),
-                                                        (50, 53), (54, 60),
-                                                        (61, 64), (65, 70),
-                                                        (71, 73), (74, 80)])
+    def test_token_classification_tokenize_roberta(self):
+        cfg = dict(
+            type='token-cls-tokenizer',
+            padding=False,
+            label_all_tokens=False,
+            model_dir='xlm-roberta-base',
+            label2id={
+                'O': 0,
+                'B': 1,
+                'I': 2
+            })
+        preprocessor = build_preprocessor(cfg, Fields.nlp)
+        input = 'Do not meddle in the affairs of wizards, ' \
+                'for they are subtle and quick to anger.'
+        output = preprocessor(input)
+        self.assertTrue(InputFields.text in output)
+        self.assertEqual(output['input_ids'].tolist()[0], [
+            0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239, 99397,
+            4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56, 5, 2
+        ])
+        self.assertEqual(output['attention_mask'].tolist()[0], [
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1
+        ])
+        self.assertEqual(output['label_mask'].tolist()[0], [
+            False, True, True, True, False, True, True, True, False, True,
+            True, False, False, False, True, True, True, True, False, True,
+            True, True, True, False, False, False
+        ])
+        self.assertEqual(
+            output['offset_mapping'].tolist()[0],
+            [[0, 2], [3, 6], [7, 13], [14, 16], [17, 20], [21, 28], [29, 31],
+             [32, 40], [41, 44], [45, 49], [50, 53], [54, 60], [61, 64],
+             [65, 70], [71, 73], [74, 80]])
 
 
 if __name__ == '__main__':
diff --git a/tests/run.py b/tests/run.py
index 1b252756..e7fae5a2 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -555,7 +555,7 @@ if __name__ == '__main__':
         nargs='*',
         help='Run specified test suites(test suite files list split by space)')
     args = parser.parse_args()
-    set_test_level(args.level)
+    set_test_level(2)
    os.environ['REGRESSION_BASELINE'] = '1'
    logger.info(f'TEST LEVEL: {test_level()}')
    if not args.disable_profile:
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index 061d37d3..f5632b63 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -340,21 +340,16 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         User can train a custom dataset by modifying this piece of code
         and comment the @unittest.skip.
         """
-        from datasets import load_dataset
         langs = ['en']
         langs_eval = ['en']
         train_datasets = []
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = False
         for lang in langs:
             train_datasets.append(
-                load_dataset('xnli', lang, split='train', download_config=dc))
+                MsDataset.load('xnli', subset_name=lang, split='train'))
         eval_datasets = []
         for lang in langs_eval:
             eval_datasets.append(
-                load_dataset(
-                    'xnli', lang, split='validation', download_config=dc))
+                MsDataset.load('xnli', subset_name=lang, split='validation'))
         train_len = sum([len(dataset) for dataset in train_datasets])
         labels = ['0', '1', '2']
 
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index a92cee7b..a1480d38 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -91,8 +91,13 @@ class TestFinetuneTokenClassification(unittest.TestCase):
                 'label': 'labels',
             }
         }
-        cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
+        cfg['preprocessor'] = {
+            'type': 'token-cls-tokenizer',
+            'padding': 'max_length'
+        }
         cfg.train.max_epochs = 2
+        cfg.train.dataloader.workers_per_gpu = 0
+        cfg.evaluation.dataloader.workers_per_gpu = 0
         cfg.train.lr_scheduler = {
             'type': 'LinearLR',
             'start_factor': 1.0,
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index f1d9e414..5e9850a7 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -119,6 +119,85 @@ class TestTrainerWithNlp(unittest.TestCase):
             checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
         self.assertTrue(Metrics.accuracy in eval_results)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_save_best_ckpt(self):
+
+        class MockTrainer(EpochBasedTrainer):
+
+            def evaluation_loop(self, data_loader, metric_classes):
+                return {'accuracy': 10 + (-1)**self.iter * 1 * self.iter}
+
+        from modelscope.utils.regress_test_utils import MsRegressTool
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+        cfg: Config = read_config(model_id)
+        cfg.train.max_epochs = 10
+        cfg.preprocessor.first_sequence = 'sentence1'
+        cfg.preprocessor.second_sequence = 'sentence2'
+        cfg.preprocessor.label = 'label'
+        cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1}
+        cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1}
+        cfg.train.dataloader.batch_size_per_gpu = 2
+        cfg.train.hooks = [{
+            'type': 'BestCkptSaverHook',
+            'interval': 1,
+            'by_epoch': False,
+            'metric_key': 'accuracy',
+            'max_checkpoint_num': 4,
+        }, {
+            'type': 'TextLoggerHook',
+            'interval': 1
+        }, {
+            'type': 'IterTimerHook'
+        }, {
+            'type': 'EvaluationHook',
+            'by_epoch': False,
+            'interval': 1
+        }]
+        cfg.train.work_dir = self.tmp_dir
+        cfg_file = os.path.join(self.tmp_dir, 'config.json')
+        cfg.dump(cfg_file)
+        dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
+        dataset = dataset.to_hf_dataset().select(range(4))
+        kwargs = dict(
+            model=model_id,
+            train_dataset=dataset,
+            eval_dataset=dataset,
+            cfg_file=cfg_file)
+
+        regress_tool = MsRegressTool(baseline=True)
+        trainer: MockTrainer = MockTrainer(**kwargs)
+
+        def lazy_stop_callback():
+            from modelscope.trainers.hooks.hook import Hook, Priority
+
+            class EarlyStopHook(Hook):
+                PRIORITY = Priority.VERY_LOW
+
+                def after_iter(self, trainer):
+                    if trainer.iter == 10:
+                        raise MsRegressTool.EarlyStopError('Test finished.')
+
+            if 'EarlyStopHook' not in [
+                    hook.__class__.__name__ for hook in trainer.hooks
+            ]:
+                trainer.register_hook(EarlyStopHook())
+
+        with regress_tool.monitor_ms_train(
+                trainer,
+                'trainer_continue_train',
+                level='strict',
+                lazy_stop_callback=lazy_stop_callback):
+            trainer.train()
+
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in [22, 24, 26, 28]:
+            self.assertTrue(
+                any([
+                    f'accuracy{i}.pth' in filename
+                    for filename in results_files
+                ]))
+
     @unittest.skip('skip for now before test is re-configured')
     def test_trainer_with_configured_datasets(self):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py
index 0243053e..2db61637 100644
--- a/tests/utils/test_ast.py
+++ b/tests/utils/test_ast.py
@@ -40,12 +40,18 @@ class AstScaningTest(unittest.TestCase):
         self.assertIsInstance(imports, dict)
         self.assertIsInstance(from_imports, dict)
         self.assertIsInstance(decorators, list)
-        self.assertListEqual(list(set(imports.keys()) - set(['torch'])), [])
+        self.assertListEqual(
+            list(set(imports.keys()) - set(['torch', 'os'])), [])
         self.assertEqual(len(from_imports.keys()), 10)
         self.assertTrue(from_imports['modelscope.metainfo'] is not None)
         self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines'])
-        self.assertEqual(decorators,
-                         [('PIPELINES', 'text-generation', 'text-generation')])
+        self.assertEqual(
+            decorators,
+            [('PIPELINES', 'text-generation', 'text-generation'),
+             ('PIPELINES', 'text2text-generation', 'translation_en_to_de'),
+             ('PIPELINES', 'text2text-generation', 'translation_en_to_ro'),
+             ('PIPELINES', 'text2text-generation', 'translation_en_to_fr'),
+             ('PIPELINES', 'text2text-generation', 'text2text-generation')])
 
     def test_files_scaning_method(self):
         fileScaner = FilesAstScaning()