diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 5870ebe3..a1cf5e06 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -91,17 +91,22 @@ class TaskModels(object): text_classification = 'text-classification' token_classification = 'token-classification' information_extraction = 'information-extraction' + fill_mask = 'fill-mask' + feature_extraction = 'feature-extraction' class Heads(object): # nlp heads + + # text cls text_classification = 'text-classification' - # mlm + # fill mask + fill_mask = 'fill-mask' bert_mlm = 'bert-mlm' - # roberta mlm roberta_mlm = 'roberta-mlm' # token cls token_classification = 'token-classification' + # extraction information_extraction = 'information-extraction' @@ -203,6 +208,7 @@ class Pipelines(object): passage_ranking = 'passage-ranking' relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' @@ -306,6 +312,7 @@ class Preprocessors(object): table_question_answering_preprocessor = 'table-question-answering-preprocessor' re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index 33f111a8..7a8e28f4 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -37,13 +37,16 @@ def build_backbone(cfg: ConfigDict, cfg, BACKBONES, group_key=field, default_args=default_args) -def build_head(cfg: ConfigDict, default_args: dict = None): +def build_head(cfg: ConfigDict, + group_key: str = None, + default_args: dict = None): """ build head given config dict Args: cfg (:obj:`ConfigDict`): config dict for head object. default_args (dict, optional): Default initialization arguments. 
""" - + if group_key is None: + group_key = cfg[TYPE_NAME] return build_from_cfg( - cfg, HEADS, group_key=cfg[TYPE_NAME], default_args=default_args) + cfg, HEADS, group_key=group_key, default_args=default_args) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 152a32dc..8ef96365 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -6,7 +6,6 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .backbones import SbertModel from .bart_for_text_error_correction import BartForTextErrorCorrection - from .bert_for_sequence_classification import BertForSequenceClassification from .bert_for_document_segmentation import BertForDocumentSegmentation from .csanmt_for_translation import CsanmtForTranslation from .heads import SequenceClassificationHead @@ -20,12 +19,15 @@ if TYPE_CHECKING: from .palm_v2 import PalmForTextGeneration from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering from .star_text_to_sql import StarForTextToSql - from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification + from .sequence_classification import (VecoForSequenceClassification, + SbertForSequenceClassification, + BertForSequenceClassification) from .space import SpaceForDialogIntent from .space import SpaceForDialogModeling from .space import SpaceForDialogStateTracking from .table_question_answering import TableQuestionAnswering - from .task_models import (InformationExtractionModel, + from .task_models import (FeatureExtractionModel, + InformationExtractionModel, SequenceClassificationModel, SingleBackboneTaskModelBase, TokenClassificationModel) @@ -37,7 +39,6 @@ else: _import_structure = { 'backbones': ['SbertModel'], 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'bert_for_sequence_classification': ['BertForSequenceClassification'], 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], 'csanmt_for_translation': ['CsanmtForTranslation'], 'heads': ['SequenceClassificationHead'], @@ -54,15 +55,20 @@ else: 'palm_v2': ['PalmForTextGeneration'], 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], 'star_text_to_sql': ['StarForTextToSql'], - 'sequence_classification': - ['VecoForSequenceClassification', 'SbertForSequenceClassification'], + 'sequence_classification': [ + 'VecoForSequenceClassification', 'SbertForSequenceClassification', + 'BertForSequenceClassification' + ], 'space': [ 'SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDialogStateTracking' ], 'task_models': [ - 'InformationExtractionModel', 'SequenceClassificationModel', - 'SingleBackboneTaskModelBase', 'TokenClassificationModel' + 'FeatureExtractionModel', + 'InformationExtractionModel', + 'SequenceClassificationModel', + 'SingleBackboneTaskModelBase', + 'TokenClassificationModel', ], 'token_classification': ['SbertForTokenClassification'], 'table_question_answering': ['TableQuestionAnswering'], diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py new file mode 100644 index 00000000..aa513944 --- /dev/null +++ b/modelscope/models/nlp/backbones/bert.py @@ -0,0 +1,7 @@ +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.models.nlp.bert import BertModel +from modelscope.utils.constant import Fields + +BACKBONES.register_module( + group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel) diff --git 
a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py new file mode 100644 index 00000000..705d9519 --- /dev/null +++ b/modelscope/models/nlp/bert/__init__.py @@ -0,0 +1,60 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .modeling_bert import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLayer, + BertLMHeadModel, + BertModel, + BertPreTrainedModel, + load_tf_weights_in_bert, + ) + + from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig + from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer + from .tokenization_bert_fast import BertTokenizerFast + +else: + _import_structure = { + 'configuration_bert': + ['BERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'BertConfig', 'BertOnnxConfig'], + 'tokenization_bert': + ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'], + } + _import_structure['tokenization_bert_fast'] = ['BertTokenizerFast'] + + _import_structure['modeling_bert'] = [ + 'BERT_PRETRAINED_MODEL_ARCHIVE_LIST', + 'BertForMaskedLM', + 'BertForMultipleChoice', + 'BertForNextSentencePrediction', + 'BertForPreTraining', + 'BertForQuestionAnswering', + 'BertForSequenceClassification', + 'BertForTokenClassification', + 'BertLayer', + 'BertLMHeadModel', + 'BertModel', + 'BertPreTrainedModel', + 'load_tf_weights_in_bert', + ] + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration_bert.py new file mode 100644 index 00000000..2c9293ec --- /dev/null +++ b/modelscope/models/nlp/bert/configuration_bert.py @@ -0,0 +1,162 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration """ +from collections import OrderedDict +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a + [`BertModel`] or a [`TFBertModel`]. It is used to instantiate a BERT model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to + control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different + tokens that can be represented by the `inputs_ids` passed when + calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the + Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) + layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the + encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and + `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the + embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or + 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling + [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, + `"relative_key"`, `"relative_key_query"`. For positional embeddings + use `"absolute"`. For more information on `"relative_key"`, please + refer to [Self-Attention with Relative Position Representations + (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more + information on `"relative_key_query"`, please refer to *Method 4* in + [Improve Transformer Models with Better Relative Position Embeddings + (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = 'bert' + + def __init__(self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type='absolute', + use_cache=True, + classifier_dropout=None, + **kwargs): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class BertOnnxConfig(OnnxConfig): + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict([ + ('input_ids', { + 0: 'batch', + 1: 'sequence' + }), + ('attention_mask', { + 0: 'batch', + 1: 'sequence' + }), + ('token_type_ids', { + 0: 'batch', + 1: 'sequence' + }), + ]) diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py new file mode 100755 index 00000000..f8fd5994 --- /dev/null +++ b/modelscope/models/nlp/bert/modeling_bert.py @@ -0,0 +1,2040 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. 
""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.models.base import TorchModel +from modelscope.utils.logger import get_logger +from .configuration_bert import BertConfig + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'BertConfig' + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f'Converting TensorFlow checkpoint from {tf_path}') + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f'Loading TF weight {name} with shape {shape}') + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in [ + 'adam_v', 'adam_m', 'AdamWeightDecayOptimizer', + 'AdamWeightDecayOptimizer_1', 'global_step' + ] for n in name): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + scope_names = re.split(r'_(\d+)', m_name) + else: + scope_names = [m_name] + if scope_names[0] == 'kernel' or scope_names[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif scope_names[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError( + f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' + ) + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f'Initialize PyTorch weight {name}') + pointer.data = torch.from_numpy(array) 
+ return model + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model + # variable name and be able to load any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and + # exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor + # where it is all zeros, which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing + # token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, 
self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all + # cross attention key/value_states. Further calls to cross_attention + # layer can then reuse all cross-attention key/value_states (first + # "if" case) if uni-directional self-attention (decoder) save + # Tuple(torch.Tensor, torch.Tensor) of all previous decoder + # key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected + # key/value_states (third "elif" case) if encoder bi-directional + # self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
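
Concretely, the computation in the lines that follow reduces to ordinary scaled dot-product attention. A standalone sketch with illustrative shapes; the relative-position terms and the additive attention mask applied further down are omitted.

```python
import math

import torch

batch, heads, seq, head_dim = 2, 12, 8, 64   # illustrative shapes
query = torch.randn(batch, heads, seq, head_dim)
key = torch.randn(batch, heads, seq, head_dim)
value = torch.randn(batch, heads, seq, head_dim)

scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_dim)
probs = torch.softmax(scores, dim=-1)   # (batch, heads, seq, seq)
context = torch.matmul(probs, value)    # (batch, heads, seq, head_dim)
print(context.shape)
```
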
+ attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention( + config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states 
= self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = BertAttention( + config, position_embedding_type='absolute') + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' + f'with cross-attention layers by setting `config.add_cross_attention=True`' + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
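
Concretely, the pooling performed in the lines below amounts to taking the `[CLS]` position, projecting it through a linear layer, and squashing it with `tanh`. A standalone sketch with illustrative shapes:

```python
import torch
from torch import nn

batch, seq, hidden_size = 2, 16, 768                     # illustrative shapes
hidden_states = torch.randn(batch, seq, hidden_size)

dense = nn.Linear(hidden_size, hidden_size)
pooled_output = torch.tanh(dense(hidden_states[:, 0]))   # first-token hidden state -> dense -> tanh
print(pooled_output.shape)                               # torch.Size([2, 768])
```
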
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
+ """ + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, + `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the + next sequence prediction (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, + sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each + vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, + 2)`): + Prediction scores of the next sequence prediction (classification) + head (scores of True/False continuation before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + + one for the output of each layer) of shape `(batch_size, + sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the + parameters of the model. 
+ Initializing with a config file does not load the weights associated + with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. +""" + + +@add_start_docstrings( + 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a + decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam + Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`. To be used in a + Seq2Seq model, the model needs to initialized with both `is_decoder` + argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` + is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): + config = BertConfig(**config) + model = cls(config, add_pooling_layer) + return model + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked + language modeling` head and a `next sentence prediction (classification)` + head. 
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. Indices + should be in `[-100, 0, ..., config.vocab_size]` (see + `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with + labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, + *optional*): + Labels for computing the next sequence prediction + (classification) loss. Input should be a sequence pair (see + `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, + BERT_START_DOCSTRING) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + 'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`' + ) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured + as a decoder. 
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices + of the encoder input. This mask is used in the cross-attention + if the model is configured as a decoder. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the left-to-right language modeling loss + (next word prediction). Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with + indices set to `-100` are ignored (masked), the loss is only + computed for the tokens with labels n `[0, ..., + config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, + embed_size_per_head)`): + Contains precomputed key and value hidden states of the + attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input + only the last `decoder_input_ids` (those that don't have their + past key value states given to this model) of shape + `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are + returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, BertLMHeadModel, + BertConfig >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> config.is_decoder = True + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def 
prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'past_key_values': past + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. """, + BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. 
Indices + should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` + docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError('The PAD token should be defined for generation') + + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) + loss. Input should be a sequence pair (see `input_ids` docstring). + Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, + BertForNextSentencePrediction >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if 'next_sentence_label' in kwargs: + warnings.warn( + 'The `next_sentence_label` argument is deprecated, use `labels` instead.', + FutureWarning, + ) + labels = kwargs.pop('next_sentence_label') + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct( + seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores, ) + outputs[2:] + return ((next_sentence_loss, ) + + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. + """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., config.num_labels - 1]`. If + `config.num_labels == 1` a regression loss is computed (Mean-Square + loss), If `config.num_labels > 1` a classification loss is computed + (Cross-Entropy). 
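The ported `BertForSequenceClassification` keeps the upstream single-sequence classification interface, so it can be smoke-tested like the other heads in this file. A minimal sketch, assuming the port keeps the `transformers`-style `from_pretrained` and that `bert-base-uncased` is only a stand-in checkpoint (its classifier weights will be freshly initialized, so the logits are not meaningful):

```python
>>> import torch
>>> from transformers import BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

>>> inputs = tokenizer('Hello, my dog is cute', return_tensors='pt')
>>> outputs = model(**inputs, labels=torch.tensor([0]))

>>> outputs.loss          # scalar cross-entropy loss
>>> outputs.logits.shape  # torch.Size([1, 2])
```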
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer + on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format( + 'batch_size, num_choices, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. + Indices should be in `[0, ..., num_choices-1]` where `num_choices` + is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the token classification loss. Indices should + be in `[0, ..., config.num_labels - 1]`. 
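`BertForTokenClassification` mirrors the upstream per-token head. A hedged usage sketch (the checkpoint name and `num_labels` are illustrative, and the classifier layer is randomly initialized when loading a plain BERT checkpoint):

```python
>>> import torch
>>> from transformers import BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=9)

>>> inputs = tokenizer('ModelScope is developed by Alibaba', return_tensors='pt')
>>> outputs = model(**inputs)

>>> predictions = torch.argmax(outputs.logits, dim=-1)  # shape: (batch_size, sequence_length)
```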
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive + question-answering tasks like SQuAD (a linear layers on top of the + hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, + *optional*): + Labels for position (index) of the start of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (`sequence_length`). Position outside of + the sequence are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (`sequence_length`). Position outside of + the sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py deleted file mode 100644 index 2b1a3b3b..00000000 --- a/modelscope/models/nlp/bert_for_sequence_classification.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict - -import json -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['BertForSequenceClassification'] - - -@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) -class BertForSequenceClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - # Model.__init__(self, model_dir, model_cls, first_sequence, *args, **kwargs) - # Predictor.__init__(self, *args, **kwargs) - """initialize the sequence classification model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - - super().__init__(model_dir, *args, **kwargs) - import torch - from easynlp.appzoo import SequenceClassification - from easynlp.core.predictor import get_model_predictor - self.model = get_model_predictor( - model_dir=self.model_dir, - model_cls=SequenceClassification, - input_keys=[('input_ids', torch.LongTensor), - ('attention_mask', torch.LongTensor), - ('token_type_ids', torch.LongTensor)], - output_keys=['predictions', 'probabilities', 'logits']) - - self.label_path = os.path.join(self.model_dir, 'label_mapping.json') - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = {idx: name for name, idx in self.label_mapping.items()} - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - return self.model.predict(input) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - # N x num_classes - probs = inputs['probabilities'] - result = { - 'probs': probs, - } - - return result diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py index 664fc6c6..830210ed 100644 --- a/modelscope/models/nlp/deberta_v2/__init__.py +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -21,21 +21,12 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule -_import_structure = { - 'configuration_deberta_v2': [ - 'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config', - 'DebertaV2OnnxConfig' - ], - 'tokenization_deberta_v2': ['DebertaV2Tokenizer'], -} - if TYPE_CHECKING: from .configuration_deberta_v2 import DebertaV2Config from .tokenization_deberta_v2 import DebertaV2Tokenizer from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast from .modeling_deberta_v2 import ( - DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST, DebertaV2ForMaskedLM, DebertaV2ForMultipleChoice, DebertaV2ForQuestionAnswering, @@ -55,7 +46,6 @@ else: 'DebertaV2TokenizerFast' ] _import_structure['modeling_deberta_v2'] = [ - 'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST', 'DebertaV2ForMaskedLM', 'DebertaV2ForMultipleChoice', 'DebertaV2ForQuestionAnswering', diff --git a/modelscope/models/nlp/heads/fill_mask_head.py b/modelscope/models/nlp/heads/fill_mask_head.py new file mode 100644 index 00000000..6b0c5e05 --- /dev/null +++ b/modelscope/models/nlp/heads/fill_mask_head.py @@ -0,0 +1,101 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.activations import ACT2FN
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
+class BertFillMaskHead(TorchHead):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.cls = BertOnlyMLMHead(self.config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.cls(sequence_output)
+        return {OutputKeys.LOGITS: prediction_scores}
+
+    def compute_loss(self, outputs: Dict[str, torch.Tensor],
+                     labels) -> Dict[str, torch.Tensor]:
+        loss_fct = CrossEntropyLoss()  # -100 index = padding token
+        masked_lm_loss = loss_fct(
+            outputs[OutputKeys.LOGITS].view(-1, self.config.vocab_size),
+            labels.view(-1))
+        return {OutputKeys.LOSS: masked_lm_loss}
+
+
+class BertPredictionHeadTransform(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py
index fb54637b..e477533f 100644
--- a/modelscope/models/nlp/heads/torch_pretrain_head.py
+++ b/modelscope/models/nlp/heads/torch_pretrain_head.py
@@ -11,7 +11,7 @@ from modelscope.models.builder import HEADS
 from modelscope.utils.constant import Tasks
 
 
-@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
+# @HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
 class BertMLMHead(BertOnlyMLMHead, TorchHead):
 
     def compute_loss(self, outputs: Dict[str, torch.Tensor],
diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py
index 514a04cd..b7a890c1 100644
--- a/modelscope/models/nlp/masked_language.py
+++ b/modelscope/models/nlp/masked_language.py
@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc.
and its affiliates. - -from transformers import BertForMaskedLM as BertForMaskedLMTransformer - from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import \ + BertForMaskedLM as BertForMaskedLMTransformer from modelscope.models.nlp.deberta_v2 import \ DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer from modelscope.models.nlp.structbert import SbertForMaskedLM diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py index 62198ed2..8b0c59b2 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py @@ -41,12 +41,9 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: input_tensor = { - 'input_ids': - torch.tensor(input['input_ids']).unsqueeze(0), - 'attention_mask': - torch.tensor(input['attention_mask']).unsqueeze(0), - 'label_mask': - torch.tensor(input['label_mask'], dtype=torch.bool).unsqueeze(0) + 'input_ids': input['input_ids'], + 'attention_mask': input['attention_mask'], + 'label_mask': input['label_mask'], } output = { 'text': input['text'], diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index a8930e68..156c615c 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -7,6 +7,7 @@ from torch import nn from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertPreTrainedModel from modelscope.models.nlp.structbert import SbertPreTrainedModel from modelscope.models.nlp.veco import \ VecoForSequenceClassification as VecoForSequenceClassificationTransform @@ -16,7 +17,10 @@ from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) -__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification'] +__all__ = [ + 'SbertForSequenceClassification', 'VecoForSequenceClassification', + 'BertForSequenceClassification' +] class SequenceClassificationBase(TorchModel): @@ -132,7 +136,7 @@ class SbertForSequenceClassification(SequenceClassificationBase, label2id = parse_label_mapping(model_dir) if label2id is not None and len(label2id) > 0: num_labels = len(label2id) - + cls.id2label = {id: label for label, id in label2id.items()} model_args = {} if num_labels is None else {'num_labels': num_labels} return super(SbertPreTrainedModel, SbertForSequenceClassification).from_pretrained( @@ -206,3 +210,78 @@ class VecoForSequenceClassification(TorchModel, pretrained_model_name_or_path=kwargs.get('model_dir'), model_dir=kwargs.get('model_dir'), **model_args) + + +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.bert) +@MODELS.register_module(Tasks.nli, module_name=Models.bert) +@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) +class BertForSequenceClassification(SequenceClassificationBase, + BertPreTrainedModel): + """Bert sequence classification model. + + Inherited from SequenceClassificationBase. 
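Because this class is registered for several classification tasks, it is normally constructed through the ModelScope registry rather than instantiated directly. A rough sketch, assuming a local `model_dir` containing a BERT checkpoint together with `configuration.json` and a `label_mapping.json` (the path is purely illustrative):

```python
from modelscope.models import Model

# from_pretrained routes through _instantiate, which uses parse_label_mapping
# to derive num_labels from label_mapping.json before the weights are loaded.
model = Model.from_pretrained('/path/to/bert-sentiment-model-dir')
print(type(model).__name__)  # expected: BertForSequenceClassification
```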
+ """ + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + BertForSequenceClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .bert import BertModel + return BertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(BertPreTrainedModel, + BertForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 7493ba74..90f22aa1 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -5,6 +5,8 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .information_extraction import InformationExtractionModel + from .feature_extraction import FeatureExtractionModel + from .fill_mask import FillMaskModel from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel @@ -12,6 +14,8 @@ if TYPE_CHECKING: else: _import_structure = { 'information_extraction': ['InformationExtractionModel'], + 'feature_extraction': ['FeatureExtractionModel'], + 'fill_mask': ['FillMaskModel'], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py new file mode 100644 index 00000000..069c37aa --- /dev/null +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -0,0 +1,43 @@ +from typing import Any, Dict + +import numpy as np + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import 
BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FeatureExtractionModel']
+
+
+@MODELS.register_module(
+    Tasks.feature_extraction, module_name=TaskModels.feature_extraction)
+class FeatureExtractionModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the feature extraction model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        self.build_backbone(self.backbone_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+
+        # backbone do not need labels, only head need for loss compute
+        labels = input.pop(OutputKeys.LABELS, None)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+
+        return {OutputKeys.TEXT_EMBEDDING: sequence_output}
diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py
new file mode 100644
index 00000000..f7ef1cc2
--- /dev/null
+++ b/modelscope/models/nlp/task_models/fill_mask.py
@@ -0,0 +1,47 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FillMaskModel']
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=TaskModels.fill_mask)
+class FillMaskModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the fill mask model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
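Both task models here (`FeatureExtractionModel` above and `FillMaskModel`) lean on `SingleBackboneTaskModelBase`, which now pulls `backbone` and `head` sections out of the model configuration and resolves the head by task group key (see the `task_model.py` changes further down). A hypothetical configuration fragment; the key names follow those lookups, while the concrete values are illustrative only:

```python
# Hypothetical model section of configuration.json for a fill-mask task model.
# The backbone entry is resolved through the BACKBONES registry and the head
# entry through HEADS, with the task name used as the group key.
model_cfg = {
    'type': 'fill-mask',                             # TaskModels.fill_mask
    'backbone': {'type': 'bert', 'prefix': 'bert'},  # registered BertModel backbone
    'head': {'type': 'bert-mlm'},                    # resolves to BertFillMaskHead
}
```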
+ """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) + + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(sequence_output) + + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) + outputs.update(loss) + outputs[OutputKeys.INPUT_IDS] = input[OutputKeys.INPUT_IDS] + return outputs diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index 4792d07c..0a7d5a47 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -26,21 +26,12 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): """ super().__init__(model_dir, *args, **kwargs) - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: outputs = super().forward(input) sequence_output, pooled_output = self.extract_backbone_outputs(outputs) outputs = self.head.forward(sequence_output, input['text'], input['offsets']) return {OutputKeys.SPO_LIST: outputs} - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - return sequence_output, pooled_output diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 43a96327..1f5e46c3 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -11,10 +11,14 @@ from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping __all__ = ['SequenceClassificationModel'] +@MODELS.register_module( + Tasks.sentence_similarity, module_name=TaskModels.text_classification) +@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification) @MODELS.register_module( Tasks.sentiment_classification, module_name=TaskModels.text_classification) @MODELS.register_module( @@ -31,49 +35,36 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase): if 'base_model_prefix' in kwargs: self._base_model_prefix = kwargs['base_model_prefix'] - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - # get the num_labels from label_mapping.json self.id2label = {} - self.label_path = os.path.join(model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - head_cfg['num_labels'] = len(self.label_mapping) + # get the 
num_labels + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + self.id2label = {id: label for label, id in label2id.items()} + self.head_cfg['num_labels'] = num_labels - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + outputs = super().forward(input) sequence_output, pooled_output = self.extract_backbone_outputs(outputs) outputs = self.head.forward(pooled_output) - if 'labels' in input: - loss = self.compute_loss(outputs, input['labels']) + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) outputs.update(loss) return outputs def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - if hasattr(self.backbone, 'extract_pooled_outputs'): - pooled_output = self.backbone.extract_pooled_outputs(outputs) - return sequence_output, pooled_output - - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss - def postprocess(self, input, **kwargs): logits = self.extract_logits(input) probs = logits.softmax(-1).numpy() diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index e93dd5f6..0b43044f 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -74,7 +74,7 @@ class BaseTaskModel(TorchModel, ABC): def __init__(self, model_dir: str, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) - self.cfg = ConfigDict(kwargs) + self.config = ConfigDict(kwargs) def __repr__(self): # only log backbone and head name @@ -397,6 +397,9 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def __init__(self, model_dir: str, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) + self.backbone_cfg = self.config.get('backbone', None) + assert self.backbone_cfg is not None + self.head_cfg = self.config.get('head', None) def build_backbone(self, cfg): if 'prefix' in cfg: @@ -405,9 +408,13 @@ class SingleBackboneTaskModelBase(BaseTaskModel): setattr(self, cfg['prefix'], backbone) def build_head(self, cfg): + if cfg is None: + raise ValueError( + 'Head config is missing, check if this was a backbone-only model' + ) if 'prefix' in cfg: self._head_prefix = cfg['prefix'] - head = build_head(cfg) + head = build_head(cfg, group_key=self.group_key) setattr(self, self._head_prefix, head) return head @@ -431,8 +438,18 @@ class SingleBackboneTaskModelBase(BaseTaskModel): outputs = self.backbone.forward(**input) return outputs - def compute_loss(self, outputs: Dict[str, Any], labels): - raise NotImplementedError() + def compute_loss(self, outputs, labels): + loss = self.head.compute_loss(outputs, labels) + return loss + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + if 
hasattr(self.backbone, 'extract_pooled_outputs'): + pooled_output = self.backbone.extract_pooled_outputs(outputs) + return sequence_output, pooled_output class EncoderDecoderTaskModelBase(BaseTaskModel): @@ -453,7 +470,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): def build_encoder(self): encoder = build_backbone( - self.cfg, + self.config, type_name=self._encoder_key_in_cfg, task_name=Tasks.backbone) setattr(self, self._encoder_prefix, encoder) @@ -461,7 +478,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): def build_decoder(self): decoder = build_backbone( - self.cfg, + self.config, type_name=self._decoder_key_in_cfg, task_name=Tasks.backbone) setattr(self, self._decoder_prefix, decoder) diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 5c22098f..f3930182 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -31,9 +31,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if 'base_model_prefix' in kwargs: self._base_model_prefix = kwargs['base_model_prefix'] - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - # get the num_labels num_labels = kwargs.get('num_labels') if num_labels is None: @@ -41,12 +38,12 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if label2id is not None and len(label2id) > 0: num_labels = len(label2id) self.id2label = {id: label for label, id in label2id.items()} - head_cfg['num_labels'] = num_labels + self.head_cfg['num_labels'] = num_labels - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: labels = None if OutputKeys.LABEL in input: labels = input.pop(OutputKeys.LABEL) @@ -71,10 +68,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): sequence_output = self.backbone.extract_sequence_outputs(outputs) return sequence_output, pooled_output - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss - def postprocess(self, input, **kwargs): logits = self.extract_logits(input) pred = torch.argmax(logits[0], dim=-1) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index c3723a61..c63e8037 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -10,12 +10,13 @@ from torch import nn from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertPreTrainedModel +from modelscope.models.nlp.structbert import SbertPreTrainedModel from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) -from .structbert import SbertPreTrainedModel __all__ = ['SbertForTokenClassification'] @@ -171,3 +172,49 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): pretrained_model_name_or_path=kwargs.get('model_dir'), model_dir=kwargs.get('model_dir'), **model_args) + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) 
+@MODELS.register_module(Tasks.token_classification, module_name=Models.bert)
+class BertForTokenClassification(TokenClassification, BertPreTrainedModel):
+    """Bert token classification model.
+
+    Inherited from TokenClassification.
+    """
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir):
+        if hasattr(config, 'base_model_prefix'):
+            BertForTokenClassification.base_model_prefix = config.base_model_prefix
+        super().__init__(config, model_dir)
+
+    def build_base_model(self):
+        from .bert import BertModel
+        return BertModel(self.config, add_pooling_layer=True)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **kwargs)
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index ce9e8d07..357afd07 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -417,6 +417,22 @@ TASK_OUTPUTS = {
     # }
     Tasks.fill_mask: [OutputKeys.TEXT],
 
+    # feature extraction result for single sample
+    # {
+    #     "text_embedding": [[
+    #         [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04],
+    #         [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01],
+    #         [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05]
+    #     ],
+    #     [
+    #         [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05],
+    #         [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05],
+    #         [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05]
+    #     ]
+    #     ]
+    # }
+    Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING],
+
     # (Deprecated) dialog intent prediction result for single sample
     # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05,
     #                                  1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04,
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 51d50d51..4f6873b0 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -52,8 +52,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
         'damo/cv_vit_object-detection_coco'),
     Tasks.image_denoising: (Pipelines.image_denoise,
                             'damo/cv_nafnet_image-denoise_sidd'),
-    Tasks.text_classification: (Pipelines.sentiment_analysis,
-                                'damo/bert-base-sst2'),
+    Tasks.text_classification:
+    (Pipelines.sentiment_classification,
+     'damo/nlp_structbert_sentiment-classification_chinese-base'),
     Tasks.text_generation: (Pipelines.text_generation,
                             'damo/nlp_palm2.0_text-generation_chinese-base'),
     Tasks.zero_shot_classification:
@@ -80,6 +81,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.ocr_detection: (Pipelines.ocr_detection,
                           'damo/cv_resnet18_ocr-detection-line-level_damo'),
     Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
+    Tasks.feature_extraction: (Pipelines.feature_extraction,
+                               'damo/pert_feature-extraction_base-test'),
     Tasks.action_recognition: (Pipelines.action_recognition,
                                'damo/cv_TAdaConv_action-recognition'),
     Tasks.action_detection: (Pipelines.action_detection,
diff --git
a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index a8edc21a..5267b5b2 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -11,12 +11,13 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .document_segmentation_pipeline import DocumentSegmentationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline + from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline - from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline - from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline + from .passage_ranking_pipeline import PassageRankingPipeline + from .sentence_embedding_pipeline import SentenceEmbeddingPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .summarization_pipeline import SummarizationPipeline from .text_classification_pipeline import TextClassificationPipeline @@ -27,8 +28,7 @@ if TYPE_CHECKING: from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline - from .passage_ranking_pipeline import PassageRankingPipeline - from .sentence_embedding_pipeline import SentenceEmbeddingPipeline + else: _import_structure = { 'conversational_text_to_sql_pipeline': @@ -41,16 +41,15 @@ else: 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], 'document_segmentation_pipeline': ['DocumentSegmentationPipeline'], 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], + 'feature_extraction_pipeline': ['FeatureExtractionPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], 'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'], + 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': ['NamedEntityRecognitionPipeline'], - 'information_extraction_pipeline': ['InformationExtractionPipeline'], - 'pair_sentence_classification_pipeline': - ['PairSentenceClassificationPipeline'], + 'passage_ranking_pipeline': ['PassageRankingPipeline'], + 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], - 'single_sentence_classification_pipeline': - ['SingleSentenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], @@ -61,8 +60,6 @@ else: 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], - 'passage_ranking_pipeline': ['PassageRankingPipeline'], - 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'] } import sys diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py new file mode 100644 index 00000000..3af0c28d --- /dev/null +++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py @@ -0,0 +1,82 @@ +import os +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo 
import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import NLPPreprocessor, Preprocessor +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['FeatureExtractionPipeline'] + + +@PIPELINES.register_module( + Tasks.feature_extraction, module_name=Pipelines.feature_extraction) +class FeatureExtractionPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='sentence', + **kwargs): + """Use `model` and `preprocessor` to create an NLP feature extraction pipeline for prediction + + Args: + model (str or Model): Supply either a local model dir which supports the feature extraction task, or a + no-head model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the sentence in. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipe_ins = pipeline('feature-extraction', model='damo/nlp_structbert_feature-extraction_english-large') + >>> input = 'Everything you love is treasure' + >>> print(pipe_ins(input)) + + + """ + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + + if preprocessor is None: + preprocessor = NLPPreprocessor( + model.model_dir, + padding=kwargs.pop('padding', False), + sequence_length=kwargs.pop('sequence_length', 128)) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + self.preprocessor = preprocessor + self.config = Config.from_file( + os.path.join(model.model_dir, ModelFile.CONFIGURATION)) + self.tokenizer = preprocessor.tokenizer + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + """process the prediction results + + Args: + inputs (Dict[str, Tensor]): the model outputs, including the text embedding tensor + + Returns: + Dict[str, Any]: the prediction results + """ + + return { + OutputKeys.TEXT_EMBEDDING: + inputs[OutputKeys.TEXT_EMBEDDING].tolist() + } diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 12f4b80f..3d515e2d 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -10,7 +10,7 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor +from modelscope.preprocessors import NLPPreprocessor, Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -57,7 +57,7 @@ class FillMaskPipeline(Pipeline): model, Model) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( fill_mask_model.model_dir,
first_sequence=first_sequence, second_sequence=None, @@ -118,7 +118,10 @@ class FillMaskPipeline(Pipeline): logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) - model_type = self.model.config.model_type + if hasattr(self.model.config, 'backbone'): + model_type = self.model.config.backbone.type + else: + model_type = self.model.config.model_type process_type = model_type if model_type in self.mask_id else _type_map[ model_type] rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index 07223d07..763e941c 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -36,7 +36,7 @@ class InformationExtractionPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 467d7aba..7275feca 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NERPreprocessor, Preprocessor +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['NamedEntityRecognitionPipeline'] @@ -46,7 +47,7 @@ class NamedEntityRecognitionPipeline(Pipeline): model = model if isinstance(model, Model) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = NERPreprocessor( + preprocessor = TokenClassificationPreprocessor( model.model_dir, sequence_length=kwargs.pop('sequence_length', 512)) model.eval() diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py deleted file mode 100644 index bdb75c73..00000000 --- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -from typing import Union - -from modelscope.models.base import Model -from ...metainfo import Pipelines -from ...preprocessors import (PairSentenceClassificationPreprocessor, - Preprocessor) -from ...utils.constant import Tasks -from ..builder import PIPELINES -from .sequence_classification_pipeline_base import \ - SequenceClassificationPipelineBase - -__all__ = ['PairSentenceClassificationPipeline'] - - -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp pair sequence classification pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the sequence classification task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - second_sequence: The key to read the second sentence in. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - - NOTE: Inputs of type 'tuple' or 'list' are also supported. In this scenario, the 'first_sequence' and - 'second_sequence' param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='nli', model='damo/nlp_structbert_nli_chinese-base') - >>> sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' - >>> sentence2 = '四川商务职业学院商务管理在哪个校区?' - >>> print(pipeline_ins((sentence1, sentence2))) - >>> # Or use the dict input: - >>> print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2})) - - To view other examples plese check the tests/pipelines/test_nli.py. 
- """ - if preprocessor is None: - preprocessor = PairSentenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py index 7fe8aace..8d0e1dcd 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ -1,48 +1,64 @@ from typing import Any, Dict, Union import numpy as np +import torch from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import BertForSequenceClassification +from modelscope.models.base import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + SequenceClassificationPreprocessor) from modelscope.utils.constant import Tasks -__all__ = ['SequenceClassificationPipeline'] - @PIPELINES.register_module( Tasks.text_classification, module_name=Pipelines.sentiment_analysis) +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentiment_classification) class SequenceClassificationPipeline(Pipeline): def __init__(self, - model: Union[BertForSequenceClassification, str], - preprocessor: SequenceClassificationPreprocessor = None, + model: Union[Model, str], + preprocessor: Preprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """This is the base class for all the sequence classification sub-tasks. Args: - model (BertForSequenceClassification): a model instance - preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + model (str or Model): A model instance or a model local dir or a model id in the model hub. + preprocessor (Preprocessor): a preprocessor instance, must not be None. """ - assert isinstance(model, str) or isinstance(model, BertForSequenceClassification), \ - 'model must be a single str or BertForSequenceClassification' - sc_model = model if isinstance( - model, - BertForSequenceClassification) else Model.from_pretrained(model) + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + first_sequence = kwargs.pop('first_sequence', 'first_sequence') + second_sequence = kwargs.pop('second_sequence', None) + if preprocessor is None: preprocessor = SequenceClassificationPreprocessor( - sc_model.model_dir, - first_sequence='sentence', - second_sequence=None, + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence, + second_sequence=second_sequence, sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) - assert hasattr(self.model, 'id2label'), \ - 'id2label map should be initalizaed in init function.' 
+ assert preprocessor is not None + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -50,20 +66,18 @@ class SequenceClassificationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): input data dict - topk (int): return topk classification result. - + inputs (Dict[str, Any]): the model output dict containing the probabilities + topk (int): The number of top probabilities to take Returns: Dict[str, str]: the prediction results """ - # NxC np.ndarray - probs = inputs['probs'][0] + + probs = inputs[OutputKeys.PROBABILITIES][0] num_classes = probs.shape[0] topk = min(topk, num_classes) top_indices = np.argpartition(probs, -topk)[-topk:] cls_ids = top_indices[np.argsort(probs[top_indices])] probs = probs[cls_ids].tolist() - cls_names = [self.model.id2label[cid] for cid in cls_ids] - + cls_names = [self.id2label[cid] for cid in cls_ids] return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py deleted file mode 100644 index 3d8e8fea..00000000 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from ...preprocessors import Preprocessor -from ..base import Pipeline - - -class SequenceClassificationPipelineBase(Pipeline): - - def __init__(self, model: Union[Model, str], preprocessor: Preprocessor, - **kwargs): - """This is the base class for all the sequence classification sub-tasks. - - Args: - model (str or Model): A model instance or a model local dir or a model id in the model hub. - preprocessor (Preprocessor): a preprocessor instance, must not be None. - """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - assert preprocessor is not None - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.'
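A self-contained sketch (not part of the diff) of the top-k selection that postprocess performs above, run on a toy probability vector:

import numpy as np

probs = np.array([0.05, 0.6, 0.1, 0.25])  # toy class probabilities
topk = 2
top_indices = np.argpartition(probs, -topk)[-topk:]    # indices of the topk largest probs, unordered
cls_ids = top_indices[np.argsort(probs[top_indices])]  # same indices, ordered by ascending probability
print(cls_ids.tolist(), probs[cls_ids].tolist())       # -> [3, 1] [0.25, 0.6]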
- - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - topk (int): The topk probs to take - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs[OutputKeys.PROBABILITIES][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py deleted file mode 100644 index 0a2f6d25..00000000 --- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Union - -from ...metainfo import Pipelines -from ...models import Model -from ...preprocessors import (Preprocessor, - SingleSentenceClassificationPreprocessor) -from ...utils.constant import Tasks -from ..builder import PIPELINES -from .sequence_classification_pipeline_base import \ - SequenceClassificationPipelineBase - -__all__ = ['SingleSentenceClassificationPipeline'] - - -@PIPELINES.register_module( - Tasks.sentiment_classification, - module_name=Pipelines.sentiment_classification) -class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - first_sequence='first_sequence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp single sequence classification pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the sequence classification task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='sentiment-classification', - >>> model='damo/nlp_structbert_sentiment-classification_chinese-base') - >>> sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' - >>> print(pipeline_ins(sentence1)) - >>> # Or use the dict input: - >>> print(pipeline_ins({'first_sequence': sentence1})) - - To view other examples plese check the tests/pipelines/test_sentiment-classification.py. 
- """ - if preprocessor is None: - preprocessor = SingleSentenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index aabf48d8..5367c1a8 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -49,7 +49,7 @@ class TokenClassificationPipeline(Pipeline): text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): return { - **self.model(inputs, **forward_params), OutputKeys.TEXT: text + **self.model(**inputs, **forward_params), OutputKeys.TEXT: text } def postprocess(self, inputs: Dict[str, Any], diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index b4be1845..90303b65 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,17 +16,23 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - Tokenize, SequenceClassificationPreprocessor, - TextGenerationPreprocessor, TokenClassificationPreprocessor, - SingleSentenceClassificationPreprocessor, - PairSentenceClassificationPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor, - SequenceLabelingPreprocessor, RelationExtractionPreprocessor, - DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, - PassageRankingPreprocessor, SentenceEmbeddingPreprocessor, + DocumentSegmentationPreprocessor, + FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, + NLPPreprocessor, + NLPTokenizerPreprocessorBase, + PassageRankingPreprocessor, + RelationExtractionPreprocessor, + SentenceEmbeddingPreprocessor, + SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, + TextErrorCorrectionPreprocessor, + TextGenerationPreprocessor, Text2TextGenerationPreprocessor, - WordSegmentationBlankSetToLabelPreprocessor) + Tokenize, + WordSegmentationBlankSetToLabelPreprocessor, + ZeroShotClassificationPreprocessor, + ) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) @@ -49,18 +55,22 @@ else: 'kws': ['WavToLists'], 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'TextErrorCorrectionPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'TextErrorCorrectionPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 
'WordSegmentationBlankSetToLabelPreprocessor', - 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' + 'ZeroShotClassificationPreprocessor', ], 'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 8e75ae98..dfbb5c81 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -6,32 +6,41 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor from .nlp_base import ( - Tokenize, SequenceClassificationPreprocessor, - TextGenerationPreprocessor, TokenClassificationPreprocessor, - SingleSentenceClassificationPreprocessor, - Text2TextGenerationPreprocessor, - PairSentenceClassificationPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, - RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, - FillMaskPoNetPreprocessor, PassageRankingPreprocessor, + DocumentSegmentationPreprocessor, + FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, + NLPPreprocessor, + NLPTokenizerPreprocessorBase, + PassageRankingPreprocessor, + RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, - WordSegmentationBlankSetToLabelPreprocessor) + SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, + TextGenerationPreprocessor, + Text2TextGenerationPreprocessor, + Tokenize, + WordSegmentationBlankSetToLabelPreprocessor, + ZeroShotClassificationPreprocessor, + ) else: _import_structure = { 'nlp_base': [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', - 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' + 'ZeroShotClassificationPreprocessor', ], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index d6325eed..6b559de9 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -2,14 +2,13 @@ import os.path as osp import re -import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from transformers import AutoTokenizer, BertTokenizerFast +import torch +from transformers import AutoTokenizer from modelscope.metainfo import Models, Preprocessors -from modelscope.models.nlp.structbert import SbertTokenizerFast from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS @@ -23,24 
+22,21 @@ from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'Tokenize', + 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'NLPPreprocessor', + 'FillMaskPoNetPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', + 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', 'Text2TextGenerationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', - 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'SequenceLabelingPreprocessor', - 'RelationExtractionPreprocessor', - 'DocumentSegmentationPreprocessor', - 'FillMaskPoNetPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'WordSegmentationBlankSetToLabelPreprocessor', + 'ZeroShotClassificationPreprocessor', ] @@ -48,85 +44,19 @@ __all__ = [ class Tokenize(Preprocessor): def __init__(self, tokenizer_name) -> None: - self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: if isinstance(data, str): data = {InputFields.text: data} - token_dict = self._tokenizer(data[InputFields.text]) + token_dict = self.tokenizer(data[InputFields.text]) data.update(token_dict) return data -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -class SequenceClassificationPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - from easynlp.modelzoo import AutoTokenizer - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - print(f'this is the tokenzier {self.tokenizer}') - self.label2id = parse_label_mapping(self.model_dir) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - feature = super().__call__(data) - if isinstance(data, str): - new_data = {self.first_sequence: data} - elif isinstance(data, tuple): - sentence1, sentence2 = data - new_data = { - self.first_sequence: sentence1, - self.second_sequence: sentence2 - } - else: - new_data = data - - # preprocess the data for the model input - - rst = { - 'id': [], - 'input_ids': [], - 'attention_mask': [], - 'token_type_ids': [], - } - - max_seq_length = self.sequence_length - - text_a = new_data[self.first_sequence] - text_b = new_data.get(self.second_sequence, None) - - feature = self.tokenizer( - text_a, - text_b, - padding='max_length', - truncation=True, - max_length=max_seq_length) - - rst['id'].append(new_data.get('id', str(uuid.uuid4()))) - rst['input_ids'].append(feature['input_ids']) - rst['attention_mask'].append(feature['attention_mask']) - rst['token_type_ids'].append(feature['token_type_ids']) - return rst - - class NLPTokenizerPreprocessorBase(Preprocessor): - def __init__(self, model_dir: str, pair: bool, mode: str, **kwargs): + def __init__(self, 
model_dir: str, mode: str, **kwargs): """The NLP tokenizer preprocessor base class. Any nlp preprocessor which uses the hf tokenizer can inherit from this class. @@ -138,7 +68,6 @@ class NLPTokenizerPreprocessorBase(Preprocessor): label: The label key label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. - pair (bool): Pair sentence input or single sentence input. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode kwargs: These kwargs will be directly fed into the tokenizer. """ @@ -148,7 +77,8 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.first_sequence: str = kwargs.pop('first_sequence', 'first_sequence') self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.pair = pair + self.sequence_length = kwargs.pop('sequence_length', 128) + self._mode = mode self.label = kwargs.pop('label', OutputKeys.LABEL) self.label2id = None @@ -158,6 +88,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.label2id = parse_label_mapping(self.model_dir) self.tokenize_kwargs = kwargs + self.tokenizer = self.build_tokenizer(model_dir) @property @@ -179,20 +110,38 @@ class NLPTokenizerPreprocessorBase(Preprocessor): @param model_dir: The local model dir. @return: The initialized tokenizer. """ - + self.is_transformer_based_model = 'lstm' not in model_dir + # fast version lead to parallel inference failed model_type = get_model_type(model_dir) if model_type in (Models.structbert, Models.gpt3, Models.palm, Models.plug): - from modelscope.models.nlp.structbert import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir, use_fast=False) + from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast + return SbertTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else SbertTokenizerFast.from_pretrained( + model_dir) elif model_type == Models.veco: - from modelscope.models.nlp.veco import VecoTokenizer - return VecoTokenizer.from_pretrained(model_dir) + from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast + return VecoTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else VecoTokenizerFast.from_pretrained( + model_dir) elif model_type == Models.deberta_v2: - from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer - return DebertaV2Tokenizer.from_pretrained(model_dir) + from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast + return DebertaV2Tokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else DebertaV2TokenizerFast.from_pretrained( + model_dir) + elif not self.is_transformer_based_model: + from transformers import BertTokenizer, BertTokenizerFast + return BertTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else BertTokenizerFast.from_pretrained( + model_dir) else: - return AutoTokenizer.from_pretrained(model_dir, use_fast=False) + return AutoTokenizer.from_pretrained( + model_dir, + use_fast=False if self._mode == ModeKeys.INFERENCE else True) def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: """process the raw input data @@ -239,7 +188,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): if len(data) == 3: text_a, text_b, labels = data elif len(data) == 2: - if self.pair: + if self._mode == ModeKeys.INFERENCE: text_a, text_b = data else: text_a, labels = data @@ -277,6 +226,22 @@ class NLPTokenizerPreprocessorBase(Preprocessor): output[OutputKeys.LABELS] = 
labels +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class NLPPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in the fill-mask and feature extraction tasks. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.passage_ranking) class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase): @@ -337,22 +302,12 @@ Fields.nlp, module_name=Preprocessors.nli_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in pair sentence classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=True, mode=mode, **kwargs) - - +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in single sentence classification. +class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sequence classification.
""" def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): @@ -360,7 +315,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): kwargs['padding'] = kwargs.get( 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) @PREPROCESSORS.register_module( @@ -421,7 +376,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): model_dir (str): model path """ self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) def __call__(self, data: Union[str, Dict], hypothesis_template: str, candidate_labels: list) -> Dict[str, Any]: @@ -496,14 +451,12 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): tokenizer=None, mode=ModeKeys.INFERENCE, **kwargs): - self.tokenizer = self.build_tokenizer( - model_dir) if tokenizer is None else tokenizer kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) @staticmethod def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: @@ -541,20 +494,6 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): } -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) @@ -592,21 +531,40 @@ class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.ner_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in normal token classification task. + """The tokenizer preprocessor used in normal NER task. 
""" def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get( 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + if 'is_split_into_words' in kwargs: + self.is_split_into_words = kwargs.pop('is_split_into_words') + else: + self.is_split_into_words = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) + if 'label2id' in kwargs: + kwargs.pop('label2id') + self.tokenize_kwargs = kwargs + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: """process the raw input data Args: @@ -618,23 +576,84 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): Dict[str, Any]: the preprocessed data """ - text_a = None + # preprocess the data for the model input + text = None labels_list = None if isinstance(data, str): - text_a = data + text = data elif isinstance(data, dict): - text_a = data.get(self.first_sequence) + text = data.get(self.first_sequence) labels_list = data.get(self.label) - if isinstance(text_a, str): - text_a = text_a.replace(' ', '').strip() + input_ids = [] + label_mask = [] + offset_mapping = [] + if self.is_split_into_words: + for offset, token in enumerate(list(data)): + subtoken_ids = self.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + else: + if self.tokenizer.is_fast: + encodings = self.tokenizer( + text, + add_special_tokens=False, + return_offsets_mapping=True, + **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + word_ids = encodings.word_ids() + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(0) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(0) + offset_mapping[-1] = ( + offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) + else: + label_mask.append(1) + offset_mapping.append(encodings['offset_mapping'][i]) + else: + encodings = self.tokenizer( + text, add_special_tokens=False, **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( + text) + + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] - tokenized_inputs = self.tokenizer( - [t for t in text_a], - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - is_split_into_words=True, - **self.tokenize_kwargs) + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] + + if self._mode == ModeKeys.INFERENCE: + input_ids = torch.tensor(input_ids).unsqueeze(0) + attention_mask = torch.tensor(attention_mask).unsqueeze(0) + label_mask = 
torch.tensor( + label_mask, dtype=torch.bool).unsqueeze(0) + + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + # align the labels with tokenized text if labels_list is not None: assert self.label2id is not None # Map that sends B-Xxx label to its I-Xxx counterpart @@ -653,7 +672,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): b_to_i_label.append(idx) label_row = [self.label2id[lb] for lb in labels_list] - word_ids = tokenized_inputs.word_ids() previous_word_idx = None label_ids = [] for word_idx in word_ids: @@ -668,229 +686,66 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_ids.append(-100) previous_word_idx = word_idx labels = label_ids - tokenized_inputs['labels'] = labels - # new code end - - if self._mode == ModeKeys.INFERENCE: - tokenized_inputs[OutputKeys.TEXT] = text_a - return tokenized_inputs - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.ner_tokenizer) -class NERPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal NER task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - self.is_transformer_based_model = 'lstm' not in model_dir - if self.is_transformer_based_model: - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=True) - else: - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, use_fast=True) - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ + output['labels'] = labels + return output - # preprocess the data for the model input - text = data - if self.is_split_into_words: - input_ids = [] - label_mask = [] - offset_mapping = [] - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)] - + [(offset + 1, offset + 1)] - * (len(subtoken_ids) - 1)) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - offset_mapping = offset_mapping[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - else: - encodings = self.tokenizer( - text, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_offsets_mapping=True) - input_ids = encodings['input_ids'] - attention_mask = encodings['attention_mask'] - word_ids = encodings.word_ids() - label_mask = [] - offset_mapping = [] - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = (offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class + + def get_label_mask_and_offset_mapping(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.tokenizer.tokenize(text) + offset = 0 + if self.get_tokenizer_class() == 'BertTokenizer': + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class SequenceLabelingPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal NER task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. 
- """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - - if 'lstm' in model_dir or 'gcnn' in model_dir: - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, use_fast=False) - elif 'structbert' in model_dir: - self.tokenizer = SbertTokenizerFast.from_pretrained( - model_dir, use_fast=False) - else: - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=False) - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' - - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - if self.is_split_into_words: - input_ids = [] - label_mask = [] - offset_mapping = [] - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)] - + [(offset + 1, offset + 1)] - * (len(subtoken_ids) - 1)) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - offset_mapping = offset_mapping[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - else: - encodings = self.tokenizer( - text, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_offsets_mapping=True) - input_ids = encodings['input_ids'] - attention_mask = encodings['attention_mask'] - word_ids = encodings.word_ids() - label_mask = [] - offset_mapping = [] - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = (offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + last_is_blank = False + else: + raise NotImplementedError - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - return { - 'text': text, - 'input_ids': 
input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } + return label_mask, offset_mapping @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.re_tokenizer) class RelationExtractionPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal RE task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. + """The relation extraction preprocessor used in normal RE task. """ def __init__(self, model_dir: str, *args, **kwargs): @@ -937,7 +792,7 @@ class FaqQuestionAnsweringPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, pair=False, mode=ModeKeys.INFERENCE, **kwargs) + model_dir, mode=ModeKeys.INFERENCE, **kwargs) import os from transformers import BertTokenizer @@ -1026,7 +881,7 @@ class DocumentSegmentationPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) - + from transformers import BertTokenizerFast self.tokenizer = BertTokenizerFast.from_pretrained( model_dir, use_fast=True, diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 75add1d9..b19c0fce 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -115,6 +115,7 @@ class NLPTasks(object): conversational_text_to_sql = 'conversational-text-to-sql' information_extraction = 'information-extraction' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' class AudioTasks(object): diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 3cf88114..7a9c79e2 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -74,7 +74,6 @@ class Registry(object): raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') self._modules[group_key][module_name] = module_cls - module_cls.group_key = group_key def register_module(self, group_key: str = default_group, @@ -196,6 +195,7 @@ def build_from_cfg(cfg, if obj_cls is None: raise KeyError(f'{obj_type} is not in the {registry.name}' f' registry group {group_key}') + obj_cls.group_key = group_key elif inspect.isclass(obj_type) or inspect.isfunction(obj_type): obj_cls = obj_type else: diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 762530f4..91a3b5c5 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -75,7 +75,8 @@ class MsDatasetTest(unittest.TestCase): preprocessor = SequenceClassificationPreprocessor( nlp_model.model_dir, first_sequence='premise', - second_sequence=None) + second_sequence=None, + padding='max_length') ms_ds_train = MsDataset.load( 'xcopa', subset_name='translation-et', diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py index 4f3206cd..549d2cb3 100644 --- a/tests/pipelines/test_deberta_tasks.py +++ b/tests/pipelines/test_deberta_tasks.py @@ -6,11 +6,9 @@ import torch from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import DebertaV2ForMaskedLM -from modelscope.models.nlp.deberta_v2 import (DebertaV2Tokenizer, - DebertaV2TokenizerFast) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import NLPPreprocessor from modelscope.utils.constant import 
Tasks from modelscope.utils.test_utils import test_level @@ -24,7 +22,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id_deberta) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = DebertaV2ForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -40,7 +38,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): # sbert print(self.model_id_deberta) model = Model.from_pretrained(self.model_id_deberta) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( task=Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py new file mode 100644 index 00000000..39291e76 --- /dev/null +++ b/tests/pipelines/test_feature_extraction.py @@ -0,0 +1,67 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import numpy as np + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import FeatureExtractionModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FeatureExtractionPipeline +from modelscope.preprocessors import NLPPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class FeatureExtractionTaskModelTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.feature_extraction + self.model_id = 'damo/pert_feature-extraction_base-test' + + sentence1 = '测试embedding' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = NLPPreprocessor(cache_path, padding=False) + model = FeatureExtractionModel.from_pretrained(self.model_id) + pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.feature_extraction, model=model, preprocessor=tokenizer) + result = pipeline1(input=self.sentence1) + + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{np.shape(result[OutputKeys.TEXT_EMBEDDING])}') + result = pipeline2(input=self.sentence1) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1: {np.shape(result[OutputKeys.TEXT_EMBEDDING])}') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = NLPPreprocessor(model.model_dir, padding=False) + pipeline_ins = pipeline( + task=Tasks.feature_extraction, model=model, preprocessor=tokenizer) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.feature_extraction, model=self.model_id) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') 
+ def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.feature_extraction) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index cec8966f..0e5e242b 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -1,13 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import unittest +from regex import R + from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM, VecoForMaskedLM) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import NLPPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -51,7 +53,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # sbert for language in ['zh']: model_dir = snapshot_download(self.model_id_sbert[language]) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = StructBertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -66,7 +68,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model_dir = snapshot_download(self.model_id_veco) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -80,13 +82,28 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' ) + # bert + language = 'zh' + model_dir = snapshot_download(self.model_id_bert, revision='beta') + preprocessor = NLPPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = Model.from_pretrained(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language] + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): + # sbert for language in ['zh']: print(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language]) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) @@ -100,7 +117,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model = Model.from_pretrained(self.model_id_veco) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) @@ -113,6 +130,18 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' 
f'{pipeline_ins(test_input)}\n') + # bert + language = 'zh' + model = Model.from_pretrained(self.model_id_bert, revision='beta') + preprocessor = NLPPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco @@ -131,6 +157,16 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') + # Bert + language = 'zh' + pipeline_ins = pipeline( + task=Tasks.fill_mask, + model=self.model_id_bert, + model_revision='beta') + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.fill_mask) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 9fae2d09..3658cf3f 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -7,7 +7,7 @@ from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline -from modelscope.preprocessors import NERPreprocessor +from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -26,7 +26,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): cache_path = snapshot_download(self.tcrf_model_id) - tokenizer = NERPreprocessor(cache_path) + tokenizer = TokenClassificationPreprocessor(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -43,7 +43,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_by_direct_model_download(self): cache_path = snapshot_download(self.lcrf_model_id) - tokenizer = NERPreprocessor(cache_path) + tokenizer = TokenClassificationPreprocessor(cache_path) model = LSTMCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -60,7 +60,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_tcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.tcrf_model_id) - tokenizer = NERPreprocessor(model.model_dir) + tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -70,7 +70,7 @@ class
NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.lcrf_model_id) - tokenizer = NERPreprocessor(model.model_dir) + tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index a53ac3b3..db4b9912 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import PairSentenceClassificationPipeline -from modelscope.preprocessors import PairSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -26,9 +26,9 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(cache_path) + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = PairSentenceClassificationPipeline( + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' @@ -40,7 +40,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 4079455d..288d38c7 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import PairSentenceClassificationPipeline -from modelscope.preprocessors import PairSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -26,9 +26,9 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip 
test in current test level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(cache_path) + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = PairSentenceClassificationPipeline( + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, model=model, preprocessor=tokenizer) @@ -43,7 +43,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 3db9971a..d0b1b40f 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -6,8 +6,8 @@ from modelscope.models import Model from modelscope.models.nlp.task_models.sequence_classification import \ SequenceClassificationModel from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline -from modelscope.preprocessors import SingleSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -17,23 +17,21 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.sentiment_classification + self.task = Tasks.text_classification self.model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): - cache_path = snapshot_download(self.model_id) - tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + cache_path = snapshot_download(self.model_id, revision='beta') + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( - self.model_id, num_labels=2) - pipeline1 = SingleSentenceClassificationPipeline( + self.model_id, num_labels=2, revision='beta') + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( - Tasks.sentiment_classification, - model=model, - preprocessor=tokenizer) + Tasks.text_classification, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' f'pipeline1:{pipeline1(input=self.sentence1)}') print(f'sentence1: {self.sentence1}\n' @@ -41,10 +39,10 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) - tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) + model = Model.from_pretrained(self.model_id, revision='beta') + tokenizer = 
SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( - task=Tasks.sentiment_classification, + task=Tasks.text_classification, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence1)) @@ -54,14 +52,17 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.sentiment_classification, model=self.model_id) + task=Tasks.text_classification, + model=self.model_id, + model_revision='beta') print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline(task=Tasks.sentiment_classification) + pipeline_ins = pipeline( + task=Tasks.text_classification, model_revision='beta') print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py index 71b9f3e2..39dbac99 100644 --- a/tests/pipelines/test_text_classification.py +++ b/tests/pipelines/test_text_classification.py @@ -12,6 +12,7 @@ from modelscope.utils.test_utils import test_level class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + sentence1 = 'i like this wonderful place' def setUp(self) -> None: self.model_id = 'damo/bert-base-sst2' @@ -46,7 +47,8 @@ class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.text_classification, model=model, preprocessor=preprocessor) - self.predict(pipeline_ins) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline_ins(input=self.sentence1)}') # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @unittest.skip('nlp model does not support tensor input, skipped') diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py index 4271e201..f9f4d93f 100644 --- a/tests/preprocessors/test_nlp.py +++ b/tests/preprocessors/test_nlp.py @@ -32,6 +32,82 @@ class NLPPreprocessorTest(unittest.TestCase): output['attention_mask'], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + def test_token_classification_tokenize(self): + with self.subTest(tokenizer_type='bert'): + cfg = dict( + type='token-cls-tokenizer', + model_dir='bert-base-cased', + label2id={ + 'O': 0, + 'B': 1, + 'I': 2 + }) + preprocessor = build_preprocessor(cfg, Fields.nlp) + input = 'Do not meddle in the affairs of wizards, ' \ + 'for they are subtle and quick to anger.' 
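Note: the token-classification preprocessor exercised by the new test above is constructed from a plain config dict through the preprocessor registry rather than imported directly. A minimal sketch under the same assumptions as the 'bert' sub-test: 'token-cls-tokenizer' as the registered type, a hub-resolvable 'bert-base-cased' checkpoint, and build_preprocessor living in modelscope.preprocessors.builder (as the test module implies):

from modelscope.preprocessors.builder import build_preprocessor
from modelscope.utils.constant import Fields

cfg = dict(
    type='token-cls-tokenizer',         # registered preprocessor name
    model_dir='bert-base-cased',        # tokenizer checkpoint to load
    label2id={'O': 0, 'B': 1, 'I': 2})  # task label mapping
preprocessor = build_preprocessor(cfg, Fields.nlp)
output = preprocessor('Do not meddle in the affairs of wizards.')
# Besides input_ids and attention_mask, the output carries label_mask and
# offset_mapping, which let token-level predictions be projected back onto
# character spans of the raw text, as the assertions below check.
print(output['offset_mapping'])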
+ output = preprocessor(input) + self.assertTrue(InputFields.text in output) + self.assertEqual(output['input_ids'].tolist()[0], [ + 101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, + 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, + 119, 102 + ]) + self.assertEqual(output['attention_mask'].tolist()[0], [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1 + ]) + self.assertEqual(output['label_mask'].tolist()[0], [ + False, True, True, True, False, True, True, True, True, True, + False, True, True, True, True, True, True, True, True, True, + True, False + ]) + self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6), + (7, 13), (14, 16), + (17, 20), (21, 28), + (29, 31), (32, 39), + (39, 40), (41, 44), + (45, 49), (50, 53), + (54, 60), (61, 64), + (65, 70), (71, 73), + (74, 79), (79, 80)]) + + with self.subTest(tokenizer_type='roberta'): + cfg = dict( + type='token-cls-tokenizer', + model_dir='xlm-roberta-base', + label2id={ + 'O': 0, + 'B': 1, + 'I': 2 + }) + preprocessor = build_preprocessor(cfg, Fields.nlp) + input = 'Do not meddle in the affairs of wizards, ' \ + 'for they are subtle and quick to anger.' + output = preprocessor(input) + self.assertTrue(InputFields.text in output) + self.assertEqual(output['input_ids'].tolist()[0], [ + 0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239, + 99397, 4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56, + 5, 2 + ]) + self.assertEqual(output['attention_mask'].tolist()[0], [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + ]) + self.assertEqual(output['label_mask'].tolist()[0], [ + False, True, True, True, False, True, True, True, False, True, + True, False, False, False, True, True, True, True, False, True, + True, True, True, False, False, False + ]) + self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6), + (7, 13), (14, 16), + (17, 20), (21, 28), + (29, 31), (32, 40), + (41, 44), (45, 49), + (50, 53), (54, 60), + (61, 64), (65, 70), + (71, 73), (74, 80)]) + if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index de99a7b8..9a8ab828 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -30,7 +30,7 @@ class AstScaningTest(unittest.TestCase): def test_ast_scaning_class(self): astScaner = AstScaning() pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp', - 'sequence_classification_pipeline.py') + 'text_generation_pipeline.py') output = astScaner.generate_ast(pipeline_file) self.assertTrue(output['imports'] is not None) self.assertTrue(output['from_imports'] is not None) @@ -40,14 +40,12 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(imports, dict) self.assertIsInstance(from_imports, dict) self.assertIsInstance(decorators, list) - self.assertListEqual( - list(set(imports.keys()) - set(['typing', 'numpy'])), []) - self.assertEqual(len(from_imports.keys()), 9) + self.assertListEqual(list(set(imports.keys()) - set(['torch'])), []) + self.assertEqual(len(from_imports.keys()), 7) self.assertTrue(from_imports['modelscope.metainfo'] is not None) self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines']) - self.assertEqual( - decorators, - [('PIPELINES', 'text-classification', 'sentiment-analysis')]) + self.assertEqual(decorators, + [('PIPELINES', 'text-generation', 'text-generation')]) def test_files_scaning_method(self): fileScaner = FilesAstScaning()
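Note: the AST scanning utility touched by the last hunk statically parses a pipeline module and reports its imports and its register_module decorators, which is how modules are indexed without being imported. A rough usage sketch, assuming AstScaning is exposed from modelscope.utils.ast_utils as in this test module:

import os.path as osp

from modelscope.utils.ast_utils import AstScaning

scaner = AstScaning()
pipeline_file = osp.join('modelscope', 'pipelines', 'nlp',
                         'text_generation_pipeline.py')
output = scaner.generate_ast(pipeline_file)
# 'imports' and 'from_imports' map module names to the symbols they bring in;
# 'decorators' is a list of (registry, task, module_name) tuples such as
# ('PIPELINES', 'text-generation', 'text-generation').
print(list(output['imports'].keys()))
print(output['decorators'])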