diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index a85d787f..ed753e50 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030 -size 63349 +oid sha256:3b38bfb5a851d35d5fba4d59eda926557666dbd62c70e3e3b24c22605e7d9c4a +size 40771 diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py index 7cee331b..7a11f73a 100644 --- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py +++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py @@ -7,7 +7,8 @@ from torch.utils.data.dataloader import default_collate from modelscope.exporters.builder import EXPORTERS from modelscope.exporters.torch_model_exporter import TorchModelExporter from modelscope.metainfo import Models -from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.preprocessors import ( + TextClassificationTransformersPreprocessor, build_preprocessor) from modelscope.utils.config import Config from modelscope.utils.constant import ModeKeys, Tasks @@ -59,12 +60,13 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): 'mode': ModeKeys.TRAIN, **sequence_length }) - preprocessor: Preprocessor = build_preprocessor(cfg, field_name) + preprocessor: TextClassificationTransformersPreprocessor = build_preprocessor( + cfg, field_name) if pair: - first_sequence = preprocessor.tokenizer.unk_token - second_sequence = preprocessor.tokenizer.unk_token + first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token + second_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token else: - first_sequence = preprocessor.tokenizer.unk_token + first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token second_sequence = None batched = [] diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index 1fe1c329..dc11c3d8 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -19,18 +19,27 @@ from .builder import METRICS, MetricKeys class SequenceClassificationMetric(Metric): """The metric computation class for sequence classification tasks. - This metric class calculates accuracy of the whole input batches. + This metric class calculates accuracy/F1 of all the input batches. + + Args: + label_name: The key of label column in the 'inputs' arg. + logit_name: The key of logits column in the 'inputs' arg. 
""" - def __init__(self, *args, **kwargs): + def __init__(self, + label_name=OutputKeys.LABELS, + logit_name=OutputKeys.LOGITS, + *args, + **kwargs): super().__init__(*args, **kwargs) self.preds = [] self.labels = [] + self.label_name = label_name + self.logit_name = logit_name def add(self, outputs: Dict, inputs: Dict): - label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS - ground_truths = inputs[label_name] - eval_results = outputs[OutputKeys.LOGITS] + ground_truths = inputs[self.label_name] + eval_results = outputs[self.logit_name] self.preds.append( torch_nested_numpify(torch_nested_detach(eval_results))) self.labels.append( diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index 08df5235..3d6e6964 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -18,16 +18,22 @@ class TextGenerationMetric(Metric): """The metric computation class for text generation classes. This metric class calculates F1 of the rouge scores for the whole evaluation dataset. + + Args: + target_text: The key of the target text column in the `inputs` arg. + pred_text: The key of the predicted text column in the `outputs` arg. """ - def __init__(self): + def __init__(self, target_text='tgts', pred_text='preds'): self.preds: List[str] = [] self.tgts: List[str] = [] self.rouge = Rouge() + self.target_text = target_text + self.pred_text = pred_text def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]): - ground_truths = inputs['tgts'] - eval_results = outputs['preds'] + ground_truths = inputs[self.target_text] + eval_results = outputs[self.pred_text] for truth in ground_truths: self.tgts.append(rebuild_chinese_str(truth)) for result in eval_results: diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index f8595fc1..5d1ece4a 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -21,20 +21,16 @@ class TokenClassificationMetric(Metric): This metric class uses seqeval to calculate the scores. Args: - return_entity_level_metrics (bool, *optional*): + label_name(str, `optional`): The key of label column in the 'inputs' arg. + logit_name(str, `optional`): The key of logits column in the 'inputs' arg. + return_entity_level_metrics (bool, `optional`): Whether to return every label's detail metrics, default False. + label2id(dict, `optional`): The label2id information to get the token labels. 
""" - def add(self, outputs: Dict, inputs: Dict): - label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS - ground_truths = inputs[label_name] - eval_results = outputs[OutputKeys.LOGITS] - self.preds.append( - torch_nested_numpify(torch_nested_detach(eval_results))) - self.labels.append( - torch_nested_numpify(torch_nested_detach(ground_truths))) - def __init__(self, + label_name=OutputKeys.LABELS, + logit_name=OutputKeys.LOGITS, return_entity_level_metrics=False, label2id=None, *args, @@ -44,6 +40,16 @@ class TokenClassificationMetric(Metric): self.preds = [] self.labels = [] self.label2id = label2id + self.label_name = label_name + self.logit_name = logit_name + + def add(self, outputs: Dict, inputs: Dict): + ground_truths = inputs[self.label_name] + eval_results = outputs[self.logit_name] + self.preds.append( + torch_nested_numpify(torch_nested_detach(eval_results))) + self.labels.append( + torch_nested_numpify(torch_nested_detach(ground_truths))) def evaluate(self): label2id = self.label2id diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 1f464bf3..94757641 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -6,7 +6,8 @@ from typing import Any, Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model -from modelscope.utils.checkpoint import save_checkpoint, save_pretrained +from modelscope.utils.checkpoint import (save_checkpoint, save_configuration, + save_pretrained) from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile from modelscope.utils.device import verify_device @@ -129,11 +130,9 @@ class Model(ABC): model_cfg[k] = v if device is not None: model_cfg.device = device - model = build_model( - model_cfg, task_name=task_name, default_args=kwargs) + model = build_model(model_cfg, task_name=task_name) else: - model = build_model( - model_cfg, task_name=task_name, default_args=kwargs) + model = build_model(model_cfg, task_name=task_name) # dynamically add pipeline info to model for pipeline inference if hasattr(cfg, 'pipeline'): @@ -142,6 +141,7 @@ class Model(ABC): if not hasattr(model, 'cfg'): model.cfg = cfg + model_cfg.pop('model_dir', None) model.name = model_name_or_path model.model_dir = local_model_dir return model @@ -151,6 +151,7 @@ class Model(ABC): save_checkpoint_names: Union[str, List[str]] = None, save_function: Callable = save_checkpoint, config: Optional[dict] = None, + save_config_function: Callable = save_configuration, **kwargs): """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded @@ -168,18 +169,15 @@ class Model(ABC): config (Optional[dict], optional): The config for the configuration.json, might not be identical with model.config + save_config_function (Callble, optional): + The function to use to save the configuration. + """ if config is None and hasattr(self, 'cfg'): config = self.cfg - assert config is not None, 'Cannot save the model because the model config is empty.' 
- if isinstance(config, Config): - config = config.to_dict() - if 'preprocessor' in config and config['preprocessor'] is not None: - if 'mode' in config['preprocessor']: - config['preprocessor']['mode'] = 'inference' - elif 'val' in config['preprocessor'] and 'mode' in config[ - 'preprocessor']['val']: - config['preprocessor']['val']['mode'] = 'inference' + + if config is not None: + save_config_function(target_folder, config) save_pretrained(self, target_folder, save_checkpoint_names, - save_function, config, **kwargs) + save_function, **kwargs) diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index 3c99a1f2..ff059f7b 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -6,6 +6,7 @@ import torch from torch import nn from modelscope.utils.file_utils import func_receive_dict_inputs +from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger from .base_model import Model diff --git a/modelscope/models/nlp/T5/backbone.py b/modelscope/models/nlp/T5/backbone.py index 9a46d980..e8abfbae 100644 --- a/modelscope/models/nlp/T5/backbone.py +++ b/modelscope/models/nlp/T5/backbone.py @@ -36,9 +36,7 @@ from transformers.utils.model_parallel_utils import (assert_device_map, from modelscope.metainfo import Models from modelscope.models.base import Model, Tensor, TorchModel from modelscope.models.builder import MODELS -from modelscope.outputs import (BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqModelOutput) +from modelscope.outputs import AttentionBackboneModelOutput, Seq2SeqModelOutput from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from .configuration import T5Config @@ -1182,7 +1180,7 @@ class T5Stack(T5PreTrainedModel): all_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=present_key_value_states, hidden_states=all_hidden_states, @@ -1475,8 +1473,9 @@ class T5Model(T5PreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( + elif return_dict and not isinstance(encoder_outputs, + AttentionBackboneModelOutput): + encoder_outputs = AttentionBackboneModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, diff --git a/modelscope/models/nlp/T5/text2text_generation.py b/modelscope/models/nlp/T5/text2text_generation.py index c4dcdfdb..0275ecb9 100644 --- a/modelscope/models/nlp/T5/text2text_generation.py +++ b/modelscope/models/nlp/T5/text2text_generation.py @@ -24,7 +24,8 @@ from transformers.utils.model_parallel_utils import (assert_device_map, from modelscope.metainfo import Models from modelscope.models.builder import MODELS -from modelscope.outputs import BaseModelOutput, Seq2SeqLMOutput +from modelscope.outputs import (AttentionBackboneModelOutput, Seq2SeqLMOutput, + TokenGeneratorOutput) from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from .backbone import T5PreTrainedModel, T5Stack @@ -311,8 +312,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - 
encoder_outputs = BaseModelOutput( + elif return_dict and not isinstance(encoder_outputs, + AttentionBackboneModelOutput): + encoder_outputs = AttentionBackboneModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, @@ -426,6 +428,16 @@ class T5ForConditionalGeneration(T5PreTrainedModel): def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) + def generate( + self, + *args, + **kwargs, + ): + output = super().generate(*args, **kwargs) + return TokenGeneratorOutput( + sequences=output if isinstance(output, torch.Tensor) else output[0] + ) + def _reorder_cache(self, past, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index ef2dc424..26205bcb 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -30,9 +30,7 @@ if TYPE_CHECKING: SbertForMaskedLM, SbertForSequenceClassification, SbertForTokenClassification, - SbertTokenizer, SbertModel, - SbertTokenizerFast, ) from .T5 import T5ForConditionalGeneration from .mglm import MGLMForTextSummarization @@ -51,8 +49,7 @@ if TYPE_CHECKING: ) from .veco import (VecoConfig, VecoForMaskedLM, VecoForSequenceClassification, - VecoForTokenClassification, VecoModel, VecoTokenizer, - VecoTokenizerFast) + VecoForTokenClassification, VecoModel) from .bloom import BloomModel else: _import_structure = { @@ -66,8 +63,6 @@ else: 'SbertForMaskedLM', 'SbertForSequenceClassification', 'SbertForTokenClassification', - 'SbertTokenizer', - 'SbertTokenizerFast', 'SbertModel', ], 'veco': [ @@ -76,8 +71,6 @@ else: 'VecoForSequenceClassification', 'VecoForTokenClassification', 'VecoModel', - 'VecoTokenizer', - 'VecoTokenizerFast', ], 'bert': [ 'BertForMaskedLM', diff --git a/modelscope/models/nlp/bart/text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py index 27abedb5..ab765190 100644 --- a/modelscope/models/nlp/bart/text_error_correction.py +++ b/modelscope/models/nlp/bart/text_error_correction.py @@ -7,6 +7,7 @@ import torch.cuda from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.outputs import TextErrorCorrectionOutput from modelscope.utils.constant import ModelFile, Tasks __all__ = ['BartForTextErrorCorrection'] @@ -55,7 +56,7 @@ class BartForTextErrorCorrection(TorchModel): self.task = task - def forward(self, input: Dict[str, Dict]) -> Dict[str, Any]: + def forward(self, input: Dict[str, Dict]) -> TextErrorCorrectionOutput: """return the result by the model Args: @@ -91,4 +92,4 @@ class BartForTextErrorCorrection(TorchModel): # get 1-best List[Tensor] preds = translations[0][0]['tokens'] - return {'predictions': preds} + return TextErrorCorrectionOutput(predictions=preds) diff --git a/modelscope/models/nlp/bert/backbone.py b/modelscope/models/nlp/bert/backbone.py index df0aebd2..bd432509 100755 --- a/modelscope/models/nlp/bert/backbone.py +++ b/modelscope/models/nlp/bert/backbone.py @@ -16,9 +16,6 @@ """PyTorch BERT model. 
""" import math -import os -from dataclasses import dataclass -from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -33,11 +30,10 @@ from transformers.modeling_utils import (PreTrainedModel, from modelscope.metainfo import Models from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS -from modelscope.outputs import (BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions) +from modelscope.outputs import AttentionBackboneModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import BertConfig logger = get_logger(__name__) @@ -562,7 +558,7 @@ class BertEncoder(nn.Module): all_self_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, @@ -639,30 +635,15 @@ class BertPreTrainedModel(TorchModel, PreTrainedModel): The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ - model_dir = kwargs.get('model_dir', None) + model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) if model_dir is None: - config = BertConfig(**kwargs) + config = BertConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + pretrained_model_name_or_path=model_dir, **model_args) model.model_dir = model_dir return model @@ -750,7 +731,7 @@ class BertModel(BertPreTrainedModel): output_attentions=None, output_hidden_states=None, return_dict=None, - **kwargs): + **kwargs) -> AttentionBackboneModelOutput: r""" Args: input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`): @@ -936,7 +917,7 @@ class BertModel(BertPreTrainedModel): if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, diff --git a/modelscope/models/nlp/bert/document_segmentation.py b/modelscope/models/nlp/bert/document_segmentation.py index ca27a166..36c39f43 100644 --- a/modelscope/models/nlp/bert/document_segmentation.py +++ b/modelscope/models/nlp/bert/document_segmentation.py @@ -5,37 +5,22 @@ from typing import Any, Dict import torch from torch import nn from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import TokenClassifierOutput -from transformers.models.bert.modeling_bert import (BertModel, - BertPreTrainedModel) from 
modelscope.metainfo import Models -from modelscope.models.base import Model +from modelscope.models import Model from modelscope.models.builder import MODELS +from modelscope.models.nlp.ponet import PoNetConfig +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel +from .configuration import BertConfig __all__ = ['BertForDocumentSegmentation'] @MODELS.register_module( Tasks.document_segmentation, module_name=Models.bert_for_ds) -class BertForDocumentSegmentation(Model): - - def __init__(self, model_dir: str, model_config: Dict[str, Any], *args, - **kwargs): - super().__init__(model_dir, model_config, *args, **kwargs) - self.model_cfg = model_config - - def build_with_config(self, config): - self.bert_model = BertForDocumentSegmentationBase.from_pretrained( - self.model_dir, from_tf=False, config=config) - return self.bert_model - - def forward(self) -> Dict[str, Any]: - return self.model_cfg - - -class BertForDocumentSegmentationBase(BertPreTrainedModel): +class BertForDocumentSegmentation(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r'pooler'] @@ -103,9 +88,25 @@ class BertForDocumentSegmentationBase(BertPreTrainedModel): output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + @classmethod + def _instantiate(cls, model_dir, model_config: Dict[str, Any], **kwargs): + if model_config['type'] == 'bert': + config = BertConfig.from_pretrained(model_dir, num_labels=2) + elif model_config['type'] == 'ponet': + config = PoNetConfig.from_pretrained(model_dir, num_labels=2) + else: + raise ValueError( + f'Expected config type bert and ponet, which is : {model_config["type"]}' + ) + model = super(Model, cls).from_pretrained( + model_dir, from_tf=False, config=config) + model.model_dir = model_dir + model.model_cfg = model_config + return model diff --git a/modelscope/models/nlp/bert/fill_mask.py b/modelscope/models/nlp/bert/fill_mask.py index 4f81f62d..1f44365c 100644 --- a/modelscope/models/nlp/bert/fill_mask.py +++ b/modelscope/models/nlp/bert/fill_mask.py @@ -121,7 +121,7 @@ class BertForMaskedLM(BertPreTrainedModel): Preprocessor: This is the fill_mask model of Structbert, the preprocessor of this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. Parameters: config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with diff --git a/modelscope/models/nlp/bert/text_classification.py b/modelscope/models/nlp/bert/text_classification.py index b1d18d0f..ff4a2418 100644 --- a/modelscope/models/nlp/bert/text_classification.py +++ b/modelscope/models/nlp/bert/text_classification.py @@ -51,7 +51,7 @@ class BertForSequenceClassification(BertPreTrainedModel): Preprocessor: This is the fill_mask model of Bert, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`. 
Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, diff --git a/modelscope/models/nlp/bert/token_classification.py b/modelscope/models/nlp/bert/token_classification.py index 5dc6b0ce..15ea3231 100644 --- a/modelscope/models/nlp/bert/token_classification.py +++ b/modelscope/models/nlp/bert/token_classification.py @@ -22,7 +22,7 @@ from torch.nn import CrossEntropyLoss from modelscope.metainfo import Models from modelscope.models.builder import MODELS -from modelscope.outputs import TokenClassifierOutput +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils import logger as logging from modelscope.utils.constant import Tasks from .backbone import BertModel, BertPreTrainedModel @@ -47,7 +47,7 @@ class BertForTokenClassification(BertPreTrainedModel): Preprocessor: This is the fill_mask model of Bert, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TokenClassificationTransformersPreprocessor`. Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, @@ -169,7 +169,7 @@ class BertForTokenClassification(BertPreTrainedModel): - 0 for tokens that are **masked**. Returns: - Returns `modelscope.outputs.TokenClassifierOutput` + Returns `modelscope.outputs.AttentionTokenClassificationModelOutput` Examples: >>> from modelscope.models import Model @@ -212,14 +212,25 @@ class BertForTokenClassification(BertPreTrainedModel): loss = loss_fct( logits.view(-1, self.num_labels), labels.view(-1)) + if label_mask is not None: + mask = label_mask + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits + if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, offset_mapping=offset_mapping, + label_mask=label_mask, ) diff --git a/modelscope/models/nlp/deberta_v2/backbone.py b/modelscope/models/nlp/deberta_v2/backbone.py index cca38133..0daa8c7d 100644 --- a/modelscope/models/nlp/deberta_v2/backbone.py +++ b/modelscope/models/nlp/deberta_v2/backbone.py @@ -22,7 +22,6 @@ import torch.utils.checkpoint from torch import nn from torch.nn import LayerNorm from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import PreTrainedModel from transformers.pytorch_utils import softmax_backward_data @@ -574,7 +573,7 @@ class DebertaV2Encoder(nn.Module): return tuple( v for v in [output_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( + return AttentionBackboneModelOutput( last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions) diff --git a/modelscope/models/nlp/deberta_v2/fill_mask.py b/modelscope/models/nlp/deberta_v2/fill_mask.py index ed127d4c..e8adf1b5 100644 --- a/modelscope/models/nlp/deberta_v2/fill_mask.py +++ b/modelscope/models/nlp/deberta_v2/fill_mask.py @@ -44,7 +44,7 @@ class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): Preprocessor: This is the fill_mask model of Deberta_v2, the preprocessor of 
this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. Parameters: config (`DebertaV2Config`): Model configuration class with all the parameters of the model. diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py index 45ab6621..c3fef28a 100644 --- a/modelscope/models/nlp/palm_v2/__init__.py +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -18,18 +18,16 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .configuration import PalmConfig - from .backbone import ( + from .text_generation import ( AbsSummarizer, - PalmForConditionalGeneration, + PalmForTextGeneration, Translator, ) - from .text_generation import PalmForTextGeneration else: _import_structure = { 'configuration': ['PalmConfig'], - 'backbone': - ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], - 'text_generation': ['PalmForTextGeneration'], + 'text_generation': + ['AbsSummarizer', 'Translator', 'PalmForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/palm_v2/backbone.py b/modelscope/models/nlp/palm_v2/backbone.py deleted file mode 100644 index afee2e3f..00000000 --- a/modelscope/models/nlp/palm_v2/backbone.py +++ /dev/null @@ -1,1327 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import codecs -import copy -import math -import os -import subprocess -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union - -import addict -import json -import numpy as np -import torch -import torch.nn.functional as F -from torch import Tensor, nn -from torch.nn.init import xavier_uniform_ -from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig, - RobertaModel, RobertaTokenizer) -from transformers.activations import ACT2FN -from transformers.modeling_utils import PreTrainedModel - -from modelscope.utils import logger as logging -from .configuration import PalmConfig -from .dureader_eval import compute_bleu_rouge, normalize - -CONFIG_NAME = 'config.json' -WEIGHTS_NAME = 'pytorch_model.bin' - - -class MultiHeadedAttention(nn.Module): # SelfAttention - """ - Multi-Head Attention module from - "Attention is All You Need" - :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. - - Similar to standard `dot` attention but uses - multiple attention distributions simulataneously - to select relevant items. - - .. mermaid:: - - graph BT - A[key] - B[value] - C[query] - O[output] - subgraph Attn - D[Attn 1] - E[Attn 2] - F[Attn N] - end - A --> D - C --> D - A --> E - C --> E - A --> F - C --> F - D --> O - E --> O - F --> O - B --> O - - Also includes several additional tricks. 
- - Args: - head_count (int): number of parallel heads - model_dim (int): the dimension of keys/values/queries, - must be divisible by head_count - dropout (float): dropout parameter - """ - - def __init__(self, - head_count, - model_dim, - dropout=0.1, - use_final_linear=True): - assert model_dim % head_count == 0 - self.dim_per_head = model_dim // head_count - self.model_dim = model_dim - - super().__init__() - self.head_count = head_count - - self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) - self.linear_values = nn.Linear(model_dim, - head_count * self.dim_per_head) - self.linear_query = nn.Linear(model_dim, - head_count * self.dim_per_head) - self.softmax = nn.Softmax(dim=-1) - self.dropout = nn.Dropout(dropout) - self.use_final_linear = use_final_linear - if (self.use_final_linear): - self.final_linear = nn.Linear(model_dim, model_dim) - - def forward(self, - key, - value, - query, - mask=None, - layer_cache=None, - type=None, - predefined_graph_1=None, - return_attn=False): - """ - Compute the context vector and the attention vectors. - - Args: - key (`FloatTensor`): set of `key_len` - key vectors `[batch, key_len, dim]` - value (`FloatTensor`): set of `key_len` - value vectors `[batch, key_len, dim]` - query (`FloatTensor`): set of `query_len` - query vectors `[batch, query_len, dim]` - mask: binary mask indicating which keys have - non-zero attention `[batch, query_len, key_len]` - Returns: - (`FloatTensor`, `FloatTensor`) : - - * output context vectors `[batch, query_len, dim]` - * one of the attention vectors `[batch, query_len, key_len]` - """ - - batch_size = key.size(0) - dim_per_head = self.dim_per_head - head_count = self.head_count - - def shape(x): - """ projection """ - return x.view(batch_size, -1, head_count, dim_per_head) \ - .transpose(1, 2) - - def unshape(x): - """ compute context """ - return x.transpose(1, 2).contiguous() \ - .view(batch_size, -1, head_count * dim_per_head) - - # 1) Project key, value, and query. - if layer_cache is not None: - if type == 'self': - query, key, value = self.linear_query(query), self.linear_keys( - query), self.linear_values(query) - - key = shape(key) - value = shape(value) - - device = key.device - if layer_cache['self_keys'] is not None: - key = torch.cat((layer_cache['self_keys'].to(device), key), - dim=2) - if layer_cache['self_values'] is not None: - value = torch.cat( - (layer_cache['self_values'].to(device), value), dim=2) - layer_cache['self_keys'] = key - layer_cache['self_values'] = value - elif type == 'context': - query = self.linear_query(query) - if layer_cache['memory_keys'] is None: - key, value = self.linear_keys(key), self.linear_values( - value) - key = shape(key) - value = shape(value) - else: - key, value = layer_cache['memory_keys'], layer_cache[ - 'memory_values'] - layer_cache['memory_keys'] = key - layer_cache['memory_values'] = value - else: - key = self.linear_keys(key) - value = self.linear_values(value) - query = self.linear_query(query) - key = shape(key) - value = shape(value) - - query = shape(query) - - # 2) Calculate and scale scores. - query = query / math.sqrt(dim_per_head) - scores = torch.matmul(query, key.transpose(2, 3)) - - if mask is not None: - mask = mask.unsqueeze(1).expand_as(scores) - scores = scores.masked_fill(mask, -1e18) - - # 3) Apply attention dropout and compute context vectors. 
- - attn = self.softmax(scores) - - if predefined_graph_1 is not None: - attn_masked = attn[:, -1] * predefined_graph_1 - attn_masked = attn_masked / ( - torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) - - attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1) - - drop_attn = self.dropout(attn) - if self.use_final_linear: - context = unshape(torch.matmul(drop_attn, value)) - output = self.final_linear(context) - if return_attn: - return output, attn - else: - return output - else: - context = torch.matmul(drop_attn, value) - if return_attn: - return context, attn - else: - return context - - -class PositionwiseFeedForward(nn.Module): # Output - """ A two-layer Feed-Forward-Network with residual layer norm. - - Args: - d_model (int): the size of input for the first-layer of the FFN. - d_ff (int): the hidden layer size of the second-layer - of the FNN. - dropout (float): dropout probability in :math:`[0, 1)`. - """ - - def __init__(self, d_model, d_ff, dropout=0.1): - super().__init__() - self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) - self.w_1 = nn.Linear(d_model, d_ff) - self.actv = ACT2FN['gelu_new'] - self.dropout_1 = nn.Dropout(dropout) - self.w_2 = nn.Linear(d_ff, d_model) - self.dropout_2 = nn.Dropout(dropout) - - def forward(self, x): - inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) - output = self.dropout_2(self.w_2(inter)) - return output + x - - -class TransformerDecoderLayer(nn.Module): # Layer - """ - Args: - d_model (int): the dimension of keys/values/queries in - MultiHeadedAttention, also the input size of - the first-layer of the PositionwiseFeedForward. - heads (int): the number of heads for MultiHeadedAttention. - d_ff (int): the second-layer of the PositionwiseFeedForward. - dropout (float): dropout probability(0-1.0). - self_attn_type (string): type of self-attention scaled-dot, average - """ - MAX_SIZE = 5000 - - def __init__(self, d_model, heads, d_ff, dropout): - super().__init__() - - self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) - - self.context_attn = MultiHeadedAttention( - heads, d_model, dropout=dropout) - self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) - self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) - self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) - self.drop = nn.Dropout(dropout) - mask = self._get_attn_subsequent_mask(self.MAX_SIZE) - # Register self.mask as a buffer in TransformerDecoderLayer, so - # it gets TransformerDecoderLayer's cuda behavior automatically. 
- self.register_buffer('mask', mask) - - def forward(self, - inputs, - memory_bank, - src_pad_mask, - tgt_pad_mask, - previous_input=None, - layer_cache=None, - step=None): - """ - Args: - inputs (`FloatTensor`): `[batch_size x 1 x model_dim]` - memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]` - src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]` - tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]` - - Returns: - (`FloatTensor`, `FloatTensor`, `FloatTensor`): - - * output `[batch_size x 1 x model_dim]` - * attn `[batch_size x 1 x src_len]` - * all_input `[batch_size x current_step x model_dim]` - - """ - dec_mask = torch.gt( - tgt_pad_mask.type(torch.uint8) - + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type( - torch.uint8), 0) - input_norm = self.layer_norm_1(inputs) - all_input = input_norm - if previous_input is not None: - all_input = torch.cat((previous_input, input_norm), dim=1) - dec_mask = None - - query = self.self_attn( - all_input, - all_input, - input_norm, - mask=dec_mask, - layer_cache=layer_cache, - type='self') - - query = self.drop(query) + inputs - - query_norm = self.layer_norm_2(query) - mid, attn = self.context_attn( - memory_bank, - memory_bank, - query_norm, - mask=src_pad_mask, - layer_cache=layer_cache, - type='context', - return_attn=True) - output = self.feed_forward(self.drop(mid) + query) - - return output, attn, all_input - - def _get_attn_subsequent_mask(self, size): - """ - Get an attention mask to avoid using the subsequent info. - - Args: - size: int - - Returns: - (`LongTensor`): - - * subsequent_mask `[1 x size x size]` - """ - attn_shape = (1, size, size) - subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') - subsequent_mask = torch.from_numpy(subsequent_mask) - return subsequent_mask - - -class PositionalEncoding(nn.Module): - - def __init__(self, dropout, dim, max_len=5000): - super().__init__() - pe = torch.zeros(max_len, dim) - position = torch.arange(0, max_len).unsqueeze(1) - div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) - * -(math.log(10000.0) / dim))) - pe[:, 0::2] = torch.sin(position.float() * div_term) - pe[:, 1::2] = torch.cos(position.float() * div_term) - pe = pe.unsqueeze(0) - self.register_buffer('pe', pe) - self.dropout = nn.Dropout(dropout) - self.dim = dim - - def forward(self, emb, step=None): - emb = emb * math.sqrt(self.dim) - if (step): - emb = emb + self.pe[:, step][:, None, :] - - else: - emb = emb + self.pe[:, :emb.size(1)] - emb = self.dropout(emb) - return emb - - def get_emb(self, emb): - return self.pe[:, :emb.size(1)] - - -class TransformerDecoderState: - - def __init__(self, src: Tensor, cache_num_layers: int = -1): - self.src: Tensor = src - self.previous_input: Tensor = None - self.previous_layer_inputs: Tensor = None - self.cache: Optional[Dict[str, Any]] = None - if cache_num_layers != -1: - self._init_cache(cache_num_layers) - - def update_state(self, new_input, previous_layer_inputs): - self.previous_input = new_input - self.previous_layer_inputs = previous_layer_inputs - self.cache = None - - def _init_cache(self, num_layers): - self.cache = {} - for num in range(num_layers): - layer_cache = {'memory_keys': None, 'memory_values': None} - layer_cache['self_keys'] = None - layer_cache['self_values'] = None - self.cache['layer_{}'.format(num)] = layer_cache - - def map_batch_fn(self, fn): - - def _recursive_map(struct, batch_dim=0): - for k, v in struct.items(): - if v is not None: - if isinstance(v, dict): - _recursive_map(v) - else: - 
struct[k] = fn(v, batch_dim) - - self.src = fn(self.src, 0) - if self.cache is not None: - _recursive_map(self.cache) - - -class TransformerDecoder(nn.Module): # Decoder - """ - The Transformer decoder from "Attention is All You Need". - - - .. mermaid:: - - graph BT - A[input] - B[multi-head self-attn] - BB[multi-head src-attn] - C[feed forward] - O[output] - A --> B - B --> BB - BB --> C - C --> O - - - Args: - num_layers (int): number of encoder layers. - d_model (int): size of the model - heads (int): number of heads - d_ff (int): size of the inner FF layer - dropout (float): dropout parameters - embeddings (:obj:`onmt.modules.Embeddings`): - embeddings to use, should have positional encodings - attn_type (str): if using a seperate copy attention - """ - decoder_type = 'transformer' - - def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings): - super().__init__() - - # Basic attributes. - self.num_layers = num_layers - self.embeddings = embeddings - self.pos_emb = PositionalEncoding(dropout, - self.embeddings.embedding_dim) - - # Build TransformerDecoder. - self.transformer_layers = nn.ModuleList([ - TransformerDecoderLayer(d_model, heads, d_ff, dropout) - for _ in range(num_layers) - ]) - self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) - self.state = None - - def forward(self, - state: TransformerDecoderState, - tgt: Tensor, - memory_bank: Tensor, - step: int = None, - memory_masks: Tensor = None): - src_words = state.src - tgt_words = tgt - src_batch, src_len = src_words.size() - tgt_batch, tgt_len = tgt_words.size() - - # Run the forward pass of the TransformerDecoder. - # emb = self.embeddings(tgt, step=step) - emb = self.embeddings(tgt) - assert emb.dim() == 3 # len x batch x embedding_dim - output = self.pos_emb(emb, step) - - src_memory_bank = memory_bank - padding_idx = self.embeddings.padding_idx - tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \ - .expand(tgt_batch, tgt_len, tgt_len) - - if memory_masks is not None: - src_len = memory_masks.size(-1) - src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len) - else: - src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \ - .expand(src_batch, tgt_len, src_len) - - if state.cache is None: - saved_inputs = [] - attns = [] - for i in range(self.num_layers): - prev_layer_input = None - if state.cache is None: - if state.previous_input is not None: - prev_layer_input = state.previous_layer_inputs[i] - output, attn, all_input \ - = self.transformer_layers[i]( - output, src_memory_bank, - src_pad_mask, tgt_pad_mask, - previous_input=prev_layer_input, - layer_cache=state.cache['layer_{}'.format(i)] - if state.cache is not None else None, - step=step) - if state.cache is None: - saved_inputs.append(all_input) - attns.append(attn) - - if state.cache is None: - saved_inputs = torch.stack(saved_inputs) - - output = self.layer_norm(output) - - # Process the result and update the attentions. - if state.cache is None: - state.update_state(tgt, saved_inputs) - - return output, attns, state - - -class PalmPointerGenerator(nn.Module): - - def __init__(self, hidden_size, vocab_size): - super().__init__() - self.dense = nn.Linear(hidden_size, vocab_size) - self.gen_func = nn.LogSoftmax(-1) - - def forward(self, x): - x = self.dense(x) - x = self.gen_func(x) - return x - - -class PalmPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = PalmConfig - base_model_prefix = 'palm' - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Optional[Union[str, - os.PathLike]], - **kwargs): - config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) - config = PalmConfig.from_json_file(config_file) if os.path.isfile( - config_file) else PalmConfig() - config.encoder_pth = os.path.join(pretrained_model_name_or_path, - config.encoder_pth) - checkpoint_file = os.path.join(pretrained_model_name_or_path, - WEIGHTS_NAME) - checkpoint = torch.load(checkpoint_file) if os.path.isfile( - checkpoint_file) else None - return cls(config, checkpoint, **kwargs) - - -class AbsSummarizer(PalmPreTrainedModel): # Model - - def __init__(self, config, checkpoint=None): - super().__init__(config) - self.config = config - if config.encoder == 'bert' or config.encoder == 'zh_bert': - self.bert = BertModel( - BertConfig.from_pretrained(config.encoder_pth)) - elif config.encoder == 'roberta': - self.bert = RobertaModel( - RobertaConfig.from_pretrained(config.encoder_pth)) - - if (config.max_pos > 512): - my_pos_embeddings = nn.Embedding( - config.max_pos, self.bert.model.config.hidden_size) - my_pos_embeddings.weight.data[: - 512] = self.bert.embeddings.position_embeddings.weight.data - my_pos_embeddings.weight.data[ - 512:] = self.bert.embeddings.position_embeddings.weight.data[ - -1][None, :].repeat(config.max_pos - 512, 1) - self.bert.model.embeddings.position_embeddings = my_pos_embeddings - self.vocab_size = self.bert.config.vocab_size - tgt_embeddings = nn.Embedding( - self.vocab_size, - self.bert.config.hidden_size, - padding_idx=1 if config.encoder == 'roberta' else 0) - - if config.share_emb: - tgt_embeddings.weight = copy.deepcopy( - self.bert.model.embeddings.word_embeddings.weight) - self.decoder = TransformerDecoder( - config.dec_layers, - config.dec_hidden_size, - heads=config.dec_heads, - d_ff=config.dec_ff_size, - dropout=config.dec_dropout, - embeddings=tgt_embeddings) - self.generator = PalmPointerGenerator(config.dec_hidden_size, - self.vocab_size) - self.generator.dense.weight = self.decoder.embeddings.weight - - if checkpoint is not None: - if 'model' in checkpoint: - checkpoint = checkpoint['model'] - for key in list(checkpoint.keys()): - checkpoint[key.replace('model.palm.', '')] = checkpoint[key] - self.load_state_dict(checkpoint, strict=False) - else: - for module in self.decoder.modules(): - if isinstance(module, (nn.Linear, nn.Embedding)): - module.weight.data.normal_(mean=0.0, std=0.02) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - for p in self.generator.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - else: - p.data.zero_() - if config.use_bert_emb: - if config.encoder == 'roberta': - tgt_embeddings = nn.Embedding( - self.vocab_size, - self.bert.config.hidden_size, - padding_idx=1) - else: - tgt_embeddings = nn.Embedding( - self.vocab_size, - self.bert.config.hidden_size, - padding_idx=0) - tgt_embeddings.weight = copy.deepcopy( - self.bert.embeddings.word_embeddings.weight) - self.decoder.embeddings = tgt_embeddings - self.generator.dense.weight = self.decoder.embeddings.weight - - def forward(self, src, tgt, mask_src): - top_vec, _ = self.bert(src, mask_src, return_dict=False) - state = TransformerDecoderState(src) - decoder_outputs, attns, _ = self.decoder(state, tgt[:, :-1], top_vec) - return decoder_outputs, 
attns[-1], top_vec - - -class LabelSmoothingLoss(nn.Module): - """ - With label smoothing, - KL-divergence between q_{smoothed ground truth prob.}(w) - and p_{prob. computed by model}(w) is minimized. - """ - - def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): - assert 0.0 < label_smoothing <= 1.0 - self.padding_idx = ignore_index - super(LabelSmoothingLoss, self).__init__() - - smoothing_value = label_smoothing / (tgt_vocab_size - 2) - one_hot = torch.full((tgt_vocab_size, ), smoothing_value) - one_hot[self.padding_idx] = 0 - self.register_buffer('one_hot', one_hot.unsqueeze(0)) - self.confidence = 1.0 - label_smoothing - - def forward(self, output, target): - """ - output (FloatTensor): batch_size x n_classes - target (LongTensor): batch_size - """ - model_prob = self.one_hot.repeat(target.size(0), 1) - model_prob.scatter_(1, target.unsqueeze(1), self.confidence) - model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0) - - return F.kl_div(output, model_prob, reduction='sum') - - -class NMTLossCompute(nn.Module): - """ - Standard NMT Loss Computation. - """ - - def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0): - super().__init__() - self.generator = generator - self.padding_idx = symbols['PAD'] - if label_smoothing > 0: - self.criterion = LabelSmoothingLoss( - label_smoothing, vocab_size, ignore_index=self.padding_idx) - else: - self.criterion = nn.NLLLoss( - ignore_index=self.padding_idx, reduction='sum') - - def _bottle(self, _v): - return _v.view(-1, _v.size(2)) - - def _unbottle(self, _v, batch_size): - return _v.view(-1, batch_size, _v.size(1)) - - def forward(self, tgt, output): - target = tgt[:, 1:] - normalization = target.ne(self.padding_idx).sum() - bottled_output = self._bottle(output) - scores = self.generator(bottled_output) - gtruth = target.contiguous().view(-1) - loss = self.criterion(scores, gtruth) - loss.div(float(normalization)) - return loss - - -class PalmForConditionalGeneration(PalmPreTrainedModel): - - def __init__(self, config, checkpoint=None): - super().__init__(config) - self.config = config - if config.encoder == 'roberta': - tokenizer = RobertaTokenizer.from_pretrained( - config.encoder_pth, do_lower_case=False) - symbols = { - 'BOS': tokenizer.cls_token_id, - 'EOS': tokenizer.sep_token_id, - 'PAD': tokenizer.pad_token_id, - 'EOQ': tokenizer.unk_token_id - } - elif config.encoder == 'bert' or config.encoder == 'zh_bert': - tokenizer = BertTokenizer.from_pretrained( - config.encoder_pth, do_lower_case=True) - symbols = { - 'BOS': tokenizer.vocab['[CLS]'], - 'EOS': tokenizer.vocab['[SEP]'], - 'PAD': tokenizer.vocab['[PAD]'], - 'EOQ': tokenizer.vocab['[unused2]'] - } - self.tokenizer = tokenizer - self.symbols = symbols - self.palm = AbsSummarizer(config, checkpoint) - self.loss = NMTLossCompute(self.palm.generator, symbols, - self.palm.vocab_size, - config.label_smoothing) - - def forward(self, input_ids, attention_mask, labels): - output = self.palm( - src=input_ids, tgt=labels, mask_src=attention_mask)[0] - loss = self.loss(labels, output) - return addict.Dict(loss=loss) - - -class Translator(object): - """ - Uses a model to translate a batch of sentences. 
- """ - - @dataclass - class Batch: - batch_size: int - src: torch.Tensor - tgt: torch.Tensor - mask_src: torch.Tensor - query_id: List[None] = None - src_str: List[List[str]] = None - tgt_str: List[str] = None - - def __init__(self, - model: PalmForConditionalGeneration, - dataset: str = 'cnn'): - super().__init__() - self.logger = logging.get_logger(__name__) - self.args = model.config - self.args.dataset = dataset - self.model = model.palm - self.generator = self.model.generator - self.vocab = model.tokenizer - self.symbols = model.symbols - self.start_token = self.symbols['BOS'] - self.end_token = self.symbols['EOS'] - self.alpha = self.args.alpha - self.beam_size = self.args.beam_size - self.min_length = self.args.min_length - self.max_length = self.args.max_length - - def from_batch(self, translation_batch): - batch = translation_batch['batch'] - assert (len(translation_batch['gold_score']) == len( - translation_batch['predictions'])) - batch_size = batch.batch_size - - preds, pred_score, tgt_str, src, src_str = translation_batch[ - 'predictions'], translation_batch[ - 'scores'], batch.tgt_str, batch.src, batch.src_str - query_id = batch.query_id - ''' - try: - query_id = batch.query_id - except: - query_id = None - ''' - translations = [] - for b in range(batch_size): - if self.args.dataset == 'qg_ranking_test': - if self.args.encoder == 'bert' or self.args.encoder == 'zh_bert': - pred_sents = [ - ' '.join( - self.vocab.convert_ids_to_tokens( - [int(n) for n in each])).replace(' ##', '') - for each in preds[b] - ] - elif self.args.encoder == 'roberta': - pred_sents = [ - self.vocab.decode([int(n) for n in each - ]).replace('', - '').replace('', '') - for each in preds[b] - ] - elif self.args.encoder == 'roberta': - pred_sents = self.vocab.decode([int(n) - for n in preds[b][0]]).replace( - '', - '').replace('', '') - elif self.args.encoder == 'bert': - pred_sents = self.vocab.convert_ids_to_tokens( - [int(n) for n in preds[b][0]]) - pred_sents = ' '.join(pred_sents).replace(' ##', '') - elif self.args.encoder == 'zh_bert' and self.args.dataset == 'paraphrase': - pred_sents = [ - self.vocab.convert_ids_to_tokens([int(n) for n in pred]) - for pred in preds[b] - ] - pred_sents = [ - ''.join(pred).replace(' ##', '') for pred in pred_sents - ] - elif self.args.encoder == 'zh_bert': - pred_sents = self.vocab.convert_ids_to_tokens( - [int(n) for n in preds[b][0]]) - pred_sents = ''.join(pred_sents).replace('##', '') - gold_sent = tgt_str[b] - - if self.args.encoder == 'roberta': - raw_src = self.vocab.decode([int(t) for t in src[b]]) - raw_src = ' '.join(src_str[b]) - else: - raw_src = [self.vocab.ids_to_tokens[int(t)] - for t in src[b]][:500] - raw_src = ' '.join(raw_src) - if self.args.dataset == 'faq': - translation = (pred_sents, gold_sent, src_str[b], query_id[b], - pred_score[b]) - else: - translation = (pred_sents, gold_sent, raw_src, query_id[b], - pred_score[b]) - # translation = (pred_sents[0], gold_sent) - translations.append(translation) - - return translations - - def translate(self, data_iter, step): - gold_path = self.args.result_path + '.%d.gold' % step - can_path = self.args.result_path + '.%d.candidate' % step - self.gold_out_file = codecs.open(gold_path, 'w', 'utf-8') - self.can_out_file = codecs.open(can_path, 'w', 'utf-8') - self.pred_json_score_out_file = codecs.open(can_path + '.sample', 'w', - 'utf-8') - if self.args.dataset == 'paraphrase' and self.args.encoder == 'roberta': - out = '\t'.join([ - 'query_id', 'source_query', 'target_query', 'predict_query' - ]) + 
'\n' - self.pred_json_score_out_file.write(out) - - raw_src_path = self.args.result_path + '.%d.raw_src' % step - self.src_out_file = codecs.open(raw_src_path, 'w', 'utf-8') - - pred_results, gold_results = [], [] - cnt = 0 - pred_dict, ref_dict = {}, {} - for i, batch in enumerate(data_iter): - self.logger.info(f'data: {i + 1} / {len(data_iter)}') - batch_data = self.translate_batch(batch) - translations = self.from_batch(batch_data) - - for trans in translations: - pred, gold, src, query_id, pred_score = trans - src = src.replace('', '').replace('##', '').strip() - if self.args.dataset == 'qg_ranking_test': - pred_str = '\t'.join([ - each.replace('[unused0]', '').replace( - '[PAD]', '').replace('[unused1]', '').replace( - r' +', ' ').replace('[SEP]', '').replace( - '[unused2]', - '').replace(r' +', ' ').replace( - '', - '').replace('', '').replace( - '', - '').replace('', '').replace( - '', ' ').strip() - for each in pred - ]) - else: - pred_str = pred.replace('[unused0]', '').replace( - '[PAD]', '').replace('[unused1]', '').replace( - r' +', ' ').replace('[SEP]', '').replace( - '[unused2]', '').replace('[CLS]', '').replace( - '[SEP]', '').replace('[UNK]', '').strip() - pred_str = pred_str.replace(r' +', ' ').replace( - '', - '').replace('', '').replace('', '').replace( - '', '').replace('', ' ').strip() - gold_str = gold.replace('', '').strip().replace( - '[UNK]', '').replace('[unused1]', '').replace( - '[unused2]', - '').replace('##', '').replace('[CLS]', '').replace( - '[SEP]', '').strip().replace('', '').replace( - '', '').replace('', ' ').strip() - if (self.args.recall_eval): - _pred_str = '' - for sent in pred_str.split(''): - can_pred_str = _pred_str + '' + sent.strip() - if len(can_pred_str.split()) >= len( - gold_str.split()) + 10: - pred_str = _pred_str - break - else: - _pred_str = can_pred_str - - if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': - pred_str = pred_str.replace('', ' ') - if query_id is not None: - pred_json = { - 'query_id': query_id, - 'answers': [pred_str] - } - gold_json = { - 'query_id': query_id, - 'answers': [gold_str] - } - pred_json_score = { - 'query_id': query_id, - 'answers': [pred_str], - 'scores': pred_score[0].cpu().numpy().tolist() - } - else: - pred_json = {'query_id': cnt, 'answers': [pred_str]} - gold_json = {'query_id': cnt, 'answers': [gold_str]} - pred_json_score = { - 'query_id': cnt, - 'answers': [pred_str], - 'scores': pred_score[0].cpu().numpy().tolist() - } - json.dump(pred_json, self.can_out_file) - self.can_out_file.write('\n') - json.dump(gold_json, self.gold_out_file) - self.gold_out_file.write('\n') - json.dump(pred_json_score, self.pred_json_score_out_file) - self.pred_json_score_out_file.write('\n') - self.src_out_file.write(src.strip() + '\n') - elif self.args.dataset == 'cnn': - self.can_out_file.write(pred_str + '\n') - self.gold_out_file.write(gold_str + '\n') - self.src_out_file.write(src.strip() + '\n') - elif self.args.dataset == 'dureader': - if query_id is None: - query_id = str(cnt) - pred_results.extend(normalize([pred_str])) - gold_results.extend(normalize([gold_str])) - self.can_out_file.write(pred_str + '\n') - self.gold_out_file.write('\t'.join([src[0], gold_str]) - + '\n') - - elif self.args.dataset == 'paraphrase': - if query_id is None: - query_id = str(cnt) - if self.args.encoder == 'roberta': - pred_str = [pred_str] - pred_dict[query_id] = normalize([pred_str[0]]) - ref_dict[query_id] = normalize([gold_str]) - self.pred_json_score_out_file.write( - 
'\t'.join([str(query_id), src, gold_str, pred_str[0]]) - + '\n') - elif self.args.dataset == 'faq': - if pred_score[0].cpu().numpy().tolist() < -3.5: - continue - self.can_out_file.write( - '\t'.join([str(query_id), src, pred_str]) + '\n') - self.gold_out_file.write( - '\t'.join([str(query_id), src, gold_str]) + '\n') - # passage, answer, question, score - self.pred_json_score_out_file.write('\t'.join([ - str(query_id), gold_str, src, pred_str, - str(pred_score[0].cpu().numpy().tolist()) - ]) + '\n') - elif self.args.dataset == 'qg_ranking_test': - self.can_out_file.write( - str(query_id) + '\t' + pred_str + '\n') - - cnt += 1 - self.can_out_file.flush() - self.gold_out_file.flush() - self.src_out_file.flush() - self.logger.info('cnt: %s' % cnt) - self.can_out_file.close() - self.gold_out_file.close() - self.src_out_file.close() - - if (step != -1): - if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': - cnn_results = subprocess.getoutput( - './run.sh %s %s' % (gold_path, can_path)) # run.sh ... - self.logger.info(cnn_results) - elif self.args.dataset == 'cnn': - self.logger.info('Calculating Rouge') - from rouge import Rouge - candidates = [ - line.strip() for line in open(can_path, encoding='utf-8') - ] - references = [ - line.strip() for line in open(gold_path, encoding='utf-8') - ] - rouge_score = Rouge().get_scores( - candidates, references, avg=True) - # self.logger.info('Rouges at step %d \n%s' % (step, rouge_results_to_str(rouges))) - print(rouge_score) - elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': - - def postprocess_text(preds, labels): - preds = [pred.strip().replace('.', '') for pred in preds] - labels = [label.strip() for label in labels] - while '' in preds: - idx = preds.index('') - preds[idx] = '。' - return preds, labels - - pred_results, gold_results = postprocess_text( - pred_results, gold_results) - pred_dict = {str(i): tmp for i, tmp in enumerate(pred_results)} - gold_dict = {str(i): tmp for i, tmp in enumerate(gold_results)} - bleu_rouge = compute_bleu_rouge(pred_dict, gold_dict) - print(bleu_rouge) - # unreachable - elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': - pred_results, gold_results = postprocess_text( - pred_results, gold_results) - bleu_score = cal_bleu(pred_results, gold_results) - from rouge import Rouge - rouge = Rouge() - rouge_score = rouge.get_scores( - pred_results, gold_results, avg=True) - print("'Dev eval result: Bleu-4={}, {}".format( - bleu_score, rouge_score)) - - def translate_batch(self, batch: 'Batch', fast: bool = False): - """ - Translate a batch of sentences. - - Mostly a wrapper around :obj:`Beam`. - - Args: - batch (:obj:`Batch`): a batch from a dataset object - data (:obj:`Dataset`): the dataset object - fast (bool): enables fast beam search (may not support all features) - - Todo: - Shouldn't need the original dataset. 
- """ - self.model.eval() - with torch.no_grad(): - return self._fast_translate_batch( - batch, self.max_length, min_length=self.min_length) - - def _tile(self, x, count, dim=0): - perm = list(range(len(x.size()))) - if dim != 0: - perm[0], perm[dim] = perm[dim], perm[0] - x = x.permute(perm).contiguous() - out_size = list(x.size()) - out_size[0] *= count - batch = x.size(0) - x = x.view(batch, -1) \ - .transpose(0, 1) \ - .repeat(count, 1) \ - .transpose(0, 1) \ - .contiguous() \ - .view(*out_size) - if dim != 0: - x = x.permute(perm).contiguous() - return x - - def _top_k_top_p_filtering(self, - logits, - top_k=10, - top_p=1.0, - filter_value=-float('Inf'), - min_tokens_to_keep=1): - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), - logits.size(-1)) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, - None] - logits[indices_to_remove] = filter_value - - if top_p < 1.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum( - F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ - ..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter( - 1, sorted_indices, sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits - - def _fast_translate_batch(self, - batch: 'Batch', - max_length: int, - min_length: int = 0): - # TODO: faster code path for beam_size == 1. - # TODO: support these blacklisted features. - - beam_size = self.beam_size - batch_size = batch.batch_size - src = batch.src - mask_src = batch.mask_src - - src_features, _ = self.model.bert(src, mask_src, return_dict=False) - state = TransformerDecoderState(src, self.model.decoder.num_layers) - device = src_features.device - - # Tile states and memory beam_size times. - state.map_batch_fn( - lambda state, dim: self._tile(state, beam_size, dim=dim)) - src_features = self._tile(src_features, beam_size, dim=0) - batch_offset = torch.arange( - batch_size, dtype=torch.long, device=device) - beam_offset = torch.arange( - 0, - batch_size * beam_size, - step=beam_size, - dtype=torch.long, - device=device) - alive_seq = torch.full([batch_size * beam_size, 1], - self.start_token, - dtype=torch.long, - device=device) - - # Give full probability to the first beam on the first step. - topk_log_probs = ( - torch.tensor( - [0.0] + [float('-inf')] * (beam_size - 1), - device=device).repeat(batch_size)) - - # Structure that holds finished hypotheses. - hypotheses = [[] for _ in range(batch_size)] # noqa: F812 - - results = {} - results['predictions'] = [[] for _ in range(batch_size)] # noqa: F812 - results['scores'] = [[] for _ in range(batch_size)] # noqa: F812 - results['gold_score'] = [0] * batch_size - results['batch'] = batch - - for step in range(max_length): - decoder_input = alive_seq[:, -1].view(1, -1) - - # Decoder forward. 
- decoder_input = decoder_input.transpose(0, 1) - dec_out, attns, state = self.model.decoder( - state, decoder_input, src_features, step=step) - - # Generator forward. - log_probs = self.generator.forward( - dec_out.transpose(0, 1).squeeze(0)) - vocab_size = log_probs.size(-1) - - if step < min_length: - log_probs[:, self.end_token] = -1e20 - - # Multiply probs by the beam probability. - - length_penalty = ((5.0 + (step + 1)) / 6.0)**self.alpha - if self.args.sample_topk: - temperature = self.args.temperature - _scores = log_probs / temperature - _scores = self._top_k_top_p_filtering( - _scores, - top_k=self.args.top_k, - top_p=self.args.top_p, - min_tokens_to_keep=1 - ) # (batch_size * num_beams, vocab_size) - # Sample 2 next words for each beam (so we have some spare tokens - # and match output of greedy beam search) - topk_ids = torch.multinomial( - F.softmax(_scores, dim=-1), - num_samples=1) # (batch_size * num_beams, 2) - # Compute next scores - _scores = F.log_softmax( - _scores, dim=1) # (batch_size * num_beams, vocab_size) - - _scores += topk_log_probs.view(-1).unsqueeze(1) - _scores = _scores / length_penalty - topk_scores = torch.gather( - _scores, -1, topk_ids) # (batch_size * num_beams, 2) - # Match shape of greedy beam search - topk_ids = topk_ids.view( - -1, beam_size) # (batch_size, 2 * num_beams) - topk_scores = topk_scores.view( - -1, beam_size) # (batch_size, 2 * num_beams) - else: - log_probs += topk_log_probs.view(-1).unsqueeze(1) - curr_scores = log_probs / length_penalty - - curr_scores = curr_scores.reshape(-1, beam_size * vocab_size) - topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1) - if self.args.block_trigram: - cur_len = alive_seq.size(1) - if cur_len > 3: - for i in range(alive_seq.size(0)): - fail = False - words = [int(w) for w in alive_seq[i]] - if self.args.encoder == 'roberta': - words = self.vocab.decode(words).strip().split() - else: - words = [ - self.vocab.ids_to_tokens[w] for w in words - ] - words = ' '.join(words).replace(' ##', '').split() - if len(words) <= 3: - continue - trigrams = [(words[i - 1], words[i], words[i + 1]) - for i in range(1, - len(words) - 1)] - trigram = tuple(trigrams[-1]) - if trigram in trigrams[:-1]: - fail = True - if fail: - curr_scores[i] = -10e20 - # Recover log probs. - topk_log_probs = topk_scores * length_penalty - - # Resolve beam origin and true word ids. - topk_beam_index = topk_ids // vocab_size - topk_ids = topk_ids.fmod(vocab_size) - - # Map beam_index to batch_index in the flat representation. - batch_index = ( - topk_beam_index - + beam_offset[:topk_beam_index.size(0)].unsqueeze(1)) - select_indices = batch_index.view(-1) - - # Append last prediction. - alive_seq = torch.cat([ - alive_seq.index_select(0, select_indices), - topk_ids.view(-1, 1) - ], -1) - - is_finished = topk_ids.eq(self.end_token) - if step + 1 == max_length: - is_finished.fill_(self.end_token) - # End condition is top beam is finished. - end_condition = is_finished[:, 0].eq(1) - # Save finished hypotheses. - if is_finished.any(): - predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1)) - for i in range(is_finished.size(0)): - b = batch_offset[i] - if end_condition[i]: - is_finished[i].fill_(self.end_token) - finished_hyp = is_finished[i].nonzero().view(-1) - # Store finished hypotheses for this batch. - for j in finished_hyp: - hypotheses[b].append( - (topk_scores[i, j], predictions[i, j, 1:])) - # If the batch reached the end, save the n_best hypotheses. 
- if end_condition[i]: - best_hyp = sorted( - hypotheses[b], key=lambda x: x[0], reverse=True) - if self.args.dataset == 'qg_ranking_test' or ( - self.args.dataset == 'paraphrase' - and not self.args.sample_topk): - for each in best_hyp[:beam_size]: - score, pred = each - results['scores'][b].append(score) - results['predictions'][b].append(pred) - else: - score, pred = best_hyp[0] - results['scores'][b].append(score) - results['predictions'][b].append(pred) - non_finished = end_condition.eq(0).nonzero().view(-1) - # If all sentences are translated, no need to go further. - if len(non_finished) == 0: - break - # Remove finished batches for the next step. - topk_log_probs = topk_log_probs.index_select(0, non_finished) - batch_index = batch_index.index_select(0, non_finished) - batch_offset = batch_offset.index_select(0, non_finished) - alive_seq = predictions.index_select(0, non_finished) \ - .view(-1, alive_seq.size(-1)) - # Reorder states. - select_indices = batch_index.view(-1) - src_features = src_features.index_select(0, select_indices) - state.map_batch_fn( - lambda state, dim: state.index_select(dim, select_indices)) - - return results - - def __call__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, - **kwargs) -> Dict[str, torch.Tensor]: - batch = self.Batch( - batch_size=input_ids.size()[0], - src=input_ids, - tgt=None, - mask_src=attention_mask) - translation_batch = self.translate_batch(batch) - - preds = translation_batch['predictions'] - return {'predictions': preds} diff --git a/modelscope/models/nlp/palm_v2/text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py index d83860db..f1c8e414 100644 --- a/modelscope/models/nlp/palm_v2/text_generation.py +++ b/modelscope/models/nlp/palm_v2/text_generation.py @@ -1,50 +1,1364 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Dict, List +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
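The `Translator` beam-search machinery removed above reappears verbatim inside the new `text_generation.py` that follows; one small piece worth calling out is the `_tile` helper, which repeats every batch row `beam_size` times so encoder states and offsets can be expanded from `batch_size` to `batch_size * beam_size`. Below is a standalone sketch of that helper, lifted out of the class for illustration (the example tensor and the `__main__` block are illustrative only and not part of the diff):

```python
import torch


def tile(x: torch.Tensor, count: int, dim: int = 0) -> torch.Tensor:
    """Repeat each slice of `x` along `dim` `count` times, keeping the copies adjacent."""
    perm = list(range(x.dim()))
    if dim != 0:
        # Move the target dim to the front, tile rows, then move it back.
        perm[0], perm[dim] = perm[dim], perm[0]
        x = x.permute(perm).contiguous()
    out_size = list(x.size())
    out_size[0] *= count
    batch = x.size(0)
    x = (x.view(batch, -1)
         .transpose(0, 1)
         .repeat(count, 1)
         .transpose(0, 1)
         .contiguous()
         .view(*out_size))
    if dim != 0:
        x = x.permute(perm).contiguous()
    return x


if __name__ == '__main__':
    x = torch.tensor([[1, 2], [3, 4]])
    print(tile(x, count=3, dim=0))
    # tensor([[1, 2], [1, 2], [1, 2], [3, 4], [3, 4], [3, 4]])
```

The copies stay adjacent (effectively `torch.repeat_interleave(x, count, dim=dim)`), which is what `_fast_translate_batch` relies on when it builds flat indices from `beam_offset` and `batch_offset`: beam `j` of batch element `i` lives at row `i * beam_size + j`.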
+ +import codecs +import copy +import math +import os +import subprocess +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import json +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.init import xavier_uniform_ +from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig, + RobertaModel, RobertaTokenizer) +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel from modelscope.metainfo import Models -from modelscope.models.base import Tensor, TorchModel +from modelscope.models import Model +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS -from modelscope.outputs import OutputKeys +from modelscope.outputs import TextGenerationModelOutput, TokenGeneratorOutput +from modelscope.utils import logger as logging from modelscope.utils.constant import Tasks +from .configuration import PalmConfig +from .dureader_eval import compute_bleu_rouge, normalize -__all__ = ['PalmForTextGeneration'] +CONFIG_NAME = 'config.json' +WEIGHTS_NAME = 'pytorch_model.bin' -@MODELS.register_module(Tasks.text_generation, module_name=Models.palm) -class PalmForTextGeneration(TorchModel): +class MultiHeadedAttention(nn.Module): # SelfAttention + """ + Multi-Head Attention module from + "Attention is All You Need" + :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. + + Similar to standard `dot` attention but uses + multiple attention distributions simulataneously + to select relevant items. + + .. mermaid:: + + graph BT + A[key] + B[value] + C[query] + O[output] + subgraph Attn + D[Attn 1] + E[Attn 2] + F[Attn N] + end + A --> D + C --> D + A --> E + C --> E + A --> F + C --> F + D --> O + E --> O + F --> O + B --> O - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. + Also includes several additional tricks. + + Args: + head_count (int): number of parallel heads + model_dim (int): the dimension of keys/values/queries, + must be divisible by head_count + dropout (float): dropout parameter + """ + + def __init__(self, + head_count, + model_dim, + dropout=0.1, + use_final_linear=True): + assert model_dim % head_count == 0 + self.dim_per_head = model_dim // head_count + self.model_dim = model_dim + + super().__init__() + self.head_count = head_count + + self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) + self.linear_values = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.linear_query = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.softmax = nn.Softmax(dim=-1) + self.dropout = nn.Dropout(dropout) + self.use_final_linear = use_final_linear + if (self.use_final_linear): + self.final_linear = nn.Linear(model_dim, model_dim) + + def forward(self, + key, + value, + query, + mask=None, + layer_cache=None, + type=None, + predefined_graph_1=None, + return_attn=False): + """ + Compute the context vector and the attention vectors. Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. 
+ key (`FloatTensor`): set of `key_len` + key vectors `[batch, key_len, dim]` + value (`FloatTensor`): set of `key_len` + value vectors `[batch, key_len, dim]` + query (`FloatTensor`): set of `query_len` + query vectors `[batch, query_len, dim]` + mask: binary mask indicating which keys have + non-zero attention `[batch, query_len, key_len]` + Returns: + (`FloatTensor`, `FloatTensor`) : + + * output context vectors `[batch, query_len, dim]` + * one of the attention vectors `[batch, query_len, key_len]` """ - super().__init__(model_dir, *args, **kwargs) - from modelscope.models.nlp.palm_v2 import ( - PalmForConditionalGeneration, Translator) - self.model = PalmForConditionalGeneration.from_pretrained(model_dir) - self.tokenizer = self.model.tokenizer - self.generator = Translator(self.model) + batch_size = key.size(0) + dim_per_head = self.dim_per_head + head_count = self.head_count + + def shape(x): + """ projection """ + return x.view(batch_size, -1, head_count, dim_per_head) \ + .transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous() \ + .view(batch_size, -1, head_count * dim_per_head) + + # 1) Project key, value, and query. + if layer_cache is not None: + if type == 'self': + query, key, value = self.linear_query(query), self.linear_keys( + query), self.linear_values(query) + + key = shape(key) + value = shape(value) + + device = key.device + if layer_cache['self_keys'] is not None: + key = torch.cat((layer_cache['self_keys'].to(device), key), + dim=2) + if layer_cache['self_values'] is not None: + value = torch.cat( + (layer_cache['self_values'].to(device), value), dim=2) + layer_cache['self_keys'] = key + layer_cache['self_values'] = value + elif type == 'context': + query = self.linear_query(query) + if layer_cache['memory_keys'] is None: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key, value = layer_cache['memory_keys'], layer_cache[ + 'memory_values'] + layer_cache['memory_keys'] = key + layer_cache['memory_values'] = value + else: + key = self.linear_keys(key) + value = self.linear_values(value) + query = self.linear_query(query) + key = shape(key) + value = shape(value) + + query = shape(query) + + # 2) Calculate and scale scores. + query = query / math.sqrt(dim_per_head) + scores = torch.matmul(query, key.transpose(2, 3)) + + if mask is not None: + mask = mask.unsqueeze(1).expand_as(scores) + scores = scores.masked_fill(mask, -1e18) + + # 3) Apply attention dropout and compute context vectors. + + attn = self.softmax(scores) + + if predefined_graph_1 is not None: + attn_masked = attn[:, -1] * predefined_graph_1 + attn_masked = attn_masked / ( + torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) + + attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1) + + drop_attn = self.dropout(attn) + if self.use_final_linear: + context = unshape(torch.matmul(drop_attn, value)) + output = self.final_linear(context) + if return_attn: + return output, attn + else: + return output + else: + context = torch.matmul(drop_attn, value) + if return_attn: + return context, attn + else: + return context + + +class PositionwiseFeedForward(nn.Module): # Output + """ A two-layer Feed-Forward-Network with residual layer norm. + + Args: + d_model (int): the size of input for the first-layer of the FFN. + d_ff (int): the hidden layer size of the second-layer + of the FNN. + dropout (float): dropout probability in :math:`[0, 1)`. 
+ """ + + def __init__(self, d_model, d_ff, dropout=0.1): + super().__init__() + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.w_1 = nn.Linear(d_model, d_ff) + self.actv = ACT2FN['gelu_new'] + self.dropout_1 = nn.Dropout(dropout) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x): + inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) + output = self.dropout_2(self.w_2(inter)) + return output + x + + +class TransformerDecoderLayer(nn.Module): # Layer + """ + Args: + d_model (int): the dimension of keys/values/queries in + MultiHeadedAttention, also the input size of + the first-layer of the PositionwiseFeedForward. + heads (int): the number of heads for MultiHeadedAttention. + d_ff (int): the second-layer of the PositionwiseFeedForward. + dropout (float): dropout probability(0-1.0). + self_attn_type (string): type of self-attention scaled-dot, average + """ + MAX_SIZE = 5000 + + def __init__(self, d_model, heads, d_ff, dropout): + super().__init__() + + self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) + + self.context_attn = MultiHeadedAttention( + heads, d_model, dropout=dropout) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) + self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) + self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) + self.drop = nn.Dropout(dropout) + mask = self._get_attn_subsequent_mask(self.MAX_SIZE) + # Register self.mask as a buffer in TransformerDecoderLayer, so + # it gets TransformerDecoderLayer's cuda behavior automatically. + self.register_buffer('mask', mask) + + def forward(self, + inputs, + memory_bank, + src_pad_mask, + tgt_pad_mask, + previous_input=None, + layer_cache=None, + step=None): + """ + Args: + inputs (`FloatTensor`): `[batch_size x 1 x model_dim]` + memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]` + src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]` + tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]` + + Returns: + (`FloatTensor`, `FloatTensor`, `FloatTensor`): + + * output `[batch_size x 1 x model_dim]` + * attn `[batch_size x 1 x src_len]` + * all_input `[batch_size x current_step x model_dim]` + + """ + dec_mask = torch.gt( + tgt_pad_mask.type(torch.uint8) + + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type( + torch.uint8), 0) + input_norm = self.layer_norm_1(inputs) + all_input = input_norm + if previous_input is not None: + all_input = torch.cat((previous_input, input_norm), dim=1) + dec_mask = None + + query = self.self_attn( + all_input, + all_input, + input_norm, + mask=dec_mask, + layer_cache=layer_cache, + type='self') + + query = self.drop(query) + inputs + + query_norm = self.layer_norm_2(query) + mid, attn = self.context_attn( + memory_bank, + memory_bank, + query_norm, + mask=src_pad_mask, + layer_cache=layer_cache, + type='context', + return_attn=True) + output = self.feed_forward(self.drop(mid) + query) + + return output, attn, all_input + + def _get_attn_subsequent_mask(self, size): + """ + Get an attention mask to avoid using the subsequent info. 
+ + Args: + size: int + + Returns: + (`LongTensor`): + + * subsequent_mask `[1 x size x size]` + """ + attn_shape = (1, size, size) + subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') + subsequent_mask = torch.from_numpy(subsequent_mask) + return subsequent_mask + + +class PositionalEncoding(nn.Module): + + def __init__(self, dropout, dim, max_len=5000): + super().__init__() + pe = torch.zeros(max_len, dim) + position = torch.arange(0, max_len).unsqueeze(1) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) + * -(math.log(10000.0) / dim))) + pe[:, 0::2] = torch.sin(position.float() * div_term) + pe[:, 1::2] = torch.cos(position.float() * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + self.dropout = nn.Dropout(dropout) + self.dim = dim + + def forward(self, emb, step=None): + emb = emb * math.sqrt(self.dim) + if (step): + emb = emb + self.pe[:, step][:, None, :] + + else: + emb = emb + self.pe[:, :emb.size(1)] + emb = self.dropout(emb) + return emb + + def get_emb(self, emb): + return self.pe[:, :emb.size(1)] + + +class TransformerDecoderState: + + def __init__(self, src: Tensor, cache_num_layers: int = -1): + self.src: Tensor = src + self.previous_input: Tensor = None + self.previous_layer_inputs: Tensor = None + self.cache: Optional[Dict[str, Any]] = None + if cache_num_layers != -1: + self._init_cache(cache_num_layers) + + def update_state(self, new_input, previous_layer_inputs): + self.previous_input = new_input + self.previous_layer_inputs = previous_layer_inputs + self.cache = None + + def _init_cache(self, num_layers): + self.cache = {} + for num in range(num_layers): + layer_cache = {'memory_keys': None, 'memory_values': None} + layer_cache['self_keys'] = None + layer_cache['self_values'] = None + self.cache['layer_{}'.format(num)] = layer_cache + + def map_batch_fn(self, fn): + + def _recursive_map(struct, batch_dim=0): + for k, v in struct.items(): + if v is not None: + if isinstance(v, dict): + _recursive_map(v) + else: + struct[k] = fn(v, batch_dim) + + self.src = fn(self.src, 0) + if self.cache is not None: + _recursive_map(self.cache) + + +class TransformerDecoder(nn.Module): # Decoder + """ + The Transformer decoder from "Attention is All You Need". + + + .. mermaid:: + + graph BT + A[input] + B[multi-head self-attn] + BB[multi-head src-attn] + C[feed forward] + O[output] + A --> B + B --> BB + BB --> C + C --> O + + + Args: + num_layers (int): number of encoder layers. + d_model (int): size of the model + heads (int): number of heads + d_ff (int): size of the inner FF layer + dropout (float): dropout parameters + embeddings (:obj:`onmt.modules.Embeddings`): + embeddings to use, should have positional encodings + attn_type (str): if using a seperate copy attention + """ + decoder_type = 'transformer' + + def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings): + super().__init__() + + # Basic attributes. + self.num_layers = num_layers + self.embeddings = embeddings + self.pos_emb = PositionalEncoding(dropout, + self.embeddings.embedding_dim) + + # Build TransformerDecoder. 
+ self.transformer_layers = nn.ModuleList([ + TransformerDecoderLayer(d_model, heads, d_ff, dropout) + for _ in range(num_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.state = None + + def forward(self, + state: TransformerDecoderState, + tgt: Tensor, + memory_bank: Tensor, + step: int = None, + memory_masks: Tensor = None): + src_words = state.src + tgt_words = tgt + src_batch, src_len = src_words.size() + tgt_batch, tgt_len = tgt_words.size() + + # Run the forward pass of the TransformerDecoder. + # emb = self.embeddings(tgt, step=step) + emb = self.embeddings(tgt) + assert emb.dim() == 3 # len x batch x embedding_dim + output = self.pos_emb(emb, step) + + src_memory_bank = memory_bank + padding_idx = self.embeddings.padding_idx + tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(tgt_batch, tgt_len, tgt_len) + + if memory_masks is not None: + src_len = memory_masks.size(-1) + src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len) + else: + src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(src_batch, tgt_len, src_len) + + if state.cache is None: + saved_inputs = [] + attns = [] + for i in range(self.num_layers): + prev_layer_input = None + if state.cache is None: + if state.previous_input is not None: + prev_layer_input = state.previous_layer_inputs[i] + output, attn, all_input \ + = self.transformer_layers[i]( + output, src_memory_bank, + src_pad_mask, tgt_pad_mask, + previous_input=prev_layer_input, + layer_cache=state.cache['layer_{}'.format(i)] + if state.cache is not None else None, + step=step) + if state.cache is None: + saved_inputs.append(all_input) + attns.append(attn) + + if state.cache is None: + saved_inputs = torch.stack(saved_inputs) + + output = self.layer_norm(output) + + # Process the result and update the attentions. + if state.cache is None: + state.update_state(tgt, saved_inputs) - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - """return the result by the model + return output, attns, state + + +class PalmPointerGenerator(nn.Module): + + def __init__(self, hidden_size, vocab_size): + super().__init__() + self.dense = nn.Linear(hidden_size, vocab_size) + self.gen_func = nn.LogSoftmax(-1) + + def forward(self, x): + x = self.dense(x) + x = self.gen_func(x) + return x + + +class PalmPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = PalmConfig + base_model_prefix = 'palm' + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + @classmethod + def _from_pretrained( + cls, pretrained_model_name_or_path: Optional[Union[str, + os.PathLike]], + **kwargs): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + config = PalmConfig.from_json_file(config_file) if os.path.isfile( + config_file) else PalmConfig() + config.encoder_pth = os.path.join(pretrained_model_name_or_path, + config.encoder_pth) + checkpoint_file = os.path.join(pretrained_model_name_or_path, + WEIGHTS_NAME) + checkpoint = torch.load(checkpoint_file) if os.path.isfile( + checkpoint_file) else None + return cls(config, checkpoint, **kwargs) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. Args: - input (Dict[str, Tensor]): the preprocessed data + kwargs: Input args. 
+ model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). Returns: - Dict[str, Tensor]: results - Example: - { - 'loss': Tensor([12.34]), # loss for backward - } + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ - return self.model(**input) - def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + model_dir = kwargs.pop('model_dir') + model = cls._from_pretrained( + pretrained_model_name_or_path=model_dir, **kwargs) + model.model_dir = model_dir + return model + + +class AbsSummarizer(PalmPreTrainedModel): # Model + + def __init__(self, config, checkpoint=None, **kwargs): + super().__init__(config, **kwargs) + self.config = config + if config.encoder == 'bert' or config.encoder == 'zh_bert': + self.bert = BertModel( + BertConfig.from_pretrained(config.encoder_pth)) + elif config.encoder == 'roberta': + self.bert = RobertaModel( + RobertaConfig.from_pretrained(config.encoder_pth)) + + if config.max_pos > 512: + my_pos_embeddings = nn.Embedding( + config.max_pos, self.bert.model.config.hidden_size) + my_pos_embeddings.weight.data[: + 512] = self.bert.embeddings.position_embeddings.weight.data + my_pos_embeddings.weight.data[ + 512:] = self.bert.embeddings.position_embeddings.weight.data[ + -1][None, :].repeat(config.max_pos - 512, 1) + self.bert.model.embeddings.position_embeddings = my_pos_embeddings + self.vocab_size = self.bert.config.vocab_size + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=1 if config.encoder == 'roberta' else 0) + + if config.share_emb: + tgt_embeddings.weight = copy.deepcopy( + self.bert.model.embeddings.word_embeddings.weight) + self.decoder = TransformerDecoder( + config.dec_layers, + config.dec_hidden_size, + heads=config.dec_heads, + d_ff=config.dec_ff_size, + dropout=config.dec_dropout, + embeddings=tgt_embeddings) + self.generator = PalmPointerGenerator(config.dec_hidden_size, + self.vocab_size) + self.generator.dense.weight = self.decoder.embeddings.weight + + if checkpoint is not None: + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + for key in list(checkpoint.keys()): + checkpoint[key.replace('model.palm.', '')] = checkpoint[key] + self.load_state_dict(checkpoint, strict=False) + else: + for module in self.decoder.modules(): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + for p in self.generator.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + p.data.zero_() + if config.use_bert_emb: + if config.encoder == 'roberta': + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=1) + else: + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=0) + tgt_embeddings.weight = copy.deepcopy( + self.bert.embeddings.word_embeddings.weight) + self.decoder.embeddings = tgt_embeddings + self.generator.dense.weight = self.decoder.embeddings.weight + + def forward(self, src, tgt, mask_src): + top_vec, _ = self.bert(src, mask_src, return_dict=False) + state = 
TransformerDecoderState(src) + decoder_outputs, attns, _ = self.decoder(state, tgt[:, :-1], top_vec) + return decoder_outputs, attns[-1], top_vec + + +class LabelSmoothingLoss(nn.Module): + """ + With label smoothing, + KL-divergence between q_{smoothed ground truth prob.}(w) + and p_{prob. computed by model}(w) is minimized. + """ + + def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): + assert 0.0 < label_smoothing <= 1.0 + self.padding_idx = ignore_index + super(LabelSmoothingLoss, self).__init__() + + smoothing_value = label_smoothing / (tgt_vocab_size - 2) + one_hot = torch.full((tgt_vocab_size, ), smoothing_value) + one_hot[self.padding_idx] = 0 + self.register_buffer('one_hot', one_hot.unsqueeze(0)) + self.confidence = 1.0 - label_smoothing + + def forward(self, output, target): + """ + output (FloatTensor): batch_size x n_classes + target (LongTensor): batch_size + """ + model_prob = self.one_hot.repeat(target.size(0), 1) + model_prob.scatter_(1, target.unsqueeze(1), self.confidence) + model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0) + + return F.kl_div(output, model_prob, reduction='sum') + + +class NMTLossCompute(nn.Module): + """ + Standard NMT Loss Computation. + """ + + def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0): + super().__init__() + self.generator = generator + self.padding_idx = symbols['PAD'] + if label_smoothing > 0: + self.criterion = LabelSmoothingLoss( + label_smoothing, vocab_size, ignore_index=self.padding_idx) + else: + self.criterion = nn.NLLLoss( + ignore_index=self.padding_idx, reduction='sum') + + def _bottle(self, _v): + return _v.view(-1, _v.size(2)) + + def _unbottle(self, _v, batch_size): + return _v.view(-1, batch_size, _v.size(1)) + + def forward(self, tgt, output): + target = tgt[:, 1:] + normalization = target.ne(self.padding_idx).sum() + bottled_output = self._bottle(output) + scores = self.generator(bottled_output) + gtruth = target.contiguous().view(-1) + loss = self.criterion(scores, gtruth) + loss.div(float(normalization)) + return loss + + +class Translator(object): + """ + Uses a model to translate a batch of sentences. 
+ """ + + @dataclass + class Batch: + batch_size: int + src: torch.Tensor + tgt: torch.Tensor + mask_src: torch.Tensor + query_id: List[None] = None + src_str: List[List[str]] = None + tgt_str: List[str] = None + + def __init__(self, model, dataset: str = 'cnn'): + super().__init__() + self.logger = logging.get_logger(__name__) + self.args = model.config + self.args.dataset = dataset + self.model = model.palm + self.generator = self.model.generator + self.vocab = model.tokenizer + self.symbols = model.symbols + self.start_token = self.symbols['BOS'] + self.end_token = self.symbols['EOS'] + self.alpha = self.args.alpha + self.beam_size = self.args.beam_size + self.min_length = self.args.min_length + self.max_length = self.args.max_length + + def from_batch(self, translation_batch): + batch = translation_batch['batch'] + assert (len(translation_batch['gold_score']) == len( + translation_batch['predictions'])) + batch_size = batch.batch_size + + preds, pred_score, tgt_str, src, src_str = translation_batch[ + 'predictions'], translation_batch[ + 'scores'], batch.tgt_str, batch.src, batch.src_str + query_id = batch.query_id + ''' + try: + query_id = batch.query_id + except: + query_id = None + ''' + translations = [] + for b in range(batch_size): + if self.args.dataset == 'qg_ranking_test': + if self.args.encoder == 'bert' or self.args.encoder == 'zh_bert': + pred_sents = [ + ' '.join( + self.vocab.convert_ids_to_tokens( + [int(n) for n in each])).replace(' ##', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = [ + self.vocab.decode([int(n) for n in each + ]).replace('', + '').replace('', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = self.vocab.decode([int(n) + for n in preds[b][0]]).replace( + '', + '').replace('', '') + elif self.args.encoder == 'bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ' '.join(pred_sents).replace(' ##', '') + elif self.args.encoder == 'zh_bert' and self.args.dataset == 'paraphrase': + pred_sents = [ + self.vocab.convert_ids_to_tokens([int(n) for n in pred]) + for pred in preds[b] + ] + pred_sents = [ + ''.join(pred).replace(' ##', '') for pred in pred_sents + ] + elif self.args.encoder == 'zh_bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ''.join(pred_sents).replace('##', '') + gold_sent = tgt_str[b] + + if self.args.encoder == 'roberta': + raw_src = self.vocab.decode([int(t) for t in src[b]]) + raw_src = ' '.join(src_str[b]) + else: + raw_src = [self.vocab.ids_to_tokens[int(t)] + for t in src[b]][:500] + raw_src = ' '.join(raw_src) + if self.args.dataset == 'faq': + translation = (pred_sents, gold_sent, src_str[b], query_id[b], + pred_score[b]) + else: + translation = (pred_sents, gold_sent, raw_src, query_id[b], + pred_score[b]) + # translation = (pred_sents[0], gold_sent) + translations.append(translation) + + return translations + + def translate(self, data_iter, step): + gold_path = self.args.result_path + '.%d.gold' % step + can_path = self.args.result_path + '.%d.candidate' % step + self.gold_out_file = codecs.open(gold_path, 'w', 'utf-8') + self.can_out_file = codecs.open(can_path, 'w', 'utf-8') + self.pred_json_score_out_file = codecs.open(can_path + '.sample', 'w', + 'utf-8') + if self.args.dataset == 'paraphrase' and self.args.encoder == 'roberta': + out = '\t'.join([ + 'query_id', 'source_query', 'target_query', 'predict_query' + ]) + '\n' + 
self.pred_json_score_out_file.write(out) + + raw_src_path = self.args.result_path + '.%d.raw_src' % step + self.src_out_file = codecs.open(raw_src_path, 'w', 'utf-8') + + pred_results, gold_results = [], [] + cnt = 0 + pred_dict, ref_dict = {}, {} + for i, batch in enumerate(data_iter): + self.logger.info(f'data: {i + 1} / {len(data_iter)}') + batch_data = self.translate_batch(batch) + translations = self.from_batch(batch_data) + + for trans in translations: + pred, gold, src, query_id, pred_score = trans + src = src.replace('', '').replace('##', '').strip() + if self.args.dataset == 'qg_ranking_test': + pred_str = '\t'.join([ + each.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', + '').replace(r' +', ' ').replace( + '', + '').replace('', '').replace( + '', + '').replace('', '').replace( + '', ' ').strip() + for each in pred + ]) + else: + pred_str = pred.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', '').replace('[CLS]', '').replace( + '[SEP]', '').replace('[UNK]', '').strip() + pred_str = pred_str.replace(r' +', ' ').replace( + '', + '').replace('', '').replace('', '').replace( + '', '').replace('', ' ').strip() + gold_str = gold.replace('', '').strip().replace( + '[UNK]', '').replace('[unused1]', '').replace( + '[unused2]', + '').replace('##', '').replace('[CLS]', '').replace( + '[SEP]', '').strip().replace('', '').replace( + '', '').replace('', ' ').strip() + if self.args.recall_eval: + _pred_str = '' + for sent in pred_str.split(''): + can_pred_str = _pred_str + '' + sent.strip() + if len(can_pred_str.split()) >= len( + gold_str.split()) + 10: + pred_str = _pred_str + break + else: + _pred_str = can_pred_str + + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + pred_str = pred_str.replace('', ' ') + if query_id is not None: + pred_json = { + 'query_id': query_id, + 'answers': [pred_str] + } + gold_json = { + 'query_id': query_id, + 'answers': [gold_str] + } + pred_json_score = { + 'query_id': query_id, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + else: + pred_json = {'query_id': cnt, 'answers': [pred_str]} + gold_json = {'query_id': cnt, 'answers': [gold_str]} + pred_json_score = { + 'query_id': cnt, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + json.dump(pred_json, self.can_out_file) + self.can_out_file.write('\n') + json.dump(gold_json, self.gold_out_file) + self.gold_out_file.write('\n') + json.dump(pred_json_score, self.pred_json_score_out_file) + self.pred_json_score_out_file.write('\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'cnn': + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write(gold_str + '\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'dureader': + if query_id is None: + query_id = str(cnt) + pred_results.extend(normalize([pred_str])) + gold_results.extend(normalize([gold_str])) + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write('\t'.join([src[0], gold_str]) + + '\n') + + elif self.args.dataset == 'paraphrase': + if query_id is None: + query_id = str(cnt) + if self.args.encoder == 'roberta': + pred_str = [pred_str] + pred_dict[query_id] = normalize([pred_str[0]]) + ref_dict[query_id] = normalize([gold_str]) + self.pred_json_score_out_file.write( + 
'\t'.join([str(query_id), src, gold_str, pred_str[0]]) + + '\n') + elif self.args.dataset == 'faq': + if pred_score[0].cpu().numpy().tolist() < -3.5: + continue + self.can_out_file.write( + '\t'.join([str(query_id), src, pred_str]) + '\n') + self.gold_out_file.write( + '\t'.join([str(query_id), src, gold_str]) + '\n') + # passage, answer, question, score + self.pred_json_score_out_file.write('\t'.join([ + str(query_id), gold_str, src, pred_str, + str(pred_score[0].cpu().numpy().tolist()) + ]) + '\n') + elif self.args.dataset == 'qg_ranking_test': + self.can_out_file.write( + str(query_id) + '\t' + pred_str + '\n') + + cnt += 1 + self.can_out_file.flush() + self.gold_out_file.flush() + self.src_out_file.flush() + self.logger.info('cnt: %s' % cnt) + self.can_out_file.close() + self.gold_out_file.close() + self.src_out_file.close() + + if step != -1: + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + cnn_results = subprocess.getoutput( + './run.sh %s %s' % (gold_path, can_path)) # run.sh ... + self.logger.info(cnn_results) + elif self.args.dataset == 'cnn': + self.logger.info('Calculating Rouge') + from rouge import Rouge + candidates = [ + line.strip() for line in open(can_path, encoding='utf-8') + ] + references = [ + line.strip() for line in open(gold_path, encoding='utf-8') + ] + rouge_score = Rouge().get_scores( + candidates, references, avg=True) + # self.logger.info('Rouges at step %d \n%s' % (step, rouge_results_to_str(rouges))) + print(rouge_score) + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + + def postprocess_text(preds, labels): + preds = [pred.strip().replace('.', '') for pred in preds] + labels = [label.strip() for label in labels] + while '' in preds: + idx = preds.index('') + preds[idx] = '。' + return preds, labels + + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + pred_dict = {str(i): tmp for i, tmp in enumerate(pred_results)} + gold_dict = {str(i): tmp for i, tmp in enumerate(gold_results)} + bleu_rouge = compute_bleu_rouge(pred_dict, gold_dict) + print(bleu_rouge) + # unreachable + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + bleu_score = cal_bleu(pred_results, gold_results) + from rouge import Rouge + rouge = Rouge() + rouge_score = rouge.get_scores( + pred_results, gold_results, avg=True) + print("'Dev eval result: Bleu-4={}, {}".format( + bleu_score, rouge_score)) + + def translate_batch(self, batch: 'Batch', fast: bool = False): + """ + Translate a batch of sentences. + + Mostly a wrapper around :obj:`Beam`. + + Args: + batch (:obj:`Batch`): a batch from a dataset object + data (:obj:`Dataset`): the dataset object + fast (bool): enables fast beam search (may not support all features) + + Todo: + Shouldn't need the original dataset. 
+ """ + self.model.eval() + with torch.no_grad(): + return self._fast_translate_batch( + batch, self.max_length, min_length=self.min_length) + + def _tile(self, x, count, dim=0): + perm = list(range(len(x.size()))) + if dim != 0: + perm[0], perm[dim] = perm[dim], perm[0] + x = x.permute(perm).contiguous() + out_size = list(x.size()) + out_size[0] *= count + batch = x.size(0) + x = x.view(batch, -1) \ + .transpose(0, 1) \ + .repeat(count, 1) \ + .transpose(0, 1) \ + .contiguous() \ + .view(*out_size) + if dim != 0: + x = x.permute(perm).contiguous() + return x + + def _top_k_top_p_filtering(self, + logits, + top_k=10, + top_p=1.0, + filter_value=-float('Inf'), + min_tokens_to_keep=1): + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), + logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + def _fast_translate_batch(self, + batch: 'Batch', + max_length: int, + min_length: int = 0): + # TODO: faster code path for beam_size == 1. + # TODO: support these blacklisted features. + + beam_size = self.beam_size + batch_size = batch.batch_size + src = batch.src + mask_src = batch.mask_src + + src_features, _ = self.model.bert(src, mask_src, return_dict=False) + state = TransformerDecoderState(src, self.model.decoder.num_layers) + device = src_features.device + + # Tile states and memory beam_size times. + state.map_batch_fn( + lambda state, dim: self._tile(state, beam_size, dim=dim)) + src_features = self._tile(src_features, beam_size, dim=0) + batch_offset = torch.arange( + batch_size, dtype=torch.long, device=device) + beam_offset = torch.arange( + 0, + batch_size * beam_size, + step=beam_size, + dtype=torch.long, + device=device) + alive_seq = torch.full([batch_size * beam_size, 1], + self.start_token, + dtype=torch.long, + device=device) + + # Give full probability to the first beam on the first step. + topk_log_probs = ( + torch.tensor( + [0.0] + [float('-inf')] * (beam_size - 1), + device=device).repeat(batch_size)) + + # Structure that holds finished hypotheses. + hypotheses = [[] for _ in range(batch_size)] # noqa: F812 + + results = {} + results['predictions'] = [[] for _ in range(batch_size)] # noqa: F812 + results['scores'] = [[] for _ in range(batch_size)] # noqa: F812 + results['gold_score'] = [0] * batch_size + results['batch'] = batch + + for step in range(max_length): + decoder_input = alive_seq[:, -1].view(1, -1) + + # Decoder forward. 
+ decoder_input = decoder_input.transpose(0, 1) + dec_out, attns, state = self.model.decoder( + state, decoder_input, src_features, step=step) + + # Generator forward. + log_probs = self.generator.forward( + dec_out.transpose(0, 1).squeeze(0)) + vocab_size = log_probs.size(-1) + + if step < min_length: + log_probs[:, self.end_token] = -1e20 + + # Multiply probs by the beam probability. + + length_penalty = ((5.0 + (step + 1)) / 6.0)**self.alpha + if self.args.sample_topk: + temperature = self.args.temperature + _scores = log_probs / temperature + _scores = self._top_k_top_p_filtering( + _scores, + top_k=self.args.top_k, + top_p=self.args.top_p, + min_tokens_to_keep=1 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next words for each beam (so we have some spare tokens + # and match output of greedy beam search) + topk_ids = torch.multinomial( + F.softmax(_scores, dim=-1), + num_samples=1) # (batch_size * num_beams, 2) + # Compute next scores + _scores = F.log_softmax( + _scores, dim=1) # (batch_size * num_beams, vocab_size) + + _scores += topk_log_probs.view(-1).unsqueeze(1) + _scores = _scores / length_penalty + topk_scores = torch.gather( + _scores, -1, topk_ids) # (batch_size * num_beams, 2) + # Match shape of greedy beam search + topk_ids = topk_ids.view( + -1, beam_size) # (batch_size, 2 * num_beams) + topk_scores = topk_scores.view( + -1, beam_size) # (batch_size, 2 * num_beams) + else: + log_probs += topk_log_probs.view(-1).unsqueeze(1) + curr_scores = log_probs / length_penalty + + curr_scores = curr_scores.reshape(-1, beam_size * vocab_size) + topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1) + if self.args.block_trigram: + cur_len = alive_seq.size(1) + if cur_len > 3: + for i in range(alive_seq.size(0)): + fail = False + words = [int(w) for w in alive_seq[i]] + if self.args.encoder == 'roberta': + words = self.vocab.decode(words).strip().split() + else: + words = [ + self.vocab.ids_to_tokens[w] for w in words + ] + words = ' '.join(words).replace(' ##', '').split() + if len(words) <= 3: + continue + trigrams = [(words[i - 1], words[i], words[i + 1]) + for i in range(1, + len(words) - 1)] + trigram = tuple(trigrams[-1]) + if trigram in trigrams[:-1]: + fail = True + if fail: + curr_scores[i] = -10e20 + # Recover log probs. + topk_log_probs = topk_scores * length_penalty + + # Resolve beam origin and true word ids. + topk_beam_index = topk_ids // vocab_size + topk_ids = topk_ids.fmod(vocab_size) + + # Map beam_index to batch_index in the flat representation. + batch_index = ( + topk_beam_index + + beam_offset[:topk_beam_index.size(0)].unsqueeze(1)) + select_indices = batch_index.view(-1) + + # Append last prediction. + alive_seq = torch.cat([ + alive_seq.index_select(0, select_indices), + topk_ids.view(-1, 1) + ], -1) + + is_finished = topk_ids.eq(self.end_token) + if step + 1 == max_length: + is_finished.fill_(self.end_token) + # End condition is top beam is finished. + end_condition = is_finished[:, 0].eq(1) + # Save finished hypotheses. + if is_finished.any(): + predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1)) + for i in range(is_finished.size(0)): + b = batch_offset[i] + if end_condition[i]: + is_finished[i].fill_(self.end_token) + finished_hyp = is_finished[i].nonzero().view(-1) + # Store finished hypotheses for this batch. + for j in finished_hyp: + hypotheses[b].append( + (topk_scores[i, j], predictions[i, j, 1:])) + # If the batch reached the end, save the n_best hypotheses. 
+ if end_condition[i]: + best_hyp = sorted( + hypotheses[b], key=lambda x: x[0], reverse=True) + if self.args.dataset == 'qg_ranking_test' or ( + self.args.dataset == 'paraphrase' + and not self.args.sample_topk): + for each in best_hyp[:beam_size]: + score, pred = each + results['scores'][b].append(score) + results['predictions'][b].append(pred) + else: + score, pred = best_hyp[0] + results['scores'][b].append(score) + results['predictions'][b].append(pred) + non_finished = end_condition.eq(0).nonzero().view(-1) + # If all sentences are translated, no need to go further. + if len(non_finished) == 0: + break + # Remove finished batches for the next step. + topk_log_probs = topk_log_probs.index_select(0, non_finished) + batch_index = batch_index.index_select(0, non_finished) + batch_offset = batch_offset.index_select(0, non_finished) + alive_seq = predictions.index_select(0, non_finished) \ + .view(-1, alive_seq.size(-1)) + # Reorder states. + select_indices = batch_index.view(-1) + src_features = src_features.index_select(0, select_indices) + state.map_batch_fn( + lambda state, dim: state.index_select(dim, select_indices)) + + return results + + def __call__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, + **kwargs) -> Dict[str, torch.Tensor]: + batch = self.Batch( + batch_size=input_ids.size()[0], + src=input_ids, + tgt=None, + mask_src=attention_mask) + translation_batch = self.translate_batch(batch) + + preds = translation_batch['predictions'] + return {'predictions': preds} + + +@MODELS.register_module(Tasks.text_generation, module_name=Models.palm) +class PalmForTextGeneration(PalmPreTrainedModel): + + def __init__(self, config, checkpoint=None, **kwargs): + super().__init__(config, **kwargs) + self.config = config + if config.encoder == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=False) + symbols = { + 'BOS': tokenizer.cls_token_id, + 'EOS': tokenizer.sep_token_id, + 'PAD': tokenizer.pad_token_id, + 'EOQ': tokenizer.unk_token_id + } + elif config.encoder == 'bert' or config.encoder == 'zh_bert': + tokenizer = BertTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=True) + symbols = { + 'BOS': tokenizer.vocab['[CLS]'], + 'EOS': tokenizer.vocab['[SEP]'], + 'PAD': tokenizer.vocab['[PAD]'], + 'EOQ': tokenizer.vocab['[unused2]'] + } + self.tokenizer = tokenizer + self.symbols = symbols + self.palm = AbsSummarizer(config, checkpoint) + self.loss = NMTLossCompute(self.palm.generator, symbols, + self.palm.vocab_size, + config.label_smoothing) + self.generator = Translator(self) + + def forward(self, input_ids, attention_mask, labels): + output = self.palm(src=input_ids, tgt=labels, mask_src=attention_mask) + loss = self.loss(labels, output[0]) + return TextGenerationModelOutput( + loss=loss, + logits=output[0], + ) + + def generate(self, input: Dict[str, Tensor]) -> TokenGeneratorOutput: outputs = self.generator(**input) preds = outputs['predictions'] - return {'sequences': [pred[0] for pred in preds]} + return TokenGeneratorOutput(sequences=[pred[0] for pred in preds]) diff --git a/modelscope/models/nlp/ponet/backbone.py b/modelscope/models/nlp/ponet/backbone.py index f13b362b..22114f28 100644 --- a/modelscope/models/nlp/ponet/backbone.py +++ b/modelscope/models/nlp/ponet/backbone.py @@ -23,8 +23,6 @@ import torch.utils.checkpoint from packaging import version from torch import nn from transformers.activations import ACT2FN -from transformers.modeling_outputs import \ - BaseModelOutputWithPastAndCrossAttentions from 
transformers.modeling_utils import (PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, @@ -573,7 +571,7 @@ class PoNetEncoder(nn.Module): all_self_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, @@ -642,34 +640,6 @@ class PoNetPreTrainedModel(TorchModel, PreTrainedModel): return model -class PoNetPreTrainedModelV2(PreTrainedModel): - """ - A base class to handle weights initialization and a simple interface for loading pretrained models. - """ - - config_class = PoNetConfig - base_model_prefix = 'ponet' - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - @MODELS.register_module(Tasks.backbone, module_name=Models.ponet) class PoNetModel(PoNetPreTrainedModel): """The bare PoNet Model transformer outputting raw hidden-states without any specific head on top. diff --git a/modelscope/models/nlp/ponet/document_segmentation.py b/modelscope/models/nlp/ponet/document_segmentation.py index 2ef8c8b8..5e933491 100644 --- a/modelscope/models/nlp/ponet/document_segmentation.py +++ b/modelscope/models/nlp/ponet/document_segmentation.py @@ -5,13 +5,15 @@ from typing import Any, Dict import torch from torch import nn from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import TokenClassifierOutput from modelscope.metainfo import Models from modelscope.models.base import Model from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertConfig +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import Tasks -from .backbone import PoNetModel, PoNetPreTrainedModelV2 +from .backbone import PoNetModel, PoNetPreTrainedModel +from .configuration import PoNetConfig __all__ = ['PoNetForDocumentSegmentation'] @@ -20,23 +22,7 @@ __all__ = ['PoNetForDocumentSegmentation'] Tasks.document_segmentation, module_name=Models.ponet_for_ds) @MODELS.register_module( Tasks.extractive_summarization, module_name=Models.ponet_for_ds) -class PoNetForDocumentSegmentation(Model): - - def __init__(self, model_dir: str, model_config: Dict[str, Any], *args, - **kwargs): - super().__init__(model_dir, model_config, *args, **kwargs) - self.model_cfg = model_config - - def build_with_config(self, config): - self.ponet_model = PoNetForDocumentSegmentationBase.from_pretrained( - self.model_dir, config=config) - return self.ponet_model - - def forward(self) -> Dict[str, Any]: - return self.model_cfg - - -class PoNetForDocumentSegmentationBase(PoNetPreTrainedModelV2): +class PoNetForDocumentSegmentation(PoNetPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r'pooler'] def __init__(self, config): @@ -107,9 +93,24 @@ class 
PoNetForDocumentSegmentationBase(PoNetPreTrainedModelV2): output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + @classmethod + def _instantiate(cls, model_dir, model_config: Dict[str, Any], **kwargs): + if model_config['type'] == 'bert': + config = BertConfig.from_pretrained(model_dir, num_labels=2) + elif model_config['type'] == 'ponet': + config = PoNetConfig.from_pretrained(model_dir, num_labels=2) + else: + raise ValueError( + f'Expected config type bert and ponet, which is : {model_config["type"]}' + ) + model = super(Model, cls).from_pretrained(model_dir, config=config) + model.model_dir = model_dir + model.model_cfg = model_config + return model diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py index e3b358d4..e90c2b5a 100644 --- a/modelscope/models/nlp/space/model/tokenization_space.py +++ b/modelscope/models/nlp/space/model/tokenization_space.py @@ -15,14 +15,14 @@ # limitations under the License """Tokenization classes for Space. mainly copied from :module:`~transformers.tokenization_xlm_roberta`""" -from modelscope.models.nlp.structbert import (BasicTokenizer, SbertTokenizer, - WordpieceTokenizer) +from transformers import BasicTokenizer, BertTokenizer, WordpieceTokenizer + from modelscope.utils import logger as logging logger = logging.get_logger(__name__) -class SpaceTokenizer(SbertTokenizer): +class SpaceTokenizer(BertTokenizer): """ This class overrides [`SpaceTokenizer`]. Please check the superclass for the appropriate documentation alongside usage examples. 
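The rewritten `PoNetForDocumentSegmentation` above drops the old wrapper/`...Base` split and instead gains an `_instantiate` hook that picks the backbone config from the `type` field of `model_config`. A minimal sketch of just that dispatch, assuming the config classes behave like ordinary `from_pretrained` configs (the import paths are the ones the diff itself uses; the model directory in the usage comment is hypothetical):

```python
from modelscope.models.nlp.bert import BertConfig
from modelscope.models.nlp.ponet.configuration import PoNetConfig


def select_segmentation_config(model_dir: str, model_type: str, num_labels: int = 2):
    """Mirror of the config dispatch added in PoNetForDocumentSegmentation._instantiate."""
    if model_type == 'bert':
        return BertConfig.from_pretrained(model_dir, num_labels=num_labels)
    if model_type == 'ponet':
        return PoNetConfig.from_pretrained(model_dir, num_labels=num_labels)
    raise ValueError(f'Expected config type bert or ponet, got: {model_type}')


# Hypothetical usage, assuming a downloaded checkpoint in /path/to/model_dir:
# config = select_segmentation_config('/path/to/model_dir', model_type='ponet')
# model = PoNetForDocumentSegmentation.from_pretrained('/path/to/model_dir', config=config)
```

Keeping the dispatch inside `_instantiate` rather than in a separate wrapper class gives the pipeline a single loading entry point while still supporting both BERT- and PoNet-backed segmentation checkpoints.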
diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py index 60d369e0..1d81116e 100644 --- a/modelscope/models/nlp/structbert/__init__.py +++ b/modelscope/models/nlp/structbert/__init__.py @@ -24,9 +24,6 @@ if TYPE_CHECKING: from .fill_mask import SbertForMaskedLM from .text_classification import SbertForSequenceClassification from .token_classification import SbertForTokenClassification - from .tokenization import (BasicTokenizer, SbertTokenizer, - WordpieceTokenizer) - from .tokenization_fast import SbertTokenizerFast else: _import_structure = { 'backbone': ['SbertModel', 'SbertPreTrainedModel'], @@ -35,9 +32,6 @@ else: 'faq_question_answering': ['SbertForFaqQuestionAnswering'], 'text_classification': ['SbertForSequenceClassification'], 'token_classification': ['SbertForTokenClassification'], - 'tokenization': - ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], - 'tokenization_fast': ['SbertTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py index 039db3ce..9d50dc1f 100755 --- a/modelscope/models/nlp/structbert/backbone.py +++ b/modelscope/models/nlp/structbert/backbone.py @@ -18,15 +18,13 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn import torch.utils.checkpoint from packaging import version from transformers.activations import ACT2FN -from transformers.modeling_outputs import \ - BaseModelOutputWithPastAndCrossAttentions from transformers.modeling_utils import (PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, @@ -37,8 +35,8 @@ from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import AttentionBackboneModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import SbertConfig logger = get_logger(__name__) @@ -563,7 +561,7 @@ class SbertEncoder(nn.Module): all_self_attentions, all_cross_attentions, ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, @@ -641,29 +639,15 @@ class SbertPreTrainedModel(TorchModel, PreTrainedModel): """ model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: - config = SbertConfig(**kwargs) + config = SbertConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + 
pretrained_model_name_or_path=model_dir, **model_args) return model diff --git a/modelscope/models/nlp/structbert/faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py index c8dbf302..a37b8b2d 100644 --- a/modelscope/models/nlp/structbert/faq_question_answering.py +++ b/modelscope/models/nlp/structbert/faq_question_answering.py @@ -14,6 +14,7 @@ from modelscope.metainfo import Models from modelscope.models.builder import MODELS from modelscope.models.nlp.structbert import SbertConfig, SbertModel from modelscope.models.nlp.task_models.task_model import BaseTaskModel +from modelscope.outputs import FaqQuestionAnsweringOutput from modelscope.utils.config import Config, ConfigFields from modelscope.utils.constant import ModelFile, Tasks @@ -208,10 +209,10 @@ class SbertForFaqQuestionAnswering(BaseTaskModel): Predicted scores of all classes for each query. Examples: >>> from modelscope.hub.snapshot_download import snapshot_download - >>> from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + >>> from modelscope.preprocessors import FaqQuestionAnsweringTransformersPreprocessor >>> from modelscope.models.nlp import SbertForFaqQuestionAnswering >>> cache_path = snapshot_download('damo/nlp_structbert_faq-question-answering_chinese-base') - >>> preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained(cache_path) + >>> preprocessor = FaqQuestionAnsweringTransformersPreprocessor.from_pretrained(cache_path) >>> model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) >>> param = { >>> 'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'], @@ -270,7 +271,7 @@ class SbertForFaqQuestionAnswering(BaseTaskModel): scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) if self.metrics_layer.name == 'relation': scores = torch.sigmoid(scores) - return {'scores': scores} + return FaqQuestionAnsweringOutput(scores=scores) def _get_onehot_labels(self, labels, support_size, num_cls): labels_ = labels.view(support_size, 1) diff --git a/modelscope/models/nlp/structbert/fill_mask.py b/modelscope/models/nlp/structbert/fill_mask.py index e611aa88..ded32020 100644 --- a/modelscope/models/nlp/structbert/fill_mask.py +++ b/modelscope/models/nlp/structbert/fill_mask.py @@ -105,7 +105,7 @@ class SbertForMaskedLM(SbertPreTrainedModel): Preprocessor: This is the fill_mask model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. 
Parameters: config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with @@ -213,9 +213,9 @@ class SbertForMaskedLM(SbertPreTrainedModel): Examples: >>> from modelscope.models import Model - >>> from modelscope.preprocessors import Preprocessor, NLPPreprocessor + >>> from modelscope.preprocessors import Preprocessor, FillMaskTransformersPreprocessor >>> model = Model.from_pretrained('damo/nlp_structbert_fill-mask_chinese-large') - >>> preprocessor = NLPPreprocessor('damo/nlp_structbert_fill-mask_chinese-large') + >>> preprocessor = FillMaskTransformersPreprocessor('damo/nlp_structbert_fill-mask_chinese-large') >>> # Call the model, return some tensors >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) >>> # Call the pipeline diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py index 8797beb3..ab5b127e 100644 --- a/modelscope/models/nlp/structbert/text_classification.py +++ b/modelscope/models/nlp/structbert/text_classification.py @@ -55,7 +55,7 @@ class SbertForSequenceClassification(SbertPreTrainedModel): Preprocessor: This is the text classification model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`. Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, diff --git a/modelscope/models/nlp/structbert/token_classification.py b/modelscope/models/nlp/structbert/token_classification.py index a040ff3e..677dcf31 100644 --- a/modelscope/models/nlp/structbert/token_classification.py +++ b/modelscope/models/nlp/structbert/token_classification.py @@ -22,7 +22,7 @@ from torch.nn import CrossEntropyLoss from modelscope.metainfo import Models from modelscope.models.builder import MODELS -from modelscope.outputs import TokenClassifierOutput +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils import logger as logging from modelscope.utils.constant import Tasks from .adv_utils import compute_adv_loss @@ -50,7 +50,7 @@ class SbertForTokenClassification(SbertPreTrainedModel): Preprocessor: This is the token-classification model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.TokenClassificationPreprocessor`. + is `modelscope.preprocessors.TokenClassificationTransformersPreprocessor`. Trainer: This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, @@ -168,7 +168,7 @@ class SbertForTokenClassification(SbertPreTrainedModel): - 0 for tokens that are **masked**. 
Returns: - Returns `modelscope.outputs.TokenClassifierOutput` + Returns `modelscope.outputs.AttentionTokenClassificationModelOutput` Examples: >>> from modelscope.models import Model @@ -220,10 +220,21 @@ class SbertForTokenClassification(SbertPreTrainedModel): with_attention_mask=attention_mask is not None, **outputs.kwargs) - return TokenClassifierOutput( + if label_mask is not None: + mask = label_mask + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits + + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, offset_mapping=offset_mapping, + label_mask=label_mask, ) diff --git a/modelscope/models/nlp/structbert/tokenization.py b/modelscope/models/nlp/structbert/tokenization.py deleted file mode 100644 index 3171e31d..00000000 --- a/modelscope/models/nlp/structbert/tokenization.py +++ /dev/null @@ -1,519 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert`""" - -import collections -import os -import unicodedata -from typing import List, Optional, Tuple - -from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, - _is_punctuation, _is_whitespace) - -from modelscope.utils.constant import ModelFile -from modelscope.utils.logger import get_logger - -logger = get_logger(__name__) - -VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE} - -PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'nlp_structbert_backbone_large_std': 512, - 'nlp_structbert_backbone_base_std': 512, - 'nlp_structbert_backbone_lite_std': 512, - 'nlp_structbert_backbone_tiny_std': 512, -} - -PRETRAINED_INIT_CONFIGURATION = { - 'english_sbert-large-std-512': { - 'do_lower_case': True - }, -} - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - with open(vocab_file, 'r', encoding='utf-8') as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip('\n') - vocab[token] = index - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class SbertTokenizer(PreTrainedTokenizer): - r""" - Construct a SBERT tokenizer. Based on WordPiece. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. 
- - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to do basic tokenization before WordPiece. - never_split (:obj:`Iterable`, `optional`): - Collection of tokens which will never be split during tokenization. Only has an effect when - :obj:`do_basic_tokenize=True` - unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this `issue - `__). - strip_accents: (:obj:`bool`, `optional`): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for :obj:`lowercase` (as in the original BERT). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - - def __init__(self, - vocab_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token='[UNK]', - sep_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - mask_token='[MASK]', - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs): - super().__init__( - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - - if not os.path.isfile(vocab_file): - raise ValueError( - f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained " - 'model use `tokenizer = SbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' - ) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict([ - (ids, tok) for tok, ids in self.vocab.items() - ]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - ) - self.wordpiece_tokenizer = WordpieceTokenizer( - vocab=self.vocab, unk_token=self.unk_token) - - @property - def do_lower_case(self): - return self.basic_tokenizer.do_lower_case - - @property - def vocab_size(self): - return len(self.vocab) - - def get_vocab(self): - return dict(self.vocab, **self.added_tokens_encoder) - - def _tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize( - text, never_split=self.all_special_tokens): - - # If the token is part of the never_split set - if token in self.basic_tokenizer.never_split: - split_tokens.append(token) - else: - split_tokens += self.wordpiece_tokenizer.tokenize(token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - out_string = ' '.join(tokens).replace(' ##', '').strip() - return out_string - - def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A SBERT sequence has the following format: - - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. 
- - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=True) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ( - [0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence - pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 - + sep) * [1] - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - index = 0 - if os.path.isdir(save_directory): - vocab_file = os.path.join( - save_directory, - (filename_prefix + '-' if filename_prefix else '') - + VOCAB_FILES_NAMES['vocab_file']) - else: - vocab_file = (filename_prefix - + '-' if filename_prefix else '') + save_directory - with open(vocab_file, 'w', encoding='utf-8') as writer: - for token, token_index in sorted( - self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - f'Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive.' - ' Please check that the vocabulary is not corrupted!') - index = token_index - writer.write(token + '\n') - index += 1 - return (vocab_file, ) - - -class BasicTokenizer(object): - """ - Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). - - Args: - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - never_split (:obj:`Iterable`, `optional`): - Collection of tokens which will never be split during tokenization. Only has an effect when - :obj:`do_basic_tokenize=True` - tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this `issue - `__). - strip_accents: (:obj:`bool`, `optional`): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for :obj:`lowercase` (as in the original BERT). - """ - - def __init__(self, - do_lower_case=True, - never_split=None, - tokenize_chinese_chars=True, - strip_accents=None): - if never_split is None: - never_split = [] - self.do_lower_case = do_lower_case - self.never_split = set(never_split) - self.tokenize_chinese_chars = tokenize_chinese_chars - self.strip_accents = strip_accents - - def tokenize(self, text, never_split=None): - """ - Basic Tokenization of a piece of text. 
Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. - - Args: - **never_split**: (`optional`) list of str - Kept for backward compatibility purposes. Now implemented directly at the base class level (see - :func:`PreTrainedTokenizer.tokenize`) List of token not to split. - """ - # union() returns a new set by concatenating the two sets. - never_split = self.never_split.union( - set(never_split)) if never_split else self.never_split - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - if self.tokenize_chinese_chars: - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if token not in never_split: - if self.do_lower_case: - token = token.lower() - if self.strip_accents is not False: - token = self._run_strip_accents(token) - elif self.strip_accents: - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token, never_split)) - - output_tokens = whitespace_tokenize(' '.join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize('NFD', text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == 'Mn': - continue - output.append(char) - return ''.join(output) - - def _run_split_on_punc(self, text, never_split=None): - """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return [''.join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(' ') - output.append(char) - output.append(' ') - else: - output.append(char) - return ''.join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. 
- if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) - or (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F) - or (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF) - or (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F)): - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xFFFD or _is_control(char): - continue - if _is_whitespace(char): - output.append(' ') - else: - output.append(char) - return ''.join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token, max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """ - Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform - tokenization using the given vocabulary. - - For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = ''.join(chars[start:end]) - if start > 0: - substr = '##' + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens diff --git a/modelscope/models/nlp/structbert/tokenization_fast.py b/modelscope/models/nlp/structbert/tokenization_fast.py deleted file mode 100644 index 6f7b7ba7..00000000 --- a/modelscope/models/nlp/structbert/tokenization_fast.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Fast Tokenization classes for Sbert. 
mainly copied from :module:`~transformers.tokenization_bert_fast`""" - -from typing import List, Optional, Tuple - -import json -import transformers -from tokenizers import normalizers -from transformers.tokenization_utils_fast import PreTrainedTokenizerFast - -from modelscope.utils.constant import ModelFile -from modelscope.utils.logger import get_logger -from .tokenization import SbertTokenizer - -logger = get_logger(__name__) - -VOCAB_FILES_NAMES = { - 'vocab_file': ModelFile.VOCAB_FILE, - 'tokenizer_file': 'tokenizer.json' -} - -PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': {}, - 'tokenizer_file': {}, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'nlp_structbert_backbone_large_std': 512, - 'nlp_structbert_backbone_base_std': 512, - 'nlp_structbert_backbone_lite_std': 512, - 'nlp_structbert_backbone_tiny_std': 512, -} - -PRETRAINED_INIT_CONFIGURATION = { - 'english_sbert-large-std-512': { - 'do_lower_case': True - }, -} - -transformers.SLOW_TO_FAST_CONVERTERS[ - 'SbertTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS['BertTokenizer'] - - -class SbertTokenizerFast(PreTrainedTokenizerFast): - r""" - Construct a "fast" SBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to clean the text before tokenization by removing any control characters and replacing all - whitespaces by the classic one. - tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this - issue `__). - strip_accents: (:obj:`bool`, `optional`): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for :obj:`lowercase` (as in the original BERT). - wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`): - The prefix for subwords. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = SbertTokenizer - - def __init__(self, - vocab_file=None, - tokenizer_file=None, - do_lower_case=True, - unk_token='[UNK]', - sep_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - mask_token='[MASK]', - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs): - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - do_lower_case=do_lower_case, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - - pre_tok_state = json.loads( - self.backend_tokenizer.normalizer.__getstate__()) - if (pre_tok_state.get('lowercase', do_lower_case) != do_lower_case - or pre_tok_state.get('strip_accents', - strip_accents) != strip_accents): - pre_tok_class = getattr(normalizers, pre_tok_state.pop('type')) - pre_tok_state['lowercase'] = do_lower_case - pre_tok_state['strip_accents'] = strip_accents - self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state) - - self.do_lower_case = do_lower_case - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A SBERT sequence has the following format: - - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - - if token_ids_1: - output += token_ids_1 + [self.sep_token_id] - - return output - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence - pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). 
- """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 - + sep) * [1] - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - files = self._tokenizer.model.save( - save_directory, name=filename_prefix) - return tuple(files) diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py index 9360ec08..f6214e9c 100644 --- a/modelscope/models/nlp/task_models/feature_extraction.py +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -5,12 +5,10 @@ import numpy as np from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertConfig from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys +from modelscope.outputs import FeatureExtractionOutput, OutputKeys from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping __all__ = ['FeatureExtractionModel'] @@ -31,9 +29,9 @@ class FeatureExtractionModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) - def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> FeatureExtractionOutput: # backbone do not need labels, only head need for loss compute input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) sequence_output = outputs.last_hidden_state - return {OutputKeys.TEXT_EMBEDDING: sequence_output} + return FeatureExtractionOutput(text_embedding=sequence_output) diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index ce0e21a3..3a8380a6 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -7,7 +7,7 @@ from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys +from modelscope.outputs import InformationExtractionOutput, OutputKeys from modelscope.utils.constant import Tasks __all__ = ['InformationExtractionModel'] @@ -31,9 +31,9 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) self.build_head(self.head_cfg) - def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> InformationExtractionOutput: outputs = super().forward(input) sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output, input['text'], input['offsets']) - return {OutputKeys.SPO_LIST: outputs} + return InformationExtractionOutput(spo_list=outputs) diff --git a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py index 79ce365d..864a04d3 100644 --- a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py @@ -12,7 +12,7 @@ from transformers import AutoConfig, AutoModel from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.builder import MODELS -from 
modelscope.outputs import TokenClassifierWithPredictionsOutput +from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import ModelFile, Tasks __all__ = [ @@ -115,7 +115,7 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): - 0 for tokens that are **masked**. Returns: - Returns `modelscope.outputs.TokenClassifierOutput` + Returns `modelscope.outputs.AttentionTokenClassificationModelOutput` Examples: >>> from modelscope.models import Model @@ -138,17 +138,16 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def postprocess(self, input: Dict[str, Any], **kwargs): predicts = self.model.decode(input) - offset_len = len(input['offset_mapping']) - predictions = torch.narrow( - predicts, 1, 0, - offset_len) # index_select only move loc, not resize - return TokenClassifierWithPredictionsOutput( + offset_mapping = input.get('offset_mapping') + mask = input.get('label_mask') + return AttentionTokenClassificationModelOutput( loss=None, logits=None, hidden_states=None, attentions=None, - offset_mapping=input['offset_mapping'], - predictions=predictions, + label_mask=mask, + offset_mapping=offset_mapping, + predictions=predicts, ) diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 982bce32..0e216496 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -1,18 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict -import numpy as np import torch from modelscope.metainfo import Models, TaskModels from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys, TokenClassifierOutput +from modelscope.outputs import (AttentionTokenClassificationModelOutput, + OutputKeys) from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) __all__ = ['TokenClassificationModel'] @@ -48,7 +46,10 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) self.build_head(self.head_cfg) - def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward( + self, + **input: Dict[str, + Any]) -> AttentionTokenClassificationModelOutput: labels = None if OutputKeys.LABEL in input: labels = input.pop(OutputKeys.LABEL) @@ -62,16 +63,23 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if labels in input: loss = self.compute_loss(outputs, labels) - # apply label mask to logits - logits = logits[input['label_mask']].unsqueeze(0) + if 'label_mask' in input: + mask = input['label_mask'] + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits - return TokenClassifierOutput( + return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - offset_mapping=input['offset_mapping'], - ) + offset_mapping=input.get('offset_mapping'), + label_mask=input.get('label_mask')) def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() diff --git 
a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py index 0774e9b4..5f70f3f6 100644 --- a/modelscope/models/nlp/veco/__init__.py +++ b/modelscope/models/nlp/veco/__init__.py @@ -23,8 +23,6 @@ if TYPE_CHECKING: from .text_classification import VecoForSequenceClassification from .token_classification import VecoForTokenClassification from .fill_mask import VecoForMaskedLM - from .tokenization import VecoTokenizer - from .tokenization_fast import VecoTokenizerFast else: _import_structure = { 'configuration': ['VecoConfig'], @@ -32,8 +30,6 @@ else: 'text_classification': ['VecoForSequenceClassification'], 'fill_mask': ['VecoForMaskedLM'], 'token_classification': ['VecoForTokenClassification'], - 'tokenization': ['VecoTokenizer'], - 'tokenization_fast': ['VecoTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/veco/fill_mask.py b/modelscope/models/nlp/veco/fill_mask.py index de2cdb4a..fc37f920 100644 --- a/modelscope/models/nlp/veco/fill_mask.py +++ b/modelscope/models/nlp/veco/fill_mask.py @@ -40,7 +40,7 @@ class VecoForMaskedLM(TorchModel, RobertaForMaskedLM): Preprocessor: This is the fill_mask model of StructBERT, the preprocessor of this model - is `modelscope.preprocessors.NLPPreprocessor`. + is `modelscope.preprocessors.FillMaskTransformersPreprocessor`. Parameters: config ([`VecoConfig`]): Model configuration class with all the parameters of the diff --git a/modelscope/models/nlp/veco/text_classification.py b/modelscope/models/nlp/veco/text_classification.py index e4e74d8f..64f3aadd 100644 --- a/modelscope/models/nlp/veco/text_classification.py +++ b/modelscope/models/nlp/veco/text_classification.py @@ -22,7 +22,7 @@ from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import AttentionTextClassificationModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import VecoConfig @@ -46,7 +46,7 @@ class VecoForSequenceClassification(TorchModel, Preprocessor: This is the text classification model of Veco, the preprocessor of this model - is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`. 
Trainer: This model should be trained by dataset which has mixed languages, @@ -124,27 +124,13 @@ class VecoForSequenceClassification(TorchModel, """ model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: - config = VecoConfig(**kwargs) + config = VecoConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + pretrained_model_name_or_path=model_dir, **model_args) return model diff --git a/modelscope/models/nlp/veco/token_classification.py b/modelscope/models/nlp/veco/token_classification.py index f6252209..4fc96c71 100644 --- a/modelscope/models/nlp/veco/token_classification.py +++ b/modelscope/models/nlp/veco/token_classification.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch from transformers import RobertaForTokenClassification from modelscope.metainfo import Models @@ -22,7 +23,7 @@ from modelscope.models import Model, TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import AttentionTokenClassificationModelOutput from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.nlp.utils import parse_labels_in_order from .configuration import VecoConfig @@ -58,6 +59,7 @@ class VecoForTokenClassification(TorchModel, RobertaForTokenClassification): def forward(self, *args, **kwargs): kwargs['return_dict'] = True outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTokenClassificationModelOutput( loss=outputs.loss, logits=outputs.logits, @@ -81,27 +83,13 @@ class VecoForTokenClassification(TorchModel, RobertaForTokenClassification): """ model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: - config = VecoConfig(**kwargs) + config = VecoConfig(**model_args) model = cls(config) else: - model_kwargs = {} - label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) - id2label = kwargs.get( - 'id2label', None if label2id is None else - {id: label - for label, id in label2id.items()}) - if id2label is not None and label2id is None: - label2id = {label: id for id, label in id2label.items()} - - num_labels = kwargs.get( - 'num_labels', None if label2id is None else len(label2id)) - if num_labels is not None: - model_kwargs['num_labels'] = num_labels - if label2id is not None: - model_kwargs['label2id'] = label2id - if id2label is not None: - model_kwargs['id2label'] = id2label model = super(Model, cls).from_pretrained( - pretrained_model_name_or_path=model_dir, **model_kwargs) + pretrained_model_name_or_path=model_dir, **model_args) return model diff --git a/modelscope/models/nlp/veco/tokenization.py 
b/modelscope/models/nlp/veco/tokenization.py deleted file mode 100644 index 21711456..00000000 --- a/modelscope/models/nlp/veco/tokenization.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -"""Tokenization classes for Veco. mainly copied from :module:`~transformers.tokenization_xlm_roberta`""" - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer - -from modelscope.utils import logger as logging - -logger = logging.get_logger(__name__) - -SPIECE_UNDERLINE = '▁' - -VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} - -PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - - -class VecoTokenizer(PreTrainedTokenizer): - """ - Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on - [SentencePiece](https://github.com/google/sentencepiece). - - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of - sequence. The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. 
This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. - sp_model_kwargs (`dict`, *optional*): - Will be passed to the `SentencePieceProcessor.__init__()` method. - The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) - can be used, among other things, to set: - - - `enable_sampling`: Enable subword regularization. - - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - - - `nbest_size = {0,1}`: No sampling is performed. - - `nbest_size > 1`: samples from the nbest_size results. - - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) - using forward-filtering-and-backward-sampling algorithm. - - - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for - BPE-dropout. - - Attributes: - sp_model (`SentencePieceProcessor`): - The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ['input_ids', 'attention_mask'] - - def __init__(self, - vocab_file, - bos_token='', - eos_token='', - sep_token='', - cls_token='', - unk_token='', - pad_token='', - mask_token='', - sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs) -> None: - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken( - mask_token, lstrip=True, rstrip=False) if isinstance( - mask_token, str) else mask_token - - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) - self.vocab_file = vocab_file - - # Original fairseq vocab and spm vocab must be "aligned": - # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 - # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- - # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' - # spm | '' | '' | '' | ',' | '.' 
| '▁' | 's' | '▁de' | '-' | '▁a' - - # Mimic fairseq token-to-id alignment for the first 4 token - self.fairseq_tokens_to_ids = { - '': 0, - '': 1, - '': 2, - '': 3 - } - - # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab - self.fairseq_offset = 1 - - self.fairseq_tokens_to_ids[''] = len( - self.sp_model) + self.fairseq_offset - self.fairseq_ids_to_tokens = { - v: k - for k, v in self.fairseq_tokens_to_ids.items() - } - - def __getstate__(self): - state = self.__dict__.copy() - state['sp_model'] = None - state['sp_model_proto'] = self.sp_model.serialized_model_proto() - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, 'sp_model_kwargs'): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.LoadFromSerializedProto(self.sp_model_proto) - - def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An Veco sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=True) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ( - [0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does - not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of zeros. 
- - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - def vocab_size(self): - return len( - self.sp_model) + self.fairseq_offset + 1 # Add the token - - def get_vocab(self): - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text: str) -> List[str]: - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - spm_id = self.sp_model.PieceToId(token) - - # Need to return unknown token if the SP model returned 0 - return spm_id + self.fairseq_offset if spm_id else self.unk_token_id - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() - return out_string - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error( - f'Vocabulary path ({save_directory}) should be a directory') - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + '-' if filename_prefix else '') - + VOCAB_FILES_NAMES['vocab_file']) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file, ) diff --git a/modelscope/models/nlp/veco/tokenization_fast.py b/modelscope/models/nlp/veco/tokenization_fast.py deleted file mode 100644 index b41a5c3b..00000000 --- a/modelscope/models/nlp/veco/tokenization_fast.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -"""Fast Tokenization classes for Veco. 
mainly copied from :module:`~transformers.tokenization_xlm_roberta_fast`""" - -import os -from shutil import copyfile -from typing import List, Optional, Tuple - -import transformers -from transformers.file_utils import is_sentencepiece_available -from transformers.tokenization_utils import AddedToken -from transformers.tokenization_utils_fast import PreTrainedTokenizerFast - -from modelscope.utils import logger as logging - -if is_sentencepiece_available(): - from .tokenization import VecoTokenizer -else: - VecoTokenizer = None - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = { - 'vocab_file': 'sentencepiece.bpe.model', - 'tokenizer_file': 'tokenizer.json' -} - -PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': {}, - 'tokenizer_file': {}, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - -transformers.SLOW_TO_FAST_CONVERTERS[ - 'VecoTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS[ - 'XLMRobertaTokenizer'] - - -class VecoTokenizerFast(PreTrainedTokenizerFast): - """ - Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. - Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of - sequence. The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ['input_ids', 'attention_mask'] - slow_tokenizer_class = VecoTokenizer - - def __init__(self, - vocab_file=None, - tokenizer_file=None, - bos_token='', - eos_token='', - sep_token='', - cls_token='', - unk_token='', - pad_token='', - mask_token='', - **kwargs): - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken( - mask_token, lstrip=True, rstrip=False) if isinstance( - mask_token, str) else mask_token - - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - **kwargs, - ) - - self.vocab_file = vocab_file - self.can_save_slow_tokenizer = False if not self.vocab_file else True - - def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An Veco sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does - not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of zeros. 
- - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - def save_vocabulary(self, - save_directory: str, - filename_prefix: Optional[str] = None) -> Tuple[str]: - if not self.can_save_slow_tokenizer: - raise ValueError( - 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' - 'tokenizer.') - - if not os.path.isdir(save_directory): - logger.error( - f'Vocabulary path ({save_directory}) should be a directory.') - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + '-' if filename_prefix else '') - + VOCAB_FILES_NAMES['vocab_file']) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file, ) diff --git a/modelscope/outputs/nlp/model_outputs.py b/modelscope/outputs/nlp/model_outputs.py index 46267007..464ba7ef 100644 --- a/modelscope/outputs/nlp/model_outputs.py +++ b/modelscope/outputs/nlp/model_outputs.py @@ -1,179 +1,13 @@ from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Optional, Tuple, Union + +import numpy as np from modelscope.outputs.outputs import ModelOutputBase Tensor = Union['torch.Tensor', 'tf.Tensor'] -@dataclass -class TextClassificationModelOutput(ModelOutputBase): - """The output class for text classification models. - - Args: - logits (`Tensor`): The logits output of the model. loss (`Tensor`, - *optional*) The loss of the model, available when training. - hidden_states (`Tensor`, *optional*) Hidden-states of the model at the - output of each layer plus the optional initial embedding outputs. - """ - - logits: Tensor = None - loss: Tensor = None - - -@dataclass -class TokenClassificationModelOutput(ModelOutputBase): - """The output class for token classification models. - logits (`Tensor`): The logits output of the model. - loss (`Tensor`, *optional*) The loss of the model, available when training. - """ - - logits: Tensor = None - loss: Tensor = None - offset_mapping: Tensor = None - - -@dataclass -class FillMaskModelOutput(ModelOutputBase): - """The output class for text classification models. - - Args: - logits (`Tensor`): The logits output of the model. - loss (`Tensor`, *optional*) The loss of the model, available when training. - input_ids (`Tensor`, *optional*) The input id tensor fed into the model. - hidden_states (`Tensor`, *optional*) Hidden-states of the model at the - output of each layer plus the optional initial embedding outputs. - """ - - logits: Tensor = None - loss: Tensor = None - input_ids: Tensor = None - hidden_states: Tensor = None - - -@dataclass -class TokenClassifierOutput(ModelOutputBase): - """ - Base class for outputs of token classification models. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when - `labels` is provided) : - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, - config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, - sequence_length)`, `optional`): - Indices of positions of each input sequence tokens in the sentence. - Selected in the range ``[0, sequence_length - 1]``. - - """ - - loss: Tensor = None - logits: Tensor = None - hidden_states: Tensor = None - attentions: Tensor = None - offset_mapping: Tensor = None - - -@dataclass -class TokenClassifierWithPredictionsOutput(ModelOutputBase): - """ - Base class for outputs of token classification models. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when - `labels` is provided) : - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, - config.num_labels)`): - Classification scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, - sequence_length)`, `optional`): - Indices of positions of each input sequence tokens in the sentence. - Selected in the range ``[0, sequence_length - 1]``. - predictions: A PyTorch tensor of the best tag sequence for each batch of shape - (nbest, batch_size, seq_length) - - """ - - loss: Tensor = None - logits: Tensor = None - hidden_states: Tensor = None - attentions: Tensor = None - offset_mapping: Tensor = None - predictions: Tensor = None - - -@dataclass -class BaseModelOutput(ModelOutputBase): - """ - Base class for model's outputs, with potential hidden states and attentions. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. 
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - """ - - last_hidden_state: Tensor = None - hidden_states: Optional[Tuple[Tensor]] = None - attentions: Optional[Tuple[Tensor]] = None - - @dataclass class BackboneModelOutput(ModelOutputBase): """The output class for text classification models. @@ -196,81 +30,6 @@ class AttentionBackboneModelOutput(BackboneModelOutput): """The output class for backbones of attention based models. Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the - attention softmax, used to compute the weighted average in the - self-attention heads. - """ - attentions: Tensor = None - past_key_values: Tensor = None - cross_attentions: Tensor = None - - -@dataclass -class AttentionTextClassificationModelOutput(TextClassificationModelOutput): - """The output class for backbones of attention based models. - - Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the - attention softmax, used to compute the weighted average in the - self-attention heads. - """ - attentions: Tensor = None - hidden_states: Tensor = None - - -@dataclass -class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput): - """The output class for backbones of attention based models. - - Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the attention softmax, - used to compute the weighted average in the self-attention heads. - """ - attentions: Tensor = None - hidden_states: Tensor = None - - -@dataclass -class AttentionFillMaskModelOutput(FillMaskModelOutput): - """The output class for the fill mask and attention based models. - - Args: - attentions (`tuple(Tensor)`, *optional* Attentions weights after the - attention softmax, used to compute the weighted average in the - self-attention heads. - """ - attentions: Tensor = None - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutputBase): - """ - Base class for model's outputs that also contains a pooling of the last - hidden states. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - pooler_output (`torch.FloatTensor` of shape `(batch_size, - hidden_size)`): - Last layer hidden-state of the first token of the sequence - (classification token) after further processing through the layers - used for the auxiliary pretraining task. E.g. for BERT-family of - models, this returns the classification token after processing - through a linear layer and a tanh activation function. The linear - layer weights are trained from the next sentence prediction - (classification) objective during pretraining. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. 
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): @@ -303,75 +62,8 @@ class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutputBase): can be used (see `past_key_values` input) to speed up sequential decoding. """ - - last_hidden_state: Tensor = None - pooler_output: Tensor = None - hidden_states: Tensor = None - past_key_values: Tensor = None attentions: Tensor = None - cross_attentions: Tensor = None - - -@dataclass -class BaseModelOutputWithPastAndCrossAttentions(ModelOutputBase): - """ - Base class for model's outputs that may also contain a past key/values (to - speed up sequential decoding). - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - - If `past_key_values` is used only the last hidden-state of the - sequences of shape `(batch_size, 1, hidden_size)` is output. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned - when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, - with each tuple having 2 tensors of shape `(batch_size, num_heads, - sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, - embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the - self-attention blocks and optionally if - `config.is_encoder_decoder=True` in the cross-attention blocks) that - can be used (see `past_key_values` input) to speed up sequential - decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` and `config.add_cross_attention=True` is passed - or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights of the decoder's cross-attention layer, after the - attention softmax, used to compute the weighted average in the - cross-attention heads. - """ - - last_hidden_state: Tensor = None past_key_values: Tensor = None - hidden_states: Tensor = None - attentions: Tensor = None cross_attentions: Tensor = None @@ -459,6 +151,60 @@ class Seq2SeqModelOutput(ModelOutputBase): encoder_attentions: Optional[Tuple[Tensor]] = None +@dataclass +class FaqQuestionAnsweringOutput(ModelOutputBase): + """The output class for faq QA models. 
+ """ + + scores: Tensor = None + + +@dataclass +class FeatureExtractionOutput(ModelOutputBase): + """The output class for feature extraction models. + """ + + text_embedding: Tensor = None + + +@dataclass +class FillMaskModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + input_ids (`Tensor`, *optional*) The input id tensor fed into the model. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + input_ids: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionFillMaskModelOutput(FillMaskModelOutput): + """The output class for the fill mask and attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + + +@dataclass +class InformationExtractionOutput(ModelOutputBase): + """The output class for information extraction models. + """ + + spo_list: np.ndarray = None + + @dataclass class Seq2SeqLMOutput(ModelOutputBase): """ @@ -543,6 +289,42 @@ class Seq2SeqLMOutput(ModelOutputBase): encoder_attentions: Optional[Tuple[Tensor]] = None +@dataclass +class TextClassificationModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + logits (`Tensor`): The logits output of the model. loss (`Tensor`, + *optional*) The loss of the model, available when training. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + + +@dataclass +class AttentionTextClassificationModelOutput(TextClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class TextErrorCorrectionOutput(ModelOutputBase): + """The output class for information extraction models. + """ + + predictions: np.ndarray = None + + @dataclass class TextGenerationModelOutput(ModelOutputBase): """The output class for text generation models. @@ -588,3 +370,35 @@ class TokenGeneratorOutput(ModelOutputBase): scores: Optional[Tuple[Tensor]] = None attentions: Optional[Tuple[Tuple[Tensor]]] = None hidden_states: Optional[Tuple[Tuple[Tensor]]] = None + + +@dataclass +class TokenClassificationModelOutput(ModelOutputBase): + """The output class for token classification models. + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + predictions: A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. 
+ """ + + logits: Tensor = None + loss: Tensor = None + offset_mapping: Tensor = None + predictions: Tensor = None + label_mask: Tensor = None + + +@dataclass +class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the attention softmax, + used to compute the weighted average in the self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index af264bf0..8cb8600a 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -12,7 +12,7 @@ import numpy as np from modelscope.models.base import Model from modelscope.msdatasets import MsDataset -from modelscope.outputs import TASK_OUTPUTS +from modelscope.outputs import TASK_OUTPUTS, ModelOutputBase from modelscope.pipeline_inputs import TASK_INPUTS, check_input_type from modelscope.preprocessors import Preprocessor from modelscope.utils.config import Config @@ -321,6 +321,8 @@ class Pipeline(ABC): return output_keys = TASK_OUTPUTS[task_name] missing_keys = [] + input = input.keys() if isinstance(input, + (dict, ModelOutputBase)) else input for k in output_keys: if k not in input: missing_keys.append(k) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 097ff9ee..c1634a9c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -298,6 +298,7 @@ def pipeline(task: str = None, raise ValueError('task or pipeline_name is required') model = normalize_model_input(model, model_revision) + pipeline_props = {'type': pipeline_name} if pipeline_name is None: # get default pipeline for this task if isinstance(model, str) \ @@ -309,7 +310,7 @@ def pipeline(task: str = None, model, str) else read_config( model[0], revision=model_revision) check_config(cfg) - pipeline_name = cfg.pipeline.type + pipeline_props = cfg.pipeline elif model is not None: # get pipeline info from Model object first_model = model[0] if isinstance(model, list) else model @@ -318,13 +319,15 @@ def pipeline(task: str = None, cfg = read_config(first_model.model_dir) check_config(cfg) first_model.pipeline = cfg.pipeline - pipeline_name = first_model.pipeline.type + pipeline_props = first_model.pipeline else: pipeline_name, default_model_repo = get_default_pipeline_info(task) model = normalize_model_input(default_model_repo, model_revision) + pipeline_props = {'type': pipeline_name} - cfg = ConfigDict(type=pipeline_name, model=model) - cfg.device = device + pipeline_props['model'] = model + pipeline_props['device'] = device + cfg = ConfigDict(pipeline_props) if kwargs: cfg.update(kwargs) diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py index 37cae4ce..cde70fff 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/base.py +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -61,6 +61,8 @@ class EasyCVPipeline(object): self.cfg = Config.from_file(self.config_file) if 'device' in kwargs: kwargs['device'] = create_device(kwargs['device']) + if 'predictor_config' in kwargs: + kwargs.pop('predictor_config') self.predict_op = self._build_predict_op(**kwargs) def _build_predict_op(self, **kwargs): diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index fd731ef6..eaff2144 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ 
b/modelscope/pipelines/nlp/__init__.py @@ -12,22 +12,19 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .document_segmentation_pipeline import DocumentSegmentationPipeline from .extractive_summarization_pipeline import ExtractiveSummarizationPipeline - from .fasttext_sequence_classification_pipeline import FasttextSequenceClassificationPipeline + from .fasttext_text_classification_pipeline import FasttextSequenceClassificationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline from .information_extraction_pipeline import InformationExtractionPipeline - from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline, \ - NamedEntityRecognitionThaiPipeline, \ - NamedEntityRecognitionVietPipeline + from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .text_ranking_pipeline import TextRankingPipeline from .sentence_embedding_pipeline import SentenceEmbeddingPipeline from .text_classification_pipeline import TextClassificationPipeline from .summarization_pipeline import SummarizationPipeline from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline - from .text_generation_pipeline import TextGenerationPipeline - from .text2text_generation_pipeline import Text2TextGenerationPipeline + from .text_generation_pipeline import TextGenerationPipeline, TextGenerationT5Pipeline from .token_classification_pipeline import TokenClassificationPipeline from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline, WordSegmentationThaiPipeline @@ -56,8 +53,6 @@ else: 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': [ 'NamedEntityRecognitionPipeline', - 'NamedEntityRecognitionThaiPipeline', - 'NamedEntityRecognitionVietPipeline' ], 'text_ranking_pipeline': ['TextRankingPipeline'], 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], @@ -66,7 +61,8 @@ else: ['TableQuestionAnsweringPipeline'], 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], - 'text_generation_pipeline': ['TextGenerationPipeline'], + 'text_generation_pipeline': + ['TextGenerationPipeline', 'TextGenerationT5Pipeline'], 'text2text_generation_pipeline': ['Text2TextGenerationPipeline'], 'token_classification_pipeline': ['TokenClassificationPipeline'], 'translation_pipeline': ['TranslationPipeline'], diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index afd5e29f..33e06685 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -24,18 +24,27 @@ class ConversationalTextToSqlPipeline(Pipeline): def __init__(self, model: Union[StarForTextToSql, str], preprocessor: ConversationalTextToSqlPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """use `model` and `preprocessor` to create a conversational text-to-sql prediction pipeline Args: - model (StarForTextToSql): a model instance - preprocessor (ConversationalTextToSqlPreprocessor): - a preprocessor instance + model 
(StarForTextToSql): A model instance + preprocessor (ConversationalTextToSqlPreprocessor): A preprocessor instance + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = ConversationalTextToSqlPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py index c803663b..f53f186c 100644 --- a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py @@ -22,6 +22,9 @@ class DialogIntentPredictionPipeline(Pipeline): def __init__(self, model: Union[SpaceForDialogIntent, str], preprocessor: DialogIntentPredictionPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """Use `model` and `preprocessor` to create a dialog intent prediction pipeline @@ -29,11 +32,18 @@ class DialogIntentPredictionPipeline(Pipeline): model (str or SpaceForDialogIntent): Supply either a local model dir or a model id from the model hub, or a SpaceForDialogIntent instance. preprocessor (DialogIntentPredictionPreprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = DialogIntentPredictionPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) self.categories = self.preprocessor.categories def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py index c0cd52dd..c2cf2493 100644 --- a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py @@ -21,6 +21,9 @@ class DialogModelingPipeline(Pipeline): def __init__(self, model: Union[SpaceForDialogModeling, str], preprocessor: DialogModelingPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """Use `model` and `preprocessor` to create a dialog modeling pipeline for dialog response generation @@ -28,11 +31,18 @@ class DialogModelingPipeline(Pipeline): model (str or SpaceForDialogModeling): Supply either a local model dir or a model id from the model hub, or a SpaceForDialogModeling instance. preprocessor (DialogModelingPreprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
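# [Editor's illustrative note, not part of the patch] With this change the NLP pipelines
# expose config_file/device/auto_collate explicitly and forward remaining kwargs to their
# preprocessor; the pipeline() factory in builder.py likewise now forwards extra keys
# found under cfg.pipeline. A hedged sketch with a hypothetical model id:
from modelscope.pipelines.nlp.conversational_text_to_sql_pipeline import \
    ConversationalTextToSqlPipeline

pipe = ConversationalTextToSqlPipeline(
    model='damo/some_text-to-sql_model',  # hypothetical model id or local model dir
    device='cpu',                         # overrides the 'gpu' default
    auto_collate=True)                    # extra kwargs would reach the preprocessor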
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = DialogModelingPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index b7adf904..207b4f81 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -22,6 +22,9 @@ class DialogStateTrackingPipeline(Pipeline): def __init__(self, model: Union[SpaceForDST, str], preprocessor: DialogStateTrackingPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """use `model` and `preprocessor` to create a dialog state tracking pipeline for observation of dialog states tracking after many turns of open domain dialogue @@ -30,11 +33,20 @@ class DialogStateTrackingPipeline(Pipeline): model (str or SpaceForDialogStateTracking): Supply either a local model dir or a model id from the model hub, or a SpaceForDialogStateTracking instance. preprocessor (DialogStateTrackingPreprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: self.preprocessor = DialogStateTrackingPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) self.tokenizer = self.preprocessor.tokenizer self.config = self.preprocessor.config diff --git a/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py index 325d3303..216d5302 100644 --- a/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py @@ -21,8 +21,16 @@ class DistributedGPT3Pipeline(DistributedPipeline): model = None def __init__(self, model, preprocessor=None, **kwargs): + """ + + Args: + model: The model piece, str is not supported. + preprocessor: The preprocessor matched with the model. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
+ """ if preprocessor is None: - preprocessor = TextGenerationJiebaPreprocessor(model) + preprocessor = TextGenerationJiebaPreprocessor(model, **kwargs) super().__init__(model, preprocessor=preprocessor, **kwargs) assert hasattr(preprocessor, 'tokenizer') diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py index 8499f7ff..fe42e472 100644 --- a/modelscope/pipelines/nlp/distributed_plug_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -8,7 +8,7 @@ from modelscope.metainfo import Pipelines from modelscope.models.nlp.plug import DistributedPlug from modelscope.pipelines.base import DistributedPipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TextGenerationPreprocessor +from modelscope.preprocessors import TextGenerationTransformersPreprocessor from modelscope.utils.constant import Tasks @@ -24,11 +24,12 @@ class DistributedPlugPipeline(DistributedPipeline): model, preprocessor=None, first_sequence='sentence', + sequence_length=512, **kwargs): """Create a plug pipeline instance. Args: - model: The model_id of plug(damo/nlp_plug_text-generation_27B). + model: The model_id of plug(damo/nlp_plug_text-generation_27B). The default path to damo/nlp_plug_text-generation_27B can be obtained by function get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to this path before calling this class by model_id. @@ -53,17 +54,16 @@ class DistributedPlugPipeline(DistributedPipeline): |_ mp_rank_05_model_states.pt |_ mp_rank_06_model_states.pt |_ mp_rank_07_model_states.pt - preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will + preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will be used as default. - first_sequence: The first_sequence key name if the input format is a dict. - kwargs: - sequence_length: The input sequence_length. + kwargs (dict, `optional`): Extra kwargs passed into the preprocessor's constructor. 
""" if preprocessor is None: - preprocessor = TextGenerationPreprocessor( + preprocessor = TextGenerationTransformersPreprocessor( model, first_sequence=first_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) + sequence_length=sequence_length, + **kwargs) super().__init__(model, preprocessor=preprocessor, **kwargs) assert hasattr(preprocessor, 'tokenizer') self.cls_token_id = preprocessor.tokenizer.cls_token_id diff --git a/modelscope/pipelines/nlp/document_segmentation_pipeline.py b/modelscope/pipelines/nlp/document_segmentation_pipeline.py index b29dcca7..6e2121c3 100644 --- a/modelscope/pipelines/nlp/document_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/document_segmentation_pipeline.py @@ -14,7 +14,8 @@ from modelscope.models.nlp.ponet.configuration import PoNetConfig from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import DocumentSegmentationPreprocessor +from modelscope.preprocessors import \ + DocumentSegmentationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -27,26 +28,34 @@ __all__ = ['DocumentSegmentationPipeline'] Tasks.document_segmentation, module_name=Pipelines.document_segmentation) class DocumentSegmentationPipeline(Pipeline): - def __init__(self, - model: Union[Model, str], - preprocessor: DocumentSegmentationPreprocessor = None, - **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def __init__( + self, + model: Union[Model, str], + preprocessor: DocumentSegmentationTransformersPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): + """The document segmentation pipeline. - self.model_dir = self.model.model_dir - self.model_cfg = self.model.forward() - - if self.model_cfg['type'] == 'bert': - config = BertConfig.from_pretrained(self.model_dir, num_labels=2) - elif self.model_cfg['type'] == 'ponet': - config = PoNetConfig.from_pretrained(self.model_dir, num_labels=2) - - self.document_segmentation_model = self.model.build_with_config( - config=config) + Args: + model (str or Model): Supply either a local model dir or a model id from the model hub + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. 
+ """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + self.model_dir = self.model.model_dir + self.model_cfg = self.model.model_cfg if preprocessor is None: - self.preprocessor = DocumentSegmentationPreprocessor( - self.model.model_dir, config) + self.preprocessor = DocumentSegmentationTransformersPreprocessor( + self.model_dir, self.model.config.max_position_embeddings, + **kwargs) def __call__( self, documents: Union[List[List[str]], List[str], @@ -85,8 +94,7 @@ class DocumentSegmentationPipeline(Pipeline): key: torch.tensor(val) for key, val in predict_dataset.items() } - predictions = self.document_segmentation_model.forward( - **input).logits + predictions = self.model.forward(**input).logits predictions = np.argmax(predictions, axis=2) assert len(sentences) == len( diff --git a/modelscope/pipelines/nlp/extractive_summarization_pipeline.py b/modelscope/pipelines/nlp/extractive_summarization_pipeline.py index b35ecc78..1581690e 100644 --- a/modelscope/pipelines/nlp/extractive_summarization_pipeline.py +++ b/modelscope/pipelines/nlp/extractive_summarization_pipeline.py @@ -6,15 +6,14 @@ from typing import Any, Dict, List, Union import numpy as np import torch from datasets import Dataset -from transformers.models.bert.modeling_bert import BertConfig from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp.ponet.configuration import PoNetConfig from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import DocumentSegmentationPreprocessor +from modelscope.preprocessors import \ + DocumentSegmentationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -28,31 +27,29 @@ __all__ = ['ExtractiveSummarizationPipeline'] module_name=Pipelines.extractive_summarization) class ExtractiveSummarizationPipeline(Pipeline): - def __init__(self, - model: Union[Model, str], - preprocessor: DocumentSegmentationPreprocessor = None, - **kwargs): - - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - - self.model_dir = model.model_dir - self.model_cfg = model.forward() - - if self.model_cfg['type'] == 'bert': - config = BertConfig.from_pretrained(model.model_dir, num_labels=2) - elif self.model_cfg['type'] == 'ponet': - config = PoNetConfig.from_pretrained(model.model_dir, num_labels=2) - - self.extractive_summarization_model = model.build_with_config( - config=config) + def __init__( + self, + model: Union[Model, str], + preprocessor: DocumentSegmentationTransformersPreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): + + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + + self.model_dir = self.model.model_dir + self.model_cfg = self.model.model_cfg if preprocessor is None: - preprocessor = DocumentSegmentationPreprocessor( - self.model_dir, config) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - - self.preprocessor = preprocessor + self.preprocessor = DocumentSegmentationTransformersPreprocessor( + self.model_dir, self.model.config.max_position_embeddings, + **kwargs) def __call__(self, documents: Union[List[str], str]) -> Dict[str, Any]: output = self.predict(documents) @@ -80,8 +77,7 @@ class 
ExtractiveSummarizationPipeline(Pipeline): key: torch.tensor(val) for key, val in predict_dataset.items() } - logits = self.extractive_summarization_model.forward( - **input).logits + logits = self.model.forward(**input).logits predictions = np.argmax(logits, axis=2) assert len(sentences) == len( diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 46d75f49..5675144a 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -20,8 +20,24 @@ class FaqQuestionAnsweringPipeline(Pipeline): def __init__(self, model: Union[str, Model], preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + """The faq question answering pipeline. + + Args: + model (str or Model): A model instance or a model local dir or a model id in the model hub. + preprocessor (Preprocessor, `optional`): a preprocessor instance + kwargs (dict, `optional`): + The preprocessor kwargs passed into the preprocessor's constructor. + """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, **kwargs) diff --git a/modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py b/modelscope/pipelines/nlp/fasttext_text_classification_pipeline.py similarity index 85% rename from modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py rename to modelscope/pipelines/nlp/fasttext_text_classification_pipeline.py index f10af88f..a3138490 100644 --- a/modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/fasttext_text_classification_pipeline.py @@ -9,11 +9,9 @@ from fasttext import load_model from fasttext.FastText import _FastText from modelscope.metainfo import Pipelines -from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import ModelFile, Tasks __all__ = ['FasttextSequenceClassificationPipeline'] @@ -36,8 +34,7 @@ class FasttextSequenceClassificationPipeline(Pipeline): """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction Args: - model: a model directory including model.bin and spm.model - preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + model: A model directory including model.bin and spm.model """ super().__init__(model=model) model_file = os.path.join(model, ModelFile.TORCH_MODEL_BIN_FILE) @@ -53,8 +50,11 @@ class FasttextSequenceClassificationPipeline(Pipeline): text_sp = sentencepiece_tokenize(self.spm, text) return {'text_sp': text_sp, 'text': text} - def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - topk = inputs.get('topk', -1) + def forward(self, + inputs: Dict[str, Any], + topk: int = None) -> Dict[str, Any]: + if topk is None: + topk = inputs.get('topk', -1) label, probs = self.model.predict(inputs['text_sp'], k=topk) label = [x.replace('__label__', '') for x in label] result = { diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py 
b/modelscope/pipelines/nlp/feature_extraction_pipeline.py index aed78868..2ea264f0 100644 --- a/modelscope/pipelines/nlp/feature_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLPPreprocessor, Preprocessor +from modelscope.preprocessors import (FillMaskTransformersPreprocessor, + Preprocessor) from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -23,7 +24,11 @@ class FeatureExtractionPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + padding=False, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp feature extraction pipeline for prediction @@ -32,11 +37,8 @@ class FeatureExtractionPipeline(Pipeline): no-head model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - first_sequence: The key to read the sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline @@ -46,19 +48,21 @@ class FeatureExtractionPipeline(Pipeline): """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - self.preprocessor = NLPPreprocessor( + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - padding=kwargs.pop('padding', False), - sequence_length=kwargs.pop('sequence_length', 128)) + padding=padding, + sequence_length=sequence_length, + **kwargs) self.model.eval() - self.config = Config.from_file( - os.path.join(self.model.model_dir, ModelFile.CONFIGURATION)) - self.tokenizer = self.preprocessor.tokenizer - def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index d7dc70f8..af731d00 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -23,7 +23,11 @@ class FillMaskPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence: str = 'sentence', + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + first_sequence='sentence', + sequence_length=128, **kwargs): """The inference pipeline for all the fill mask sub-tasks. @@ -31,11 +35,8 @@ class FillMaskPipeline(Pipeline): model (`str` or `Model` or module instance): A model instance or a model local dir or a model id in the model hub. preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. - first_sequence (`str`, `optional`): The key to read the sentence in. 
- sequence_length (`int`, `optional`): Max sequence length in the user's custom scenario, default 128. - - NOTE1: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example1: >>> from modelscope.pipelines import pipeline @@ -51,20 +52,25 @@ class FillMaskPipeline(Pipeline): NOTE2: Please pay attention to the model's special tokens. If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is '<mask>'. - To view other examples plese check the tests/pipelines/test_fill_mask.py. + To view other examples please check tests/pipelines/test_fill_mask.py. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 128)) - assert hasattr( - self.preprocessor, 'mask_id' - ), 'The input preprocessor should have the mask_id attribute.' - + sequence_length=sequence_length, + **kwargs) self.model.eval() + assert hasattr( + self.preprocessor, 'mask_id' + ), 'The input preprocessor should have the mask_id attribute.' def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index cf96fd36..0c726c9a 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -8,8 +8,7 @@ from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - RelationExtractionPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['InformationExtractionPipeline'] @@ -24,12 +23,33 @@ class InformationExtractionPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=512, **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = RelationExtractionPreprocessor( + """ + + Args: + model (str or Model): Supply either a local model dir which supported information extraction task, or a + model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor.
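# [Editor's illustrative note, not part of the patch] Several pipelines in this patch
# (fill-mask, information extraction, NER) now resolve their preprocessor generically via
# Preprocessor.from_pretrained instead of hard-coding a concrete class. A sketch with a
# hypothetical local directory; extra kwargs reach the concrete preprocessor's constructor.
from modelscope.preprocessors import Preprocessor

preprocessor = Preprocessor.from_pretrained(
    '/path/to/local/model_dir',  # hypothetical; inside a pipeline this is self.model.model_dir
    sequence_length=512)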
+ """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + + if self.preprocessor is None: + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) + sequence_length=sequence_length, + **kwargs) self.model.eval() def forward(self, inputs: Dict[str, Any], diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 74b380ec..9c5600fd 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -1,36 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Optional, Union - -import torch +from typing import Optional, Union from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.nlp import TokenClassificationPipeline -from modelscope.preprocessors import (NERPreprocessorThai, NERPreprocessorViet, - Preprocessor, - TokenClassificationPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) -__all__ = [ - 'NamedEntityRecognitionPipeline', 'NamedEntityRecognitionThaiPipeline', - 'NamedEntityRecognitionVietPipeline' -] +__all__ = ['NamedEntityRecognitionPipeline'] @PIPELINES.register_module( Tasks.named_entity_recognition, module_name=Pipelines.named_entity_recognition) +@PIPELINES.register_module( + Tasks.named_entity_recognition, + module_name=Pipelines.named_entity_recognition_thai) +@PIPELINES.register_module( + Tasks.named_entity_recognition, + module_name=Pipelines.named_entity_recognition_viet) class NamedEntityRecognitionPipeline(TokenClassificationPipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp NER pipeline for prediction @@ -39,8 +38,8 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline): model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline >>> pipeline_ins = pipeline(task='named-entity-recognition', @@ -50,44 +49,17 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline): To view other examples plese check the tests/pipelines/test_named_entity_recognition.py. 
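The three `register_module` decorators above replace the former Thai and Viet subclasses: one class is simply registered under several pipeline names, and the right preprocessor is resolved from each model's configuration. A stripped-down sketch of how stacked registrations behave (a toy registry, not the modelscope implementation):

registry = {}

def register(task, module_name):
    def decorator(cls):
        # Each stacked decorator adds one more (task, name) -> class entry.
        registry[(task, module_name)] = cls
        return cls
    return decorator

@register('named-entity-recognition', 'ner')
@register('named-entity-recognition', 'ner-thai')
@register('named-entity-recognition', 'ner-viet')
class NamedEntityRecognitionPipeline:
    pass

# All three module names resolve to the same class.
assert len({registry[key] for key in registry}) == 1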
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - self.preprocessor = TokenClassificationPreprocessor( + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) + sequence_length=sequence_length, + **kwargs) self.model.eval() - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - - -@PIPELINES.register_module( - Tasks.named_entity_recognition, - module_name=Pipelines.named_entity_recognition_thai) -class NamedEntityRecognitionThaiPipeline(NamedEntityRecognitionPipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = NERPreprocessorThai( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) - - -@PIPELINES.register_module( - Tasks.named_entity_recognition, - module_name=Pipelines.named_entity_recognition_viet) -class NamedEntityRecognitionVietPipeline(NamedEntityRecognitionPipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = NERPreprocessorViet( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) + assert hasattr(self.preprocessor, 'id2label') + self.id2label = self.preprocessor.id2label diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index adac7f1b..424a9abc 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -22,7 +22,10 @@ class SentenceEmbeddingPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence='first_sequence', + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp text dual encoder then generates the text representation. Args: @@ -30,15 +33,20 @@ class SentenceEmbeddingPipeline(Pipeline): or a model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( - self.model.model_dir - if isinstance(self.model, Model) else model, - first_sequence=first_sequence, - sequence_length=kwargs.pop('sequence_length', 128)) + self.model.model_dir, + sequence_length=sequence_length, + **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py index 6ea7cd5f..7c8355f9 100644 --- a/modelscope/pipelines/nlp/summarization_pipeline.py +++ b/modelscope/pipelines/nlp/summarization_pipeline.py @@ -1,12 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Optional, Union -from modelscope.metainfo import Pipelines -from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.metainfo import Pipelines, Preprocessors from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import OfaPreprocessor, Preprocessor -from modelscope.utils.constant import Tasks +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Fields, Tasks from modelscope.utils.logger import get_logger logger = get_logger() @@ -19,6 +18,9 @@ class SummarizationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """Use `model` and `preprocessor` to create a Summarization pipeline for prediction. @@ -26,11 +28,25 @@ class SummarizationPipeline(Pipeline): model (str or Model): Supply either a local model dir which supported the summarization task, or a model id from the model hub, or a model instance. preprocessor (Preprocessor): An optional preprocessor instance. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) self.model.eval() - if preprocessor is None and isinstance(self.model, OfaForAllTasks): - self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) + if preprocessor is None: + if self.model.__class__.__name__ == 'OfaForAllTasks': + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, + type=Preprocessors.ofa_tasks_preprocessor, + field=Fields.multi_modal) + else: + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 36f4c08a..917a70d4 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -33,6 +33,9 @@ class TableQuestionAnsweringPipeline(Pipeline): model: Union[TableQuestionAnswering, str], preprocessor: TableQuestionAnsweringPreprocessor = None, db: Database = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """use `model` and `preprocessor` to create a table question answering prediction pipeline @@ -40,11 +43,19 @@ class TableQuestionAnsweringPipeline(Pipeline): model (TableQuestionAnswering): a model instance preprocessor (TableQuestionAnsweringPreprocessor): a preprocessor instance db (Database): a database to store tables in the database + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: self.preprocessor = TableQuestionAnsweringPreprocessor( - self.model.model_dir) + self.model.model_dir, **kwargs) # initilize tokenizer self.tokenizer = BertTokenizer( diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py deleted file mode 100644 index 9bf226b9..00000000 --- a/modelscope/pipelines/nlp/text2text_generation_pipeline.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, List, Optional, Union - -import torch -from numpy import isin - -from modelscope.metainfo import Pipelines -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline, Tensor -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import Text2TextGenerationPreprocessor -from modelscope.utils.config import use_task_specific_params -from modelscope.utils.constant import Tasks - -__all__ = ['Text2TextGenerationPipeline'] - -TRANSLATE_PIPELINES = [ - Pipelines.translation_en_to_de, - Pipelines.translation_en_to_ro, - Pipelines.translation_en_to_fr, -] - - -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.text2text_generation) -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.translation_en_to_de) -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.translation_en_to_ro) -@PIPELINES.register_module( - Tasks.text2text_generation, module_name=Pipelines.translation_en_to_fr) -class Text2TextGenerationPipeline(Pipeline): - - def __init__( - self, - model: Union[Model, str], - preprocessor: Optional[Text2TextGenerationPreprocessor] = None, - first_sequence='sentence', - **kwargs): - """Use `model` and `preprocessor` to create a text to text generation pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the text generation task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='text2text-generation', - >>> model='damo/nlp_t5_text2text-generation_chinese-base') - >>> sentence1 = '中国的首都位于。' - >>> print(pipeline_ins(sentence1)) - >>> # Or use the dict input: - >>> print(pipeline_ins({'sentence': sentence1})) - >>> # 北京 - - To view other examples plese check the tests/pipelines/test_text_generation.py. 
- """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = Text2TextGenerationPreprocessor( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) - self.tokenizer = self.preprocessor.tokenizer - self.pipeline = self.model.pipeline.type - self.model.eval() - - def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: - """ Provide specific preprocess for text2text generation pipeline in order to handl multi tasks - """ - if not isinstance(inputs, str): - raise ValueError(f'Not supported input type: {type(inputs)}') - - if self.pipeline in TRANSLATE_PIPELINES: - use_task_specific_params(self.model, self.pipeline) - inputs = self.model.config.prefix + inputs - - return super().preprocess(inputs, **preprocess_params) - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - - forward_params['min_length'] = forward_params.get( - 'min_length', self.model.config.min_length) - forward_params['max_length'] = forward_params.get( - 'max_length', self.model.config.max_length) - - with torch.no_grad(): - output_ids = self.model.generate(**inputs, **forward_params) - return {'output_ids': output_ids} - - def postprocess(self, inputs: Dict[str, Tensor], - **postprocess_params) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - output = self.tokenizer.decode( - inputs['output_ids'][0], - skip_special_tokens=True, - ) - return {OutputKeys.TEXT: output} diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index fd223c76..24c07d69 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -5,11 +5,14 @@ import numpy as np from modelscope.metainfo import Pipelines, Preprocessors from modelscope.models.base import Model -from modelscope.outputs import OutputKeys +from modelscope.outputs import OutputKeys, TextClassificationModelOutput from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) @PIPELINES.register_module( @@ -31,6 +34,9 @@ class TextClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, **kwargs): """The inference pipeline for all the text classification sub-tasks. @@ -38,10 +44,8 @@ class TextClassificationPipeline(Pipeline): model (`str` or `Model` or module instance): A model instance or a model local dir or a model id in the model hub. preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. - first_sequence (`str`, `optional`): The key of the first sentence. - second_sequence (`str`, `optional`): The key of the second sentence. - sequence_length (`int`, `optional`): The sequence length. - id2label (`dict`, `optional`): The id-label mapping. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
Example: >>> from modelscope.pipelines import pipeline @@ -49,31 +53,38 @@ class TextClassificationPipeline(Pipeline): model='damo/nlp_structbert_sentence-similarity_chinese-base') >>> input = ('这是个测试', '这也是个测试') >>> print(pipeline_ins(input)) - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' and 'second_sequence' - param will have no affection. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: if self.model.__class__.__name__ == 'OfaForAllTasks': self.preprocessor = Preprocessor.from_pretrained( model_name_or_path=self.model.model_dir, type=Preprocessors.ofa_tasks_preprocessor, - field=Fields.multi_modal) + field=Fields.multi_modal, + **kwargs) else: first_sequence = kwargs.pop('first_sequence', 'first_sequence') second_sequence = kwargs.pop('second_sequence', None) + sequence_length = kwargs.pop('sequence_length', 512) self.preprocessor = Preprocessor.from_pretrained( - self.model - if isinstance(self.model, str) else self.model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label + self.model.model_dir, **{ + 'first_sequence': first_sequence, + 'second_sequence': second_sequence, + 'sequence_length': sequence_length, + **kwargs + }) + assert hasattr(self.preprocessor, 'id2label') + self.id2label = self.preprocessor.id2label + if self.id2label is None: + logger.warn( + 'The id2label mapping is None, will return original ids.' + ) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: @@ -82,16 +93,17 @@ class TextClassificationPipeline(Pipeline): return self.model(**inputs, **forward_params) def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results + inputs: Union[Dict[str, Any], + TextClassificationModelOutput], + topk: int = None) -> Dict[str, Any]: + """Process the prediction results Args: inputs (`Dict[str, Any]` or `TextClassificationModelOutput`): The model output, please check the `TextClassificationModelOutput` class for details. topk (int): The topk probs to take Returns: - Dict[str, str]: the prediction results. + Dict[str, Any]: the prediction results. scores: The probabilities of each label. labels: The real labels. Label at index 0 is the smallest probability. @@ -99,8 +111,6 @@ class TextClassificationPipeline(Pipeline): if self.model.__class__.__name__ == 'OfaForAllTasks': return inputs else: - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' 
logits = inputs[OutputKeys.LOGITS].cpu().numpy() if logits.shape[0] == 1: logits = logits[0] @@ -111,20 +121,24 @@ class TextClassificationPipeline(Pipeline): probs = softmax(logits) num_classes = probs.shape[-1] - topk = min(topk, num_classes) + topk = min(topk, num_classes) if topk is not None else num_classes top_indices = np.argpartition(probs, -topk)[-topk:] probs = np.take_along_axis(probs, top_indices, axis=-1).tolist() def map_to_label(id): - if id in self.id2label: - return self.id2label[id] - elif str(id) in self.id2label: - return self.id2label[str(id)] + if self.id2label is not None: + if id in self.id2label: + return self.id2label[id] + elif str(id) in self.id2label: + return self.id2label[str(id)] + else: + raise Exception( + f'id {id} not found in id2label: {self.id2label}') else: - raise Exception('id not found in id2label') + return id v_func = np.vectorize(map_to_label) - return { - OutputKeys.SCORES: probs, - OutputKeys.LABELS: v_func(top_indices).tolist() - } + top_indices = v_func(top_indices).tolist() + probs = list(reversed(probs)) + top_indices = list(reversed(top_indices)) + return {OutputKeys.SCORES: probs, OutputKeys.LABELS: top_indices} diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index ee8cb711..1e6d525a 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -10,7 +10,7 @@ from modelscope.models.nlp import BartForTextErrorCorrection from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TextErrorCorrectionPreprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['TextErrorCorrectionPipeline'] @@ -20,17 +20,20 @@ __all__ = ['TextErrorCorrectionPipeline'] Tasks.text_error_correction, module_name=Pipelines.text_error_correction) class TextErrorCorrectionPipeline(Pipeline): - def __init__( - self, - model: Union[BartForTextErrorCorrection, str], - preprocessor: Optional[TextErrorCorrectionPreprocessor] = None, - **kwargs): + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): """use `model` and `preprocessor` to create a nlp text correction pipeline. Args: model (BartForTextErrorCorrection): A model instance, or a model local dir, or a model id in the model hub. preprocessor (TextErrorCorrectionPreprocessor): An optional preprocessor instance. - + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline >>> pipeline_ins = pipeline( @@ -38,13 +41,17 @@ class TextErrorCorrectionPipeline(Pipeline): >>> sentence1 = '随着中国经济突飞猛近,建造工业与日俱增' >>> print(pipeline_ins(sentence1)) - To view other examples plese check the tests/pipelines/test_text_error_correction.py. + To view other examples plese check tests/pipelines/test_text_error_correction.py. 
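Stepping back to the TextClassificationPipeline.postprocess change above: with `topk=None` every class is returned, ids are mapped through `id2label` when it exists, and the result lists are reversed so the highest score comes first. A self-contained sketch of that behaviour (simplified to a full sort rather than `argpartition`):

import numpy as np

def top_labels(logits, id2label=None, topk=None):
    # Softmax over the logits, then keep the top-k classes (all when topk is None).
    probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()
    topk = probs.shape[-1] if topk is None else min(topk, probs.shape[-1])
    order = np.argsort(probs)[::-1][:topk]  # best first
    labels = [id2label.get(int(i), int(i)) if id2label else int(i) for i in order]
    return {'scores': probs[order].tolist(), 'labels': labels}

print(top_labels(np.array([0.1, 2.0, -1.0]), {0: 'neg', 1: 'pos', 2: 'neu'}))
# labels come back best first: ['pos', 'neg', 'neu']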
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - self.preprocessor = TextErrorCorrectionPreprocessor( - self.model.model_dir) + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) self.vocab = self.preprocessor.vocab def forward(self, inputs: Dict[str, Any], diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index bf1162bf..566ca359 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -1,20 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os from typing import Any, Dict, Optional, Union import torch from modelscope.metainfo import Pipelines from modelscope.models.base import Model -from modelscope.outputs import OutputKeys +from modelscope.outputs import (ModelOutputBase, OutputKeys, + TokenGeneratorOutput) from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.chinese_utils import remove_space_between_chinese_chars -from modelscope.utils.constant import Fields, Tasks -from modelscope.utils.hub import read_config +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import Config, read_config -__all__ = ['TextGenerationPipeline'] +__all__ = ['TextGenerationPipeline', 'TextGenerationT5Pipeline'] @PIPELINES.register_module( @@ -24,7 +26,11 @@ class TextGenerationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, first_sequence='sentence', + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a generation pipeline for prediction. @@ -33,11 +39,8 @@ class TextGenerationPipeline(Pipeline): or a model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - first_sequence: The key to read the first sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline @@ -49,26 +52,29 @@ class TextGenerationPipeline(Pipeline): >>> # Or use the dict input: >>> print(pipeline_ins({'sentence': sentence1})) - To view other examples plese check the tests/pipelines/test_text_generation.py. + To view other examples plese check tests/pipelines/test_text_generation.py. 
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - cfg = read_config(self.model.model_dir) - self.postprocessor = cfg.pop('postprocessor', 'decode') + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + if preprocessor is None: - preprocessor_cfg = cfg.preprocessor - preprocessor_cfg.update({ - 'model_dir': + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - 'first_sequence': - first_sequence, - 'second_sequence': - None, - 'sequence_length': - kwargs.pop('sequence_length', 128) - }) - self.preprocessor = build_preprocessor(preprocessor_cfg, - Fields.nlp) + first_sequence=first_sequence, + sequence_length=sequence_length, + **kwargs) self.model.eval() + self.postprocessor = kwargs.pop('postprocessor', None) + if self.postprocessor is None and hasattr(self.model, 'model_dir'): + # Compatible with old code + cfg = read_config(self.model.model_dir) + self.postprocessor = cfg.get('postprocessor') + if self.postprocessor is None: + self.postprocessor = 'decode' def _sanitize_parameters(self, **pipeline_parameters): return {}, pipeline_parameters, {} @@ -79,20 +85,19 @@ class TextGenerationPipeline(Pipeline): return self.model.generate(inputs, **forward_params) def decode(self, inputs) -> str: - tokenizer = self.preprocessor.tokenizer - return tokenizer.decode(inputs.tolist(), skip_special_tokens=True) + return self.preprocessor.decode( + inputs.tolist(), skip_special_tokens=True) def sentence_piece(self, inputs) -> str: - tokenizer = self.preprocessor.tokenizer - return tokenizer.decode(inputs.tolist()) + return self.preprocessor.decode(inputs.tolist()) def roberta(self, inputs) -> str: - tokenizer = self.preprocessor.tokenizer - decoded = tokenizer.decode(inputs.tolist()) + decoded = self.preprocessor.decode(inputs.tolist()) return decoded.replace('', '. ').replace('', '. 
').replace('', '') - def postprocess(self, inputs: Dict[str, Tensor], + def postprocess(self, inputs: Union[Dict[str, Tensor], + TokenGeneratorOutput], **postprocess_params) -> Dict[str, str]: """process the prediction results @@ -102,9 +107,72 @@ class TextGenerationPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - inputs = inputs['sequences'] + if isinstance(inputs, (dict, ModelOutputBase)): + inputs = inputs['sequences'] if isinstance(inputs, list) or len(inputs.shape) > 1: inputs = inputs[0] decoded = getattr(self, self.postprocessor)(inputs) text = remove_space_between_chinese_chars(decoded) return {OutputKeys.TEXT: text} + + +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_de) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_ro) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_fr) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.text2text_generation) +class TextGenerationT5Pipeline(TextGenerationPipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + sub_task=None, + **kwargs): + super().__init__(model, preprocessor, **kwargs) + self.sub_task = sub_task + self.task_specific_params = self._parse_specific_model_params( + getattr(self.model, 'model_dir', None), 'task_specific_params') + self.min_length = self._parse_specific_model_params( + getattr(self.model, 'model_dir', None), 'min_length') + self.max_length = self._parse_specific_model_params( + getattr(self.model, 'model_dir', None), 'max_length') + + def _parse_specific_model_params(self, model_dir, key): + if model_dir is None: + return + + cfg: Config = read_config(model_dir) + params = cfg.safe_get(f'model.{key}') + if params is None: + cfg: Config = read_config(os.path.join(model_dir, 'config.json')) + params = cfg.safe_get(key) + return params + + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: + if not isinstance(inputs, str): + raise ValueError(f'Not supported input type: {type(inputs)}') + + if self.task_specific_params is not None: + sub_task = self.sub_task or self.model.pipeline.type + if sub_task in self.task_specific_params: + self.model.config.update(self.task_specific_params[sub_task]) + if 'prefix' in self.task_specific_params[sub_task]: + inputs = self.task_specific_params[sub_task].prefix + inputs + + return super().preprocess(inputs, **preprocess_params) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + + min_length = forward_params.get('min_length', self.min_length) + max_length = forward_params.get('max_length', self.max_length) + if min_length is not None: + forward_params['min_length'] = min_length + if max_length is not None: + forward_params['max_length'] = max_length + + with torch.no_grad(): + return self.model.generate(**inputs, **forward_params) diff --git a/modelscope/pipelines/nlp/text_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py index fe627e5f..dfd0d433 100644 --- a/modelscope/pipelines/nlp/text_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import Preprocessor, TextRankingPreprocessor +from 
modelscope.preprocessors import (Preprocessor, + TextRankingTransformersPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['TextRankingPipeline'] @@ -22,6 +23,10 @@ class TextRankingPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """Use `model` and `preprocessor` to create a nlp word segment pipeline for prediction. @@ -30,14 +35,21 @@ class TextRankingPipeline(Pipeline): or a model id from the model hub, or a torch model instance. preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) + sequence_length=sequence_length, + **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 86cc49b7..63f241a2 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -1,7 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union +import numpy as np import torch from modelscope.metainfo import Pipelines @@ -32,24 +33,35 @@ class TokenClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, **kwargs): """use `model` and `preprocessor` to create a token classification pipeline for prediction Args: model (str or Model): A model instance or a model local dir or a model id in the model hub. preprocessor (Preprocessor): a preprocessor instance, must not be None. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. 
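Further down, `_chunk_process` now trims batched tensors itself: logits or predictions are reduced to a single sequence and, when a `label_mask` is present, both the predictions and the offset mapping are narrowed to the number of real (unpadded) tokens. Roughly, with made-up shapes:

import torch

predictions = torch.tensor([[1, 2, 2, 0, 0]])  # batch of 1, padded to length 5
offset_mapping = torch.tensor([[[0, 1], [1, 2], [2, 3], [0, 0], [0, 0]]])
label_mask = torch.tensor([[1, 1, 1, 0, 0]])

predictions = predictions[0]
offset_mapping = offset_mapping[0]
masked_length = int(label_mask.sum(-1).item())  # 3 real tokens
predictions = torch.narrow(predictions, 0, 0, masked_length)
offset_mapping = torch.narrow(offset_mapping, 0, 0, masked_length)
print(predictions.tolist(), offset_mapping.tolist())
# [1, 2, 2] [[0, 1], [1, 2], [2, 3]]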
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) + sequence_length=sequence_length, + **kwargs) self.model.eval() - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label + assert hasattr(self.preprocessor, 'id2label') + self.id2label = self.preprocessor.id2label def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: @@ -60,53 +72,59 @@ class TokenClassificationPipeline(Pipeline): } def postprocess(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: - """process the prediction results + **postprocess_params) -> Dict[str, Any]: + """Process the prediction results Args: inputs (Dict[str, Any]): should be tensors from model Returns: - Dict[str, str]: the prediction results + Dict[str, Any]: the prediction results """ chunks = self._chunk_process(inputs, **postprocess_params) - - # for cws outputs - if len(chunks) > 0 and chunks[0]['type'].lower() == 'cws': - spans = [ - chunk['span'] for chunk in chunks if chunk['span'].strip() - ] - seg_result = [span for span in spans] - outputs = {OutputKeys.OUTPUT: seg_result} - - # for ner outputs - else: - outputs = {OutputKeys.OUTPUT: chunks} - return outputs + return {OutputKeys.OUTPUT: chunks} def _chunk_process(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: + **postprocess_params) -> List: """process the prediction results and output as chunks Args: inputs (Dict[str, Any]): should be tensors from model Returns: - Dict[str, str]: the prediction results + List: The output chunks """ text = inputs['text'] + # TODO post_process does not support batch for now. 
if OutputKeys.PREDICTIONS not in inputs: logits = inputs[OutputKeys.LOGITS] - predictions = torch.argmax(logits[0], dim=-1) + if len(logits.shape) == 3: + logits = logits[0] + predictions = torch.argmax(logits, dim=-1) else: - predictions = inputs[OutputKeys.PREDICTIONS].squeeze( - 0).cpu().numpy() + predictions = inputs[OutputKeys.PREDICTIONS] + if len(predictions.shape) == 2: + predictions = predictions[0] + + offset_mapping = inputs['offset_mapping'] + if len(offset_mapping.shape) == 3: + offset_mapping = offset_mapping[0] + + label_mask = inputs.get('label_mask') + if label_mask is not None: + masked_lengths = label_mask.sum(-1).long().cpu().item() + offset_mapping = torch.narrow( + offset_mapping, 0, 0, + masked_lengths) # index_select only move loc, not resize + predictions = torch.narrow( + predictions, 0, 0, + masked_lengths) # index_select only move loc, not resize + + offset_mapping = torch_nested_numpify( + torch_nested_detach(offset_mapping)) predictions = torch_nested_numpify(torch_nested_detach(predictions)) - offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] - labels = [self.id2label[x] for x in predictions] - if len(labels) > len(offset_mapping): - labels = labels[1:-1] + chunks = [] chunk = {} for label, offsets in zip(labels, offset_mapping): diff --git a/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py index 57fc646a..41f833dc 100644 --- a/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py @@ -2,19 +2,15 @@ import io import os -from typing import Any, Dict, Union +from typing import Any, Dict -import numpy as np import torch from transformers import XLMRobertaTokenizer from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import BertForSequenceClassification from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import ModelFile, Tasks __all__ = ['TranslationQualityEstimationPipeline'] diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 9fe2ad93..ee49d9a5 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -10,9 +10,9 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.nlp import TokenClassificationPipeline -from modelscope.preprocessors import (Preprocessor, - TokenClassificationPreprocessor, - WordSegmentationPreprocessorThai) +from modelscope.preprocessors import ( + Preprocessor, TokenClassificationTransformersPreprocessor, + WordSegmentationPreprocessorThai) from modelscope.utils.constant import Tasks from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) @@ -23,42 +23,49 @@ __all__ = ['WordSegmentationPipeline', 'WordSegmentationThaiPipeline'] @PIPELINES.register_module( Tasks.word_segmentation, module_name=Pipelines.word_segmentation) class WordSegmentationPipeline(TokenClassificationPipeline): + """Use `model` and `preprocessor` to create a nlp word segment pipeline for 
prediction. - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - **kwargs): - """Use `model` and `preprocessor` to create a nlp word segment pipeline for prediction. + NOTE: The preprocessor will first split the sentence into single characters, + then feed them into the tokenizer with the parameter is_split_into_words=True. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='word-segmentation', + >>> model='damo/nlp_structbert_word-segmentation_chinese-base') + >>> sentence1 = '今天天气不错,适合出去游玩' + >>> print(pipeline_ins(sentence1)) + + To view other examples plese check tests/pipelines/test_word_segmentation.py. + """ + + def postprocess(self, + inputs: Dict[str, Any], + output_final_sentence=True, + **postprocess_params) -> Dict[str, Any]: + """Process the prediction results Args: - model (str or Model): Supply either a local model dir which supported the WS task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: The preprocessor will first split the sentence into single characters, - then feed them into the tokenizer with the parameter is_split_into_words=True. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='word-segmentation', - >>> model='damo/nlp_structbert_word-segmentation_chinese-base') - >>> sentence1 = '今天天气不错,适合出去游玩' - >>> print(pipeline_ins(sentence1)) - - To view other examples plese check the tests/pipelines/test_word_segmentation.py. + inputs (Dict[str, Any]): should be tensors from model + output_final_sentence (bool): Output the cut sentence splitted by blanks or not. + If False, the pipeline will output the original token-label information. + + Returns: + Dict[str, Any]: The prediction results. 
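In other words, the word-segmentation specific logic now lives entirely in `postprocess`: with `output_final_sentence=True` (the default) the chunk spans are flattened into a token list, otherwise the raw chunks are returned. A toy illustration of the two modes (chunk dicts simplified to just their spans):

chunks = [{'span': '今天'}, {'span': '天气'}, {'span': '不错'}, {'span': ' '}]

def ws_output(chunks, output_final_sentence=True):
    if output_final_sentence:
        # Keep only non-blank spans, i.e. the segmented tokens.
        return [c['span'] for c in chunks if c['span'].strip()]
    return chunks

print(ws_output(chunks))         # ['今天', '天气', '不错']
print(ws_output(chunks, False))  # the unmodified chunk dicts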
""" - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if preprocessor is None: - self.preprocessor = TokenClassificationPreprocessor( - self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 128)) - self.model.eval() + chunks = self._chunk_process(inputs, **postprocess_params) + + # for cws outputs + if output_final_sentence: + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = [span for span in spans] + outputs = {OutputKeys.OUTPUT: seg_result} - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label + # for ner outputs + else: + outputs = {OutputKeys.OUTPUT: chunks} + return outputs @PIPELINES.register_module( @@ -66,8 +73,10 @@ class WordSegmentationPipeline(TokenClassificationPipeline): module_name=Pipelines.multilingual_word_segmentation) class MultilingualWordSegmentationPipeline(WordSegmentationPipeline): - def postprocess(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: + def postprocess(self, + inputs: Dict[str, Any], + output_final_sentence=True, + **postprocess_params) -> Dict[str, Any]: chunks = self._chunk_process(inputs, **postprocess_params) word_segments = [entity['span'] for entity in chunks] return {OutputKeys.OUTPUT: word_segments} @@ -80,14 +89,22 @@ class WordSegmentationThaiPipeline(MultilingualWordSegmentationPipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=512, **kwargs): - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) if preprocessor is None: - preprocessor = WordSegmentationPreprocessorThai( - model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.preprocessor = WordSegmentationPreprocessorThai( + self.model.model_dir, + sequence_length=sequence_length, + **kwargs) def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 31b556d7..3db73d8b 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -10,8 +10,7 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - ZeroShotClassificationPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['ZeroShotClassificationPipeline'] @@ -25,6 +24,10 @@ class ZeroShotClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=512, **kwargs): """Use `model` and `preprocessor` to create a nlp zero shot classifiction for prediction. @@ -44,7 +47,8 @@ class ZeroShotClassificationPipeline(Pipeline): or a model id from the model hub, or a torch model instance. 
preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for the model if supplied. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. Example: >>> from modelscope.pipelines import pipeline @@ -55,17 +59,22 @@ class ZeroShotClassificationPipeline(Pipeline): >>> template = '这篇文章的标题是{}' >>> print(pipeline_ins(sentence1, candidate_labels=labels, hypothesis_template=template)) - To view other examples plese check the tests/pipelines/test_zero_shot_classification.py. + To view other examples plese check tests/pipelines/test_zero_shot_classification.py. """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) self.entailment_id = 0 self.contradiction_id = 2 if preprocessor is None: - self.preprocessor = ZeroShotClassificationPreprocessor( + sequence_length = kwargs.pop('sequence_length', 512) + self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, - sequence_length=kwargs.pop('sequence_length', 512)) + sequence_length=sequence_length, + **kwargs) self.model.eval() def _sanitize_parameters(self, **kwargs): diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index ce053459..b4adf935 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,15 +16,19 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, NLPPreprocessor, - NLPTokenizerPreprocessorBase, PassageRankingPreprocessor, - TextRankingPreprocessor, RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor, - TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize, + DocumentSegmentationTransformersPreprocessor, + FaqQuestionAnsweringTransformersPreprocessor, + FillMaskPoNetPreprocessor, FillMaskTransformersPreprocessor, + TextRankingTransformersPreprocessor, + RelationExtractionTransformersPreprocessor, + SentenceEmbeddingTransformersPreprocessor, + TextClassificationTransformersPreprocessor, + TokenClassificationTransformersPreprocessor, + TextErrorCorrectionPreprocessor, TextGenerationT5Preprocessor, + TextGenerationTransformersPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, CodeGeeXPreprocessor, - MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor, + MGLMSummarizationPreprocessor, + ZeroShotClassificationTransformersPreprocessor, TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor, @@ -47,18 +51,21 @@ else: 'kws': ['WavToLists'], 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', - 
'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', - 'TokenClassificationPreprocessor', - 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', - 'Tokenize', 'Text2TextGenerationPreprocessor', + 'DocumentSegmentationTransformersPreprocessor', + 'FaqQuestionAnsweringTransformersPreprocessor', + 'FillMaskPoNetPreprocessor', 'FillMaskTransformersPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'TextRankingTransformersPreprocessor', + 'RelationExtractionTransformersPreprocessor', + 'SentenceEmbeddingTransformersPreprocessor', + 'TextClassificationTransformersPreprocessor', + 'TokenClassificationTransformersPreprocessor', + 'TextErrorCorrectionPreprocessor', + 'TextGenerationTransformersPreprocessor', 'Tokenize', + 'TextGenerationT5Preprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'MGLMSummarizationPreprocessor', 'CodeGeeXPreprocessor', - 'ZeroShotClassificationPreprocessor', + 'ZeroShotClassificationTransformersPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'NERPreprocessorViet', 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index e9b85424..277c26cc 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -2,9 +2,10 @@ import os from abc import ABC, abstractmethod from copy import deepcopy -from typing import Any, Dict, Optional, Sequence +from typing import Any, Callable, Dict, Optional, Sequence, Union from modelscope.metainfo import Models, Preprocessors +from modelscope.utils.checkpoint import save_configuration from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, ModeKeys, Tasks) @@ -98,6 +99,8 @@ PREPROCESSOR_MAP = { Preprocessors.sen_cls_tokenizer, (Models.structbert, Tasks.part_of_speech): Preprocessors.token_cls_tokenizer, + (Models.token_classification_for_ner, Tasks.named_entity_recognition): + Preprocessors.token_cls_tokenizer, (Models.structbert, Tasks.token_classification): Preprocessors.token_cls_tokenizer, (Models.structbert, Tasks.word_segmentation): @@ -117,7 +120,15 @@ PREPROCESSOR_MAP = { (Models.veco, Tasks.sentence_similarity): Preprocessors.sen_cls_tokenizer, - # space + # taskmodels + (Models.lcrf, Tasks.named_entity_recognition): + Preprocessors.sequence_labeling_tokenizer, + (Models.lcrf_wseg, Tasks.word_segmentation): + Preprocessors.sequence_labeling_tokenizer, + (Models.tcrf_wseg, Tasks.word_segmentation): + Preprocessors.sequence_labeling_tokenizer, + (Models.tcrf, Tasks.named_entity_recognition): + Preprocessors.sequence_labeling_tokenizer, } @@ -125,6 +136,8 @@ class Preprocessor(ABC): def __init__(self, mode=ModeKeys.INFERENCE, *args, **kwargs): self._mode = mode + assert self._mode in (ModeKeys.INFERENCE, ModeKeys.TRAIN, + ModeKeys.EVAL) self.device = int( os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @@ -264,4 +277,41 @@ class Preprocessor(ABC): }) preprocessor = build_preprocessor(sub_cfg, field_name) preprocessor.mode = preprocessor_mode + sub_cfg.pop('model_dir', None) + if not hasattr(preprocessor, 'cfg'): + preprocessor.cfg = cfg return preprocessor + + def save_pretrained(self, + target_folder: Union[str, os.PathLike], + config: Optional[dict] = None, + save_config_function: Callable = save_configuration): + """Save the preprocessor, its configuration and other related files to a directory, + so that it can be re-loaded + + By default, this method will save 
the preprocessor's config with mode `inference`. + + Args: + target_folder (Union[str, os.PathLike]): + Directory to which to save. Will be created if it doesn't exist. + + config (Optional[dict], optional): + The config for the configuration.json + + save_config_function (Callable): The function used to save the configuration, call this function + after the config is updated. + + """ + if config is None and hasattr(self, 'cfg'): + config = self.cfg + + if config is not None: + # Update the mode to `inference` in the preprocessor field. + if 'preprocessor' in config and config['preprocessor'] is not None: + if 'mode' in config['preprocessor']: + config['preprocessor']['mode'] = 'inference' + elif 'val' in config['preprocessor'] and 'mode' in config[ + 'preprocessor']['val']: + config['preprocessor']['val']['mode'] = 'inference' + + save_config_function(target_folder, config) diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 7c48fb3c..5f23fb27 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -5,24 +5,22 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor - from .nlp_base import (NLPTokenizerPreprocessorBase, NLPBasePreprocessor) - from .text_generation_jieba_preprocessor import TextGenerationJiebaPreprocessor + from .text_generation_preprocessor import TextGenerationJiebaPreprocessor from .sentence_piece_preprocessor import SentencePiecePreprocessor from .bert_seq_cls_tokenizer import Tokenize - from .document_segmentation_preprocessor import DocumentSegmentationPreprocessor - from .faq_question_answering_preprocessor import FaqQuestionAnsweringPreprocessor - from .fill_mask_preprocessor import FillMaskPoNetPreprocessor, NLPPreprocessor - from .text_ranking_preprocessor import TextRankingPreprocessor - from .relation_extraction_preprocessor import RelationExtractionPreprocessor - from .sentence_classification_preprocessor import SequenceClassificationPreprocessor - from .sentence_embedding_preprocessor import SentenceEmbeddingPreprocessor - from .text_generation_preprocessor import TextGenerationPreprocessor - from .text2text_generation_preprocessor import Text2TextGenerationPreprocessor - from .token_classification_preprocessor import TokenClassificationPreprocessor, \ + from .document_segmentation_preprocessor import DocumentSegmentationTransformersPreprocessor + from .faq_question_answering_preprocessor import FaqQuestionAnsweringTransformersPreprocessor + from .fill_mask_preprocessor import FillMaskPoNetPreprocessor, FillMaskTransformersPreprocessor + from .text_ranking_preprocessor import TextRankingTransformersPreprocessor + from .relation_extraction_preprocessor import RelationExtractionTransformersPreprocessor + from .text_classification_preprocessor import TextClassificationTransformersPreprocessor + from .sentence_embedding_preprocessor import SentenceEmbeddingTransformersPreprocessor + from .text_generation_preprocessor import TextGenerationTransformersPreprocessor, TextGenerationT5Preprocessor + from .token_classification_preprocessor import TokenClassificationTransformersPreprocessor, \ WordSegmentationBlankSetToLabelPreprocessor from .token_classification_thai_preprocessor import WordSegmentationPreprocessorThai, NERPreprocessorThai from .token_classification_viet_preprocessor import NERPreprocessorViet - from .zero_shot_classification_reprocessor import 
ZeroShotClassificationPreprocessor + from .zero_shot_classification_preprocessor import ZeroShotClassificationTransformersPreprocessor from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor, InputFeatures, @@ -36,30 +34,31 @@ else: 'NLPTokenizerPreprocessorBase', 'NLPBasePreprocessor', ], - 'text_generation_jieba_preprocessor': - ['TextGenerationJiebaPreprocessor'], 'sentence_piece_preprocessor': ['SentencePiecePreprocessor'], 'bert_seq_cls_tokenizer': ['Tokenize'], 'document_segmentation_preprocessor': - ['DocumentSegmentationPreprocessor'], + ['DocumentSegmentationTransformersPreprocessor'], 'faq_question_answering_preprocessor': - ['FaqQuestionAnsweringPreprocessor'], + ['FaqQuestionAnsweringTransformersPreprocessor'], 'fill_mask_preprocessor': - ['FillMaskPoNetPreprocessor', 'NLPPreprocessor'], - 'text_ranking_preprocessor': ['TextRankingPreprocessor'], - 'relation_extraction_preprocessor': ['RelationExtractionPreprocessor'], - 'sentence_classification_preprocessor': - ['SequenceClassificationPreprocessor'], - 'sentence_embedding_preprocessor': ['SentenceEmbeddingPreprocessor'], - 'text_generation_preprocessor': ['TextGenerationPreprocessor'], - 'text2text_generation_preprocessor': - ['Text2TextGenerationPreprocessor'], + ['FillMaskPoNetPreprocessor', 'FillMaskTransformersPreprocessor'], + 'text_ranking_preprocessor': ['TextRankingTransformersPreprocessor'], + 'relation_extraction_preprocessor': + ['RelationExtractionTransformersPreprocessor'], + 'text_classification_preprocessor': + ['TextClassificationTransformersPreprocessor'], + 'sentence_embedding_preprocessor': + ['SentenceEmbeddingTransformersPreprocessor'], + 'text_generation_preprocessor': [ + 'TextGenerationTransformersPreprocessor', + 'TextGenerationJiebaPreprocessor', 'TextGenerationT5Preprocessor' + ], 'token_classification_preprocessor': [ - 'TokenClassificationPreprocessor', + 'TokenClassificationTransformersPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor' ], - 'zero_shot_classification_reprocessor': - ['ZeroShotClassificationPreprocessor'], + 'zero_shot_classification_preprocessor': + ['ZeroShotClassificationTransformersPreprocessor'], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], diff --git a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py index 02249ea1..be922bf7 100644 --- a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py +++ b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py @@ -3,39 +3,52 @@ from typing import Any, Dict from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.logger import get_logger -from .nlp_base import NLPBasePreprocessor logger = get_logger() @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.document_segmentation) -class DocumentSegmentationPreprocessor(NLPBasePreprocessor): - - def __init__(self, model_dir: str, config, *args, **kwargs): - """preprocess the data +class DocumentSegmentationTransformersPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + model_max_length: int, + mode: str = ModeKeys.INFERENCE, + question_column_name='labels', + context_column_name='sentences', + example_id_column_name='example_id', + 
label_list=['B-EOP', 'O']): + """The preprocessor for document segmentation task, based on transformers' tokenizer. Args: - model_dir (str): model path + model_dir: The model dir containing the essential files to build the tokenizer. + model_max_length: The max length the model supported. + mode: The mode for this preprocessor. + question_column_name: The key for the question column, default `labels`. + context_column_name: The key for the context column, default `sentences`. + example_id_column_name: The key for the example id column, default `example_id`. + label_list: The label list, default `['B-EOP', 'O']` """ - super().__init__(model_dir, *args, **kwargs) + super().__init__(mode) from transformers import BertTokenizerFast - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, - use_fast=True, - ) - self.question_column_name = 'labels' - self.context_column_name = 'sentences' - self.example_id_column_name = 'example_id' - self.label_to_id = {'B-EOP': 0, 'O': 1} + self.tokenizer = BertTokenizerFast.from_pretrained(model_dir, ) + self.question_column_name = question_column_name + self.context_column_name = context_column_name + self.example_id_column_name = example_id_column_name + self.label_list = label_list + self.label_to_id = { + label: id + for id, label in enumerate(self.label_list) + } self.target_specical_ids = set() self.target_specical_ids.add(self.tokenizer.eos_token_id) - self.max_seq_length = config.max_position_embeddings - self.label_list = ['B-EOP', 'O'] + self.max_seq_length = model_max_length def __call__(self, examples, model_cfg=None) -> Dict[str, Any]: questions = examples[self.question_column_name] diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py index 873a8448..bfff3885 100644 --- a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py +++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py @@ -1,38 +1,58 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os from typing import Any, Dict from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields -from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) -class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, mode=ModeKeys.INFERENCE, **kwargs) - - from transformers import BertTokenizer - - preprocessor_config = Config.from_file( - os.path.join(model_dir, ModelFile.CONFIGURATION)).get( - ConfigFields.preprocessor, {}) - if preprocessor_config.get('tokenizer', - 'BertTokenizer') == 'XLMRoberta': +class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + tokenizer='BertTokenizer', + query_set='query_set', + support_set='support_set', + label_in_support_set='label', + text_in_support_set='text', + sequence_length=None, + **kwargs): + """The preprocessor for Faq QA task, based on transformers' tokenizer. 
+ + Args: + model_dir: The model dir containing the essential files to build the tokenizer. + mode: The mode for this preprocessor. + tokenizer: The tokenizer type used, supported types are `BertTokenizer` + and `XLMRobertaTokenizer`, default `BertTokenizer`. + query_set: The key for the query_set. + support_set: The key for the support_set. + label_in_support_set: The key for the label_in_support_set. + text_in_support_set: The key for the text_in_support_set. + sequence_length: The sequence length for the preprocessor. + """ + super().__init__(mode) + if tokenizer == 'XLMRoberta': from transformers import XLMRobertaTokenizer self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir) else: + from transformers import BertTokenizer self.tokenizer = BertTokenizer.from_pretrained(model_dir) - self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) + if sequence_length is not None: + self.max_len = sequence_length + else: + self.max_len = kwargs.get('max_seq_length', 50) self.label_dict = None + self.query_set = query_set + self.support_set = support_set + self.label_in_support_set = label_in_support_set + self.text_in_support_set = text_in_support_set def pad(self, samples, max_len): result = [] @@ -58,25 +78,31 @@ class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): @type_assert(object, Dict) def __call__(self, data: Dict[str, Any], **preprocessor_param) -> Dict[str, Any]: - TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) - queryset = data['query_set'] + tmp_max_len = preprocessor_param.get( + 'sequence_length', + preprocessor_param.get('max_seq_length', self.max_len)) + queryset = data[self.query_set] if not isinstance(queryset, list): queryset = [queryset] - supportset = data['support_set'] - supportset = sorted(supportset, key=lambda d: d['label']) + supportset = data[self.support_set] + supportset = sorted( + supportset, key=lambda d: d[self.label_in_support_set]) queryset_tokenized = [self.encode_plus(text) for text in queryset] supportset_tokenized = [ - self.encode_plus(item['text']) for item in supportset + self.encode_plus(item[self.text_in_support_set]) + for item in supportset ] max_len = max( [len(seq) for seq in queryset_tokenized + supportset_tokenized]) - max_len = min(TMP_MAX_LEN, max_len) + max_len = min(tmp_max_len, max_len) queryset_padded = self.pad(queryset_tokenized, max_len) supportset_padded = self.pad(supportset_tokenized, max_len) - supportset_labels_ori = [item['label'] for item in supportset] + supportset_labels_ori = [ + item[self.label_in_support_set] for item in supportset + ] label_dict = [] for label in supportset_labels_ori: if label not in label_dict: diff --git a/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py new file mode 100644 index 00000000..249aa24c --- /dev/null +++ b/modelscope/preprocessors/nlp/feature_extraction_preprocessor.py @@ -0,0 +1,78 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
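For orientation, here is a minimal usage sketch of the FaqQuestionAnsweringTransformersPreprocessor refactor above. It is a sketch only: 'path/to/model' is a placeholder for a real model directory, and the input layout simply mirrors the default query_set/support_set/text/label keys introduced by this change.

```python
from modelscope.preprocessors.nlp import \
    FaqQuestionAnsweringTransformersPreprocessor

# 'path/to/model' is a placeholder for a directory containing tokenizer files.
preprocessor = FaqQuestionAnsweringTransformersPreprocessor(
    model_dir='path/to/model',
    tokenizer='BertTokenizer',   # or 'XLMRoberta'
    sequence_length=50)          # replaces the old max_seq_length config key

data = {
    'query_set': ['how can I reset my password'],
    'support_set': [
        {'text': 'reset the account password', 'label': 'account'},
        {'text': 'ask for a refund', 'label': 'billing'},
    ],
}
# Returns padded query/support token ids plus the label mapping built from
# the support set; sequence_length can also be overridden per call.
features = preprocessor(data, sequence_length=32)
```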
+ +from typing import Any, Dict, Tuple, Union + +import numpy as np + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class FeatureExtractionTransformersPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str = None, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 128, + use_fast: bool = None, + **kwargs): + """The preprocessor for feature extraction task, based on transformers' tokenizer. + + Args: + model_dir: The model dir used to initialize the tokenizer. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ + self.first_sequence = first_sequence + self.second_sequence = second_sequence + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = sequence_length + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(mode) + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, _ = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence) + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + return output + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + return self.nlp_tokenizer(sequence1, sequence2, **kwargs) diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py index b0638dbc..80ac441f 100644 --- a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py +++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py @@ -2,60 +2,207 @@ import os.path as osp import re +from abc import abstractmethod from typing import Any, Dict, Tuple, Union import numpy as np import torch from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.hub import get_model_type from modelscope.utils.nlp import import_external_nltk_data -from .nlp_base import NLPTokenizerPreprocessorBase +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + + +class FillMaskPreprocessorBase(Preprocessor): + + def __init__(self, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE): + """The base constructor for all the fill-mask preprocessors. + + Args: + first_sequence: The key of the first sequence. + second_sequence: The key of the second sequence. + mode: The mode for the preprocessor. + """ + super().__init__(mode) + self.first_sequence = first_sequence + self.second_sequence = second_sequence + + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, _ = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence) + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + return output + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + @property + def mask_id(self): + """Return the id of the mask token. + + Returns: + The id of mask token. + """ + return None + + @abstractmethod + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. 
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. + """ + pass @PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.feature_extraction) -class NLPPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ +class FillMaskTransformersPreprocessor(FillMaskPreprocessorBase): + + def __init__(self, + model_dir: str = None, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 128, + use_fast: bool = None, + **kwargs): + """The preprocessor for fill mask task, based on transformers' tokenizer. - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + Args: + model_dir: The model dir used to initialize the tokenizer. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['max_length'] = sequence_length kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) - super().__init__(model_dir, mode=mode, **kwargs) + super().__init__(first_sequence, second_sequence, mode) + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + return self.nlp_tokenizer(sequence1, sequence2, **kwargs) @property def mask_id(self): - return self.tokenizer.mask_token_id + """Return the id of the mask token. + + Returns: + The id of mask token. + """ + return self.nlp_tokenizer.tokenizer.mask_token_id def decode(self, token_ids, skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs): - return self.tokenizer.decode(token_ids, skip_special_tokens, - clean_up_tokenization_spaces, **kwargs) + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. 
+ """ + return self.nlp_tokenizer.tokenizer.decode( + token_ids, skip_special_tokens, clean_up_tokenization_spaces, + **kwargs) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.fill_mask_ponet) -class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in PoNet model's MLM task. - """ +class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): + + def __init__(self, + model_dir, + first_sequence: str = None, + second_sequence: str = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 512, + use_fast: bool = None, + **kwargs): + """The tokenizer preprocessor used in PoNet model's MLM task. - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + Args: + model_dir: The model dir used to initialize the tokenizer. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 512) + kwargs['max_length'] = sequence_length kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', True) - super().__init__(model_dir, mode=mode, **kwargs) + super().__init__(first_sequence, second_sequence, mode) + self.nlp_tokenizer = NLPTokenizer( + model_dir, use_fast=use_fast, tokenize_kwargs=kwargs) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) @@ -80,27 +227,15 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): self.sent_tokenize = sent_tokenize self.max_length = kwargs['max_length'] - def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b, labels = self.parse_text_and_label(data) - output = self.tokenizer( - text_a, - text_b, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + text_a, text_b, _ = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence) + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + output = self.nlp_tokenizer(text_a, text_b, **kwargs) max_seq_length = self.max_length if text_b is None: @@ -108,7 +243,7 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): seg_lens = list( map( len, - self.tokenizer( + self.nlp_tokenizer.tokenizer( self.sent_tokenize(text_a), add_special_tokens=False, truncation=True)['input_ids'])) @@ -125,18 +260,36 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): k: np.array(v) if isinstance(v, list) else v for k, v in output.items() } - - self.labels_to_id(labels, output) return output @property def mask_id(self): - return self.tokenizer.mask_token_id + """Return the id of the mask token. + + Returns: + The id of mask token. 
+ """ + return self.nlp_tokenizer.tokenizer.mask_token_id def decode(self, token_ids, skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs): - return self.tokenizer.decode(token_ids, skip_special_tokens, - clean_up_tokenization_spaces, **kwargs) + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. + """ + return self.nlp_tokenizer.tokenizer.decode( + token_ids, skip_special_tokens, clean_up_tokenization_spaces, + **kwargs) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py deleted file mode 100644 index 7fe28eb5..00000000 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import os -from abc import ABC -from collections.abc import Mapping -from typing import Any, Dict, List, Tuple, Union - -import json -import numpy as np -import torch -from transformers import AutoTokenizer - -from modelscope.metainfo import Models -from modelscope.outputs import OutputKeys -from modelscope.preprocessors.base import Preprocessor -from modelscope.utils.constant import ModeKeys -from modelscope.utils.hub import get_model_type, parse_label_mapping -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = [ - 'NLPBasePreprocessor', - 'NLPTokenizerPreprocessorBase', -] - - -class NLPBasePreprocessor(Preprocessor, ABC): - - def __init__(self, - model_dir: str, - first_sequence=None, - second_sequence=None, - label=None, - label2id=None, - mode=ModeKeys.INFERENCE, - use_fast=None, - **kwargs): - """The NLP preprocessor base class. - - Args: - model_dir (str): The local model path - first_sequence: The key for the first sequence - second_sequence: The key for the second sequence - label: The label key - label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping - if this mapping is not supplied. - mode: Run this preprocessor in either 'train'/'eval'/'inference' mode - use_fast: use the fast version of tokenizer - - """ - self.model_dir = model_dir - self.first_sequence = first_sequence - self.second_sequence = second_sequence - self.label = label - - self.use_fast = use_fast - if self.use_fast is None and model_dir is None: - self.use_fast = False - elif self.use_fast is None and os.path.isfile( - os.path.join(model_dir, 'tokenizer_config.json')): - with open( - os.path.join(model_dir, 'tokenizer_config.json'), - 'r', - encoding='utf-8') as f: - json_config = json.load(f) - self.use_fast = json_config.get('use_fast') - self.use_fast = False if self.use_fast is None else self.use_fast - - self.label2id = label2id - if self.label2id is None and model_dir is not None: - self.label2id = parse_label_mapping(model_dir) - super().__init__(mode, **kwargs) - - @property - def mask_id(self): - """Child preprocessor can override this property to return the id of mask token. - - Returns: - The id of mask token, default None. 
- """ - return None - - def decode(self, - token_ids: Union[int, List[int], 'np.ndarray', 'torch.Tensor', - 'tf.Tensor'], - skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, - **kwargs): - """Turn the token_ids to real sentence. - - Args: - token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): - List of tokenized input ids. Can be obtained using the `__call__` method. - skip_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. - kwargs (additional keyword arguments, *optional*): - Will be passed to the underlying model specific decode method. - Returns: - The real sentence decoded by the preprocessor. - """ - raise NotImplementedError() - - -class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): - - def __init__(self, - model_dir: str, - first_sequence: str = None, - second_sequence: str = None, - label: str = 'label', - label2id: dict = None, - mode: str = ModeKeys.INFERENCE, - use_fast: bool = None, - **kwargs): - """The NLP tokenizer preprocessor base class. - - Any nlp preprocessor which uses the hf tokenizer can inherit from this class. - - Args: - model_dir (str): The local model path - first_sequence: The key for the first sequence - second_sequence: The key for the second sequence - label: The key for the label - label2id: An optional label2id dict. - If label2id is None, the preprocessor will try to parse label-id mapping from: - - configuration.json model.label2id/model.id2label - - config.json label2id/id2label - - label_mapping.json - mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. - use_fast: use the fast version of tokenizer - kwargs: These kwargs will be directly fed into the tokenizer. - """ - - super().__init__(model_dir, first_sequence, second_sequence, label, - label2id, mode, use_fast, **kwargs) - self.model_dir = model_dir - self.tokenize_kwargs = kwargs - self.tokenizer = self.build_tokenizer(model_dir) - logger.info(f'The key of sentence1: {self.first_sequence}, ' - f'The key of sentence2: {self.second_sequence}, ' - f'The key of label: {self.label}') - if self.first_sequence is None: - logger.warning('[Important] first_sequence attribute is not set, ' - 'this will cause an error if your input is a dict.') - - @property - def id2label(self): - """Return the id2label mapping according to the label2id mapping. - - @return: The id2label mapping if exists. - """ - if self.label2id is not None: - return {id: label for label, id in self.label2id.items()} - return None - - def build_tokenizer(self, model_dir): - """Build a tokenizer by the model type. - - NOTE: This default implementation only returns slow tokenizer, because the fast tokenizers have a - multi-thread problem. - - Args: - model_dir: The local model dir. - - Returns: - The initialized tokenizer. 
- """ - self.is_transformer_based_model = 'lstm' not in model_dir - # fast version lead to parallel inference failed - model_type = get_model_type(model_dir) - if model_type in (Models.structbert, Models.gpt3, Models.palm, - Models.plug): - from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast - tokenizer = SbertTokenizerFast if self.use_fast else SbertTokenizer - return tokenizer.from_pretrained(model_dir) - elif model_type == Models.veco: - from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast - tokenizer = VecoTokenizerFast if self.use_fast else VecoTokenizer - return tokenizer.from_pretrained(model_dir) - elif model_type == Models.deberta_v2: - from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast - tokenizer = DebertaV2TokenizerFast if self.use_fast else DebertaV2Tokenizer - return tokenizer.from_pretrained(model_dir) - elif not self.is_transformer_based_model: - from transformers import BertTokenizer, BertTokenizerFast - tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer - return tokenizer.from_pretrained(model_dir) - else: - return AutoTokenizer.from_pretrained( - model_dir, use_fast=self.use_fast) - - def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b, labels = self.parse_text_and_label(data) - output = self.tokenizer( - text_a, - text_b, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - output = { - k: np.array(v) if isinstance(v, list) else v - for k, v in output.items() - } - self.labels_to_id(labels, output) - return output - - def parse_text_and_label(self, data): - """Parse the input and return the sentences and labels. - - When input type is tuple or list and its size is 2: - If the pair param is False, data will be parsed as the first_sentence and the label, - else it will be parsed as the first_sentence and the second_sentence. - - Args: - data: The input data. - - Returns: - The sentences and labels tuple. - """ - text_a, text_b, labels = None, None, None - if isinstance(data, str): - text_a = data - elif isinstance(data, tuple) or isinstance(data, list): - if len(data) == 3: - text_a, text_b, labels = data - elif len(data) == 2: - if self._mode == ModeKeys.INFERENCE: - text_a, text_b = data - else: - text_a, labels = data - elif isinstance(data, Mapping): - text_a = data.get(self.first_sequence) - text_b = data.get(self.second_sequence) - labels = data.get(self.label) - - return text_a, text_b, labels - - def labels_to_id(self, labels, output): - """Turn the labels to id with the type int or float. - - If the original label's type is str or int, the label2id mapping will try to convert it to the final label. - If the original label's type is float, or the label2id mapping does not exist, - the original label will be returned. - - Args: - labels: The input labels. - output: The label id. - - Returns: - The final labels. 
- """ - - def label_can_be_mapped(label): - return isinstance(label, str) or isinstance(label, int) - - try: - if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ - and self.label2id is not None: - output[OutputKeys.LABELS] = [ - self.label2id[label] - if label in self.label2id else self.label2id[str(label)] - for label in labels - ] - elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABELS] = self.label2id[ - labels] if labels in self.label2id else self.label2id[str( - labels)] - elif labels is not None: - output[OutputKeys.LABELS] = labels - except KeyError as e: - logger.error( - f'Label {labels} cannot be found in the label mapping {self.label2id},' - f'which comes from the user input or the configuration files. ' - f'Please consider matching your labels with this mapping.') - raise e diff --git a/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py index 9a426ab7..58aa000d 100644 --- a/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py +++ b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py @@ -5,34 +5,36 @@ from typing import Any, Dict from transformers import AutoTokenizer from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.re_tokenizer) -class RelationExtractionPreprocessor(NLPBasePreprocessor): - """The relation extraction preprocessor used in normal RE task. - """ +class RelationExtractionTransformersPreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data + def __init__( + self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + **kwargs, + ): + """The preprocessor for relation Extraction task, based on transformers' tokenizer. Args: - model_dir (str): model path + model_dir: The model dir used to initialize the tokenizer. + mode: The mode for the preprocessor. """ - super().__init__(model_dir, *args, **kwargs) - + super().__init__(mode) self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) self.tokenizer = AutoTokenizer.from_pretrained( model_dir, use_fast=True) @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + def __call__(self, data: str, **kwargs) -> Dict[str, Any]: """process the raw input data Args: @@ -46,7 +48,9 @@ class RelationExtractionPreprocessor(NLPBasePreprocessor): # preprocess the data for the model input text = data - output = self.tokenizer([text], return_tensors='pt') + if 'return_tensors' not in kwargs: + kwargs['return_tensors'] = 'pt' + output = self.tokenizer([text], **kwargs) return { 'text': text, 'input_ids': output['input_ids'], diff --git a/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py deleted file mode 100644 index f1295c50..00000000 --- a/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
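As a quick illustration of the kwargs pass-through added to RelationExtractionTransformersPreprocessor above, a hedged sketch; the model path and sentences are placeholders, and the per-call options shown are ordinary tokenizer kwargs.

```python
from modelscope.preprocessors.nlp import \
    RelationExtractionTransformersPreprocessor

# 'path/to/model' is a placeholder; the tokenizer is built via AutoTokenizer.
preprocessor = RelationExtractionTransformersPreprocessor(
    model_dir='path/to/model')

# return_tensors now defaults to 'pt' but can be overridden per call,
# along with any other tokenizer kwargs such as truncation.
features = preprocessor('John Smith was born in London.')
plain_lists = preprocessor('John Smith was born in London.',
                           return_tensors=None, truncation=True)
```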
- -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sequence classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py index 519de60c..ccbf3ef2 100644 --- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -1,31 +1,61 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union +from typing import Any, Dict from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase +from modelscope.utils.hub import get_model_type +from .transformers_tokenizer import NLPTokenizer @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sentence_embedding) -class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): +class SentenceEmbeddingTransformersPreprocessor(Preprocessor): """The tokenizer preprocessor used in sentence embedding. """ - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) + def __init__(self, + model_dir: str, + first_sequence='source_sentence', + second_sequence='sentences_to_compare', + mode=ModeKeys.INFERENCE, + use_fast: bool = None, + sequence_length: int = 128, + **kwargs): + """The preprocessor for sentence embedding task, based on transformers' tokenizer. - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + Args: + model_dir: The model dir used to initialize the tokenizer. + first_sequence: The key of the first sequence. + second_sequence: The key of the second sequence. + mode: The mode for the preprocessor. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. 
+ """ + self.first_sequence = first_sequence + self.second_sequence = second_sequence + kwargs['max_length'] = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + super().__init__(mode=mode) + + def __call__(self, + data: Dict, + padding=True, + truncation=True, + **kwargs) -> Dict[str, Any]: """process the raw input data Args: data Dict: - keys: "source_sentence" && "sentences_to_compare" + keys: the source sentence and the sentences to compare values: list of sentences Example: {"source_sentence": ["how long it take to get a master's degree"], @@ -37,16 +67,16 @@ class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): Returns: Dict[str, Any]: the preprocessed data """ - source_sentence = data['source_sentence'] - compare_sentences = data['sentences_to_compare'] - sentences = [] - sentences.append(source_sentence[0]) + source_sentence = data[self.first_sequence] + compare_sentences = data[self.second_sequence] + sentences = [source_sentence[0]] for sent in compare_sentences: sentences.append(sent) - tokenized_inputs = self.tokenizer( - sentences, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - padding=True, - truncation=True) + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + + tokenized_inputs = self.nlp_tokenizer( + sentences, padding=padding, truncation=truncation, **kwargs) return tokenized_inputs diff --git a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py index 1d1ef19d..6b0b76e1 100644 --- a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py @@ -1,7 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - +import os import os.path as osp -from typing import Any, Dict import sentencepiece as spm import torch @@ -9,17 +8,26 @@ import torch from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Fields, ModeKeys @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sentence_piece) class SentencePiecePreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): - import os + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """The preprocessor for the sentence piece tokenizer. + + Args: + model_dir: The model dir contains the essential files used by the `SentencePieceProcessor`. + mode: The mode for the preprocessor. 
+ """ - super().__init__(*args, **kwargs) + super().__init__(mode) self.tokenizer = None for file_name in os.listdir(model_dir): if file_name.endswith('.model'): @@ -28,5 +36,5 @@ class SentencePiecePreprocessor(Preprocessor): break assert self.tokenizer is not None, 'Can not find .model file' - def __call__(self, data: str) -> Dict[str, Any]: + def __call__(self, data: str) -> torch.Tensor: return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py deleted file mode 100644 index 5693d36e..00000000 --- a/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict, Union - -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) -class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. - """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') - kwargs['padding'] = kwargs.get('padding', False) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - text_a, _, _ = self.parse_text_and_label(data) - - inputs = self.tokenizer( - text_a, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - - # This is produced by tokenizers but is an invalid generate kwargs - if 'token_type_ids' in inputs: - del inputs['token_type_ids'] - return inputs diff --git a/modelscope/preprocessors/nlp/text_classification_preprocessor.py b/modelscope/preprocessors/nlp/text_classification_preprocessor.py new file mode 100644 index 00000000..06820e6c --- /dev/null +++ b/modelscope/preprocessors/nlp/text_classification_preprocessor.py @@ -0,0 +1,152 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from abc import abstractmethod +from typing import Any, Dict, List, Tuple, Union + +import numpy as np + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger +from .transformers_tokenizer import NLPTokenizer +from .utils import labels_to_id, parse_text_and_label + +logger = get_logger(__name__) + + +class TextClassificationPreprocessorBase(Preprocessor): + + def __init__( + self, + model_dir=None, + first_sequence: str = None, + second_sequence: str = None, + label: str = 'label', + label2id: Dict = None, + mode: str = ModeKeys.INFERENCE, + ): + """The base class for the text classification preprocessor. + + Args: + model_dir(str, `optional`): The model dir used to parse the label mapping, can be None. + first_sequence(str, `optional`): The key of the first sequence. 
+ second_sequence(str, `optional`): The key of the second sequence. + label(str, `optional`): The keys of the label columns, default is `label` + label2id: (dict, `optional`): The optional label2id mapping + mode: The mode for the preprocessor + """ + super().__init__(mode) + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.label = label + self.label2id = label2id + if self.label2id is None and self.model_dir is not None: + self.label2id = parse_label_mapping(self.model_dir) + + logger.info(f'The key of sentence1: {self.first_sequence}, ' + f'The key of sentence2: {self.second_sequence}, ' + f'The key of label: {self.label}') + if self.first_sequence is None: + logger.warning('[Important] first_sequence attribute is not set, ' + 'this will cause an error if your input is a dict.') + + @property + def id2label(self): + """Return the id2label mapping according to the label2id mapping. + + @return: The id2label mapping if exists. + """ + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def __call__(self, data: Union[str, Tuple, Dict], + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = parse_text_and_label(data, self.mode, + self.first_sequence, + self.second_sequence, + self.label) + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + labels_to_id(labels, output, self.label2id) + return output + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class TextClassificationTransformersPreprocessor( + TextClassificationPreprocessorBase): + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + return self.nlp_tokenizer(sequence1, sequence2, **kwargs) + + def __init__(self, + model_dir=None, + first_sequence: str = None, + second_sequence: str = None, + label: Union[str, List] = 'label', + label2id: Dict = None, + mode: str = ModeKeys.INFERENCE, + sequence_length: int = 128, + use_fast: bool = None, + **kwargs): + """The tokenizer preprocessor used in sequence classification. + + Args: + use_fast: Whether to use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + **kwargs: Extra args input into the tokenizer's __call__ method. 
+ """ + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + super().__init__(model_dir, first_sequence, second_sequence, label, + label2id, mode) diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py index 4e5ba3bd..357a946f 100644 --- a/modelscope/preprocessors/nlp/text_error_correction.py +++ b/modelscope/preprocessors/nlp/text_error_correction.py @@ -7,12 +7,11 @@ from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields -from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_error_correction) -class TextErrorCorrectionPreprocessor(NLPBasePreprocessor): +class TextErrorCorrectionPreprocessor(Preprocessor): """The preprocessor used in text correction task. """ @@ -23,7 +22,7 @@ class TextErrorCorrectionPreprocessor(NLPBasePreprocessor): Args: model_dir (str): model path """ - super().__init__(model_dir, *args, **kwargs) + super().__init__(*args, **kwargs) self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) def __call__(self, data: str) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py deleted file mode 100644 index 1e972d64..00000000 --- a/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import os.path as osp -from typing import Any, Dict - -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) -class TextGenerationJiebaPreprocessor(Preprocessor): - """The jieba tokenizer preprocessor used in text generation. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - from modelscope.models.nlp.gpt3 import JiebaBPETokenizer - super().__init__(*args, **kwargs) - self.tokenizer = JiebaBPETokenizer( - osp.join(model_dir, 'tokenizer.json')) - - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' - Returns: - Dict[str, Any]: the preprocessed data - Example: - {'net_input': - {'src_tokens':tensor([1,2,3,4]), - 'src_lengths': tensor([4])} - } - """ - import torch - - return { - 'input_ids': - torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) - } diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py index 238e2972..7ce04a38 100644 --- a/modelscope/preprocessors/nlp/text_generation_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -1,62 +1,257 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
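To make the behaviour of the new TextClassificationTransformersPreprocessor above concrete, a minimal sketch under assumed inputs; the model path, column names and label2id mapping are placeholders, not part of this change.

```python
from modelscope.preprocessors.nlp import \
    TextClassificationTransformersPreprocessor
from modelscope.utils.constant import ModeKeys

# Inference: a plain string or a (sentence1, sentence2) tuple is accepted.
preprocessor = TextClassificationTransformersPreprocessor(
    model_dir='path/to/model', sequence_length=128)
features = preprocessor(('how long does shipping take',
                         'when will my order arrive'))

# Training/evaluation: dict input with explicit column keys; string or int
# labels are mapped through label2id (or the mapping parsed from model_dir).
train_preprocessor = TextClassificationTransformersPreprocessor(
    model_dir='path/to/model',
    first_sequence='sentence1',
    second_sequence='sentence2',
    label='label',
    label2id={'not_similar': 0, 'similar': 1},
    mode=ModeKeys.TRAIN)
example = train_preprocessor({
    'sentence1': 'how long does shipping take',
    'sentence2': 'when will my order arrive',
    'label': 'similar',
})
```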
+import os.path as osp from typing import Any, Dict, Optional, Union +import numpy as np +import torch + from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase +from modelscope.utils.hub import get_model_type +from modelscope.utils.logger import get_logger +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + +logger = get_logger(__name__) + + +class TextGenerationPreprocessorBase(Preprocessor): + + def __init__(self, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt='tgt_txt'): + """The base class for all the text generation task's preprocessors. + + Args: + mode: The preprocessor mode. + src_txt: The key for the src text. + tgt_txt: The key for the tgt text. + """ + super().__init__(mode) + self.src_txt = src_txt + self.tgt_txt = tgt_txt + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + def __call__(self, data: Union[Dict, str], **kwargs) -> Dict[str, Any]: + text_a, text_b = parse_text_and_label(data, self.mode, self.src_txt, + self.tgt_txt)[0:2] + + output = self._tokenize_text(text_a, text_b, **kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + return output + + def decode(self, tokens, **kwargs): + """Decode the tokens to real text. + + Args: + tokens: The output tokens from model's `forward` and `generate` + + Returns: + The actual text. + """ + raise NotImplementedError() + + +class NLPTokenizerForRoberta(NLPTokenizer): + + def build_tokenizer(self): + + def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: + import os + for name in os.listdir(model_dir): + full_name = os.path.join(model_dir, name) + if 'roberta' in name and os.path.isdir(full_name): + return full_name + + roberta_tokenizer_dir = get_roberta_tokenizer_dir(self.model_dir) + if roberta_tokenizer_dir: + from transformers import RobertaTokenizer + return RobertaTokenizer.from_pretrained( + roberta_tokenizer_dir, do_lower_case=False) + return super().build_tokenizer() @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. - """ +class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): def __init__(self, model_dir: str, tokenizer=None, - mode=ModeKeys.INFERENCE, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt='tgt_txt', + sequence_length: int = 128, + use_fast: bool = None, **kwargs): + """The tokenizer preprocessor used in text generation. + + Args: + model_dir: The model dir used to initialize the tokenizer. + mode: The mode for the preprocessor. + src_txt: The key of the source sentence. + tgt_txt: The key of the generated sentence. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + use_fast: Whether to use the fast tokenizer or not. + **kwargs: Extra args input into the tokenizer's __call__ method. 
+ """ + if 'first_sequence' in kwargs: + src_txt = kwargs.pop('first_sequence') + super().__init__(mode, src_txt, tgt_txt) kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - @staticmethod - def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: - import os - for name in os.listdir(model_dir): - full_name = os.path.join(model_dir, name) - if 'roberta' in name and os.path.isdir(full_name): - return full_name - - def build_tokenizer(self, model_dir: str): - roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) - if roberta_tokenizer_dir: - from transformers import RobertaTokenizer - return RobertaTokenizer.from_pretrained( - roberta_tokenizer_dir, do_lower_case=False) - return super().build_tokenizer(model_dir) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - if self._mode == ModeKeys.INFERENCE: - return super().__call__(data) - src_rst = super().__call__(data['src_txt']) - src_input_ids = src_rst['input_ids'] - src_attention_mask = src_rst['attention_mask'] - if 'tgt_txt' in data: - labels = super().__call__(data['tgt_txt'])['input_ids'] - else: - labels = src_input_ids[1:] - src_input_ids = src_input_ids[:-1] - src_attention_mask = src_attention_mask[:-1] + kwargs['max_length'] = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizerForRoberta( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + + def decode(self, tokens, **kwargs): + """Decode the tokens to real text. + + Args: + tokens: The output tokens from model's `forward` and `generate` + + Returns: + The actual text. + """ + return self.nlp_tokenizer.tokenizer.decode(tokens, **kwargs) + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None + + output = self.nlp_tokenizer(sequence1, **kwargs) + + if self.mode != ModeKeys.INFERENCE: + if sequence2 is not None: + labels = self.nlp_tokenizer(sequence2)['input_ids'] + src_input_ids = output['input_ids'] + src_attention_mask = output['attention_mask'] + else: + labels = output['input_ids'][1:] + src_input_ids = output['input_ids'][:-1] + src_attention_mask = output['attention_mask'][:-1] + + output = { + 'input_ids': src_input_ids, + 'attention_mask': src_attention_mask, + 'labels': labels, + } + return output + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(TextGenerationPreprocessorBase): + """The jieba tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt=None): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(mode, src_txt, tgt_txt) + if self.tgt_txt is not None: + logger.warn( + f'TextGenerationJiebaPreprocessor currently does not support training, ' + f'the {self.tgt_txt} of the tgt_txt field will be ignored.') + self.src_txt = src_txt + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def decode(self, tokens, **kwargs): + """Decode the tokens to real text. + + Args: + tokens: The output tokens from model's `forward` and `generate` + Returns: + The actual text. + """ + return self.tokenizer.detokenize(tokens) + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ return { - 'input_ids': src_input_ids, - 'attention_mask': src_attention_mask, - 'labels': labels, + 'input_ids': + torch.tensor(self.tokenizer.tokenize(sequence1)).unsqueeze_(0) } + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) +class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt='tgt_txt', + use_fast: bool = None, + sequence_length: int = 128, + **kwargs): + """The preprocessor for text to text generation task, based on transformers' tokenizer. + + Args: + model_dir: The model dir used to initialize the tokenizer. + src_txt: The key of the first sequence. + use_fast: Use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. + mode: The mode for the preprocessor. + **kwargs: Extra args input into the tokenizer's __call__ method. + """ + super().__init__( + model_dir, + mode=mode, + src_txt=src_txt, + tgt_txt=tgt_txt, + sequence_length=sequence_length, + use_fast=use_fast, + truncation=kwargs.pop('truncation', True), + padding=kwargs.pop('padding', 'max_length'), + return_token_type_ids=kwargs.pop('return_token_type_ids', False), + **kwargs) diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py index 2ada6892..574b94ae 100644 --- a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py @@ -1,67 +1,78 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union +from typing import Any, Dict from transformers import AutoTokenizer from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPTokenizerPreprocessorBase @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_ranking) -class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in passage ranking model. 
- """ +class TextRankingTransformersPreprocessor(Preprocessor): def __init__(self, model_dir: str, - mode=ModeKeys.INFERENCE, - *args, + mode: str = ModeKeys.INFERENCE, + first_sequence='source_sentence', + second_sequence='sentences_to_compare', + label='labels', + qid='qid', + sequence_length=128, **kwargs): - """preprocess the data + """The tokenizer preprocessor class for the text ranking preprocessor. Args: - model_dir (str): model path + model_dir(str, `optional`): The model dir used to parse the label mapping, can be None. + first_sequence(str, `optional`): The key of the first sequence. + second_sequence(str, `optional`): The key of the second sequence. + label(str, `optional`): The keys of the label columns, default `labels`. + qid(str, `optional`): The qid info. + mode: The mode for the preprocessor. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. """ - super().__init__(model_dir, mode=mode, *args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'source_sentence') - self.second_sequence = kwargs.pop('second_sequence', - 'sentences_to_compare') - self.sequence_length = kwargs.pop('sequence_length', 128) - + super().__init__(mode) + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.label = label + self.qid = qid + self.sequence_length = sequence_length self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: - if isinstance(data, tuple): - sentence1, sentence2 = data - elif isinstance(data, dict): - sentence1 = data.get(self.first_sequence) - sentence2 = data.get(self.second_sequence) + @type_assert(object, dict) + def __call__(self, + data: Dict, + padding='max_length', + truncation=True, + **kwargs) -> Dict[str, Any]: + sentence1 = data.get(self.first_sequence) + sentence2 = data.get(self.second_sequence) + labels = data.get(self.label) + qid = data.get(self.qid) + if isinstance(sentence2, str): sentence2 = [sentence2] if isinstance(sentence1, str): sentence1 = [sentence1] sentence1 = sentence1 * len(sentence2) - - max_seq_length = self.sequence_length + kwargs['max_length'] = kwargs.get( + 'max_length', kwargs.pop('sequence_length', self.sequence_length)) + if 'return_tensors' not in kwargs: + kwargs['return_tensors'] = 'pt' feature = self.tokenizer( sentence1, sentence2, - padding='max_length', - truncation=True, - max_length=max_seq_length, - return_tensors='pt') - if 'labels' in data: - labels = data['labels'] + padding=padding, + truncation=truncation, + **kwargs) + if labels is not None: feature['labels'] = labels - if 'qid' in data: - qid = data['qid'] + if qid is not None: feature['qid'] = qid return feature diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index a7616736..1d42324d 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -1,28 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
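As a quick, hedged illustration of the refactored text-ranking preprocessor above: the sketch below constructs it directly and feeds a dict keyed by its new `first_sequence`/`second_sequence` defaults. The local model path is a placeholder for illustration, not something shipped in this change.

    from modelscope.preprocessors.nlp.text_ranking_preprocessor import \
        TextRankingTransformersPreprocessor

    # Hypothetical local checkpoint directory with a transformers tokenizer.
    preprocessor = TextRankingTransformersPreprocessor(
        model_dir='/path/to/text-ranking-model', sequence_length=128)

    features = preprocessor({
        'source_sentence': 'how long does shipping take?',
        'sentences_to_compare': [
            'Shipping usually takes 3-5 business days.',
            'Our store opens at 9 am.',
        ],
    })
    # 'input_ids' and 'attention_mask' come back as padded torch tensors,
    # one row per candidate sentence; 'labels' and 'qid' are only attached
    # when those keys are present in the input dict.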
-from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import torch from modelscope.metainfo import Preprocessors from modelscope.outputs import OutputKeys +from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger from modelscope.utils.type_assert import type_assert -from .nlp_base import NLPBasePreprocessor, NLPTokenizerPreprocessorBase +from .transformers_tokenizer import NLPTokenizer +from .utils import parse_text_and_label + +logger = get_logger(__name__) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) -class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): +class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): """The preprocessor used to turn a single sentence to a labeled token-classification dict. """ - def __init__(self, **kwargs): - self.first_sequence: str = kwargs.pop('first_sequence', 'tokens') - self.label = kwargs.pop('label', OutputKeys.LABELS) + def __init__(self, generated_sentence='tokens', generated_label='labels'): + super().__init__() + self.generated_sentence = generated_sentence + self.generated_label = generated_label def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: data = data.split(' ') @@ -43,9 +50,134 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): chars, labels = produce_train_sample(data) return { - self.first_sequence: chars, - self.label: labels, + self.generated_sentence: chars, + self.generated_label: labels, + } + + +class TokenClassificationPreprocessorBase(Preprocessor): + + def __init__( + self, + model_dir: str = None, + first_sequence: str = None, + label: str = 'label', + label2id: Dict = None, + label_all_tokens: bool = False, + mode: str = ModeKeys.INFERENCE, + ): + """The base class for all the token-classification tasks. + + Args: + model_dir: The model dir to build the the label2id mapping. + If None, user need to pass in the `label2id` param. + first_sequence: The key for the text(token) column if input type is a dict. + label: The key for the label column if input type is a dict and the mode is `training` or `evaluation`. + label2id: The label2id mapping, if not provided, you need to specify the model_dir to search the mapping + from config files. + label_all_tokens: If label exists in the dataset, the preprocessor will try to label the tokens. + If label_all_tokens is true, all non-initial sub-tokens will get labels like `I-xxx`, + or else the labels will be filled with -100, default False. + mode: The preprocessor mode. + """ + super().__init__(mode) + self.model_dir = model_dir + self.first_sequence = first_sequence + self.label = label + self.label2id = label2id + self.label_all_tokens = label_all_tokens + if self.label2id is None and self.model_dir is not None: + self.label2id = parse_label_mapping(self.model_dir) + + @property + def id2label(self): + """Return the id2label mapping according to the label2id mapping. + + @return: The id2label mapping if exists. 
+ """ + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def labels_to_id(self, labels_list, word_ids): + # align the labels with tokenized text + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index(label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + return label_ids + + def _tokenize_text(self, sequence1, **kwargs): + """Tokenize the text. + + Args: + sequence1: The first sequence. + sequence2: The second sequence which may be None. + + Returns: + The encoded sequence. + """ + raise NotImplementedError() + + @type_assert(object, (str, tuple, dict)) + def __call__(self, data: Union[dict, tuple, str], + **kwargs) -> Dict[str, Any]: + text, _, label = parse_text_and_label( + data, self.mode, self.first_sequence, label=self.label) + outputs, word_ids = self._tokenize_text(text, **kwargs) + if label is not None: + label_ids = self.labels_to_id(label, word_ids) + outputs[OutputKeys.LABELS] = label_ids + outputs = { + k: np.array(v) if isinstance(v, list) else v + for k, v in outputs.items() } + if self.mode == ModeKeys.INFERENCE: + outputs['text'] = text + return outputs + + +class NLPTokenizerForLSTM(NLPTokenizer): + + def build_tokenizer(self): + if self.model_type == 'lstm': + from transformers import AutoTokenizer + return AutoTokenizer.from_pretrained( + self.model_dir, use_fast=self.use_fast, tokenizer_type='bert') + else: + return super().build_tokenizer() + + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class @PREPROCESSORS.register_module( @@ -54,227 +186,238 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): +class TokenClassificationTransformersPreprocessor( + TokenClassificationPreprocessorBase): """The tokenizer preprocessor used in normal NER task. """ - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data + def __init__(self, + model_dir: str = None, + first_sequence: str = None, + label: str = 'label', + label2id: Dict = None, + label_all_tokens: bool = False, + mode: str = ModeKeys.INFERENCE, + sequence_length=128, + use_fast=None, + **kwargs): + """ Args: - model_dir (str): model path + use_fast: Whether to use the fast tokenizer or not. + sequence_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. 
+ **kwargs: Extra args input into the tokenizer's __call__ method. """ + super().__init__(model_dir, first_sequence, label, label2id, + label_all_tokens, mode) + self.is_lstm_model = 'lstm' in model_dir + model_type = None + if self.is_lstm_model: + model_type = 'lstm' + elif model_dir is not None: + model_type = get_model_type(model_dir) kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - self.sequence_length = kwargs['max_length'] - self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, mode=mode, **kwargs) - - if 'is_split_into_words' in kwargs: - self.tokenize_kwargs['is_split_into_words'] = kwargs.pop( - 'is_split_into_words') - else: - self.tokenize_kwargs[ - 'is_split_into_words'] = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - if 'label2id' in kwargs: - kwargs.pop('label2id') + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = sequence_length + kwargs['add_special_tokens'] = model_type != 'lstm' + self.nlp_tokenizer = NLPTokenizerForLSTM( + model_dir=model_dir, + model_type=model_type, + use_fast=use_fast, + tokenize_kwargs=kwargs) - @type_assert(object, (str, dict)) - def __call__(self, data: Union[dict, str]) -> Dict[str, Any]: - """process the raw input data + def _tokenize_text(self, text: Union[str, List[str]], **kwargs): + tokens = text + if self.mode != ModeKeys.INFERENCE: + assert isinstance(tokens, list), 'Input needs to be lists in training and evaluating,' \ + 'because the length of the words and the labels need to be equal.' + is_split_into_words = self.nlp_tokenizer.get_tokenizer_kwarg( + 'is_split_into_words', False) + if is_split_into_words: + tokens = list(tokens) - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ + if is_split_into_words and self.mode == ModeKeys.INFERENCE: + encodings, word_ids = self._tokenize_text_by_words( + tokens, **kwargs) + elif self.nlp_tokenizer.tokenizer.is_fast: + encodings, word_ids = self._tokenize_text_with_fast_tokenizer( + tokens, **kwargs) + else: + encodings, word_ids = self._tokenize_text_with_slow_tokenizer( + tokens, **kwargs) - # preprocess the data for the model input - text = None - labels_list = None - if isinstance(data, str): - # for inference inputs without label - text = data - elif isinstance(data, dict): - # for finetune inputs with label - text = data.get(self.first_sequence) - labels_list = data.get(self.label) - if isinstance(text, list): - self.tokenize_kwargs['is_split_into_words'] = True - - if self._mode == ModeKeys.INFERENCE: - self.tokenize_kwargs['add_special_tokens'] = False + if self.mode == ModeKeys.INFERENCE: + for key in encodings.keys(): + encodings[key] = torch.tensor(encodings[key]).unsqueeze(0) + else: + encodings.pop('offset_mapping', None) + return encodings, word_ids + def _tokenize_text_by_words(self, tokens, **kwargs): input_ids = [] label_mask = [] offset_mapping = [] - token_type_ids = [] - if self.tokenize_kwargs[ - 'is_split_into_words'] and self._mode == ModeKeys.INFERENCE: - for offset, token in enumerate(list(text)): - subtoken_ids = self.tokenizer.encode(token, - **self.tokenize_kwargs) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)]) + attention_mask = [] + for offset, token in enumerate(tokens): + subtoken_ids = self.nlp_tokenizer.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.nlp_tokenizer.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + attention_mask.extend([1] * len(subtoken_ids)) + label_mask.extend([True] + [False] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + + padding = kwargs.get('padding', + self.nlp_tokenizer.get_tokenizer_kwarg('padding')) + max_length = kwargs.get( + 'max_length', + kwargs.get('sequence_length', + self.nlp_tokenizer.get_tokenizer_kwarg('max_length'))) + special_token = 1 if self.nlp_tokenizer.get_tokenizer_kwarg( + 'add_special_tokens') else 0 + if len(label_mask) > max_length - 2 * special_token: + label_mask = label_mask[:(max_length - 2 * special_token)] + input_ids = input_ids[:(max_length - 2 * special_token)] + offset_mapping = offset_mapping[:sum(label_mask)] + if padding == 'max_length': + label_mask = [False] * special_token + label_mask + \ + [False] * (max_length - len(label_mask) - special_token) + offset_mapping = offset_mapping + [(0, 0)] * ( + max_length - len(offset_mapping)) + input_ids = [self.nlp_tokenizer.tokenizer.cls_token_id] * special_token + input_ids + \ + [self.nlp_tokenizer.tokenizer.sep_token_id] * special_token + \ + [self.nlp_tokenizer.tokenizer.pad_token_id] * (max_length - len(input_ids) - 2 * special_token) + attention_mask = attention_mask + [1] * ( + special_token * 2) + [0] * ( + max_length - len(attention_mask) - 2 * special_token) else: - if self.tokenizer.is_fast: - encodings = self.tokenizer( - text, return_offsets_mapping=True, **self.tokenize_kwargs) - attention_mask = encodings['attention_mask'] - if 'token_type_ids' in encodings: - token_type_ids = encodings['token_type_ids'] - input_ids = encodings['input_ids'] - word_ids = 
encodings.word_ids() - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = ( - offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) - else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) + label_mask = [False] * special_token + label_mask + \ + [False] * special_token + input_ids = [self.nlp_tokenizer.tokenizer.cls_token_id] * special_token + input_ids + \ + [self.nlp_tokenizer.tokenizer.sep_token_id] * special_token + attention_mask = attention_mask + [1] * (special_token * 2) + + encodings = { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping, + } + return encodings, None + + def _tokenize_text_with_fast_tokenizer(self, tokens, **kwargs): + is_split_into_words = isinstance(tokens, list) + encodings = self.nlp_tokenizer( + tokens, + return_offsets_mapping=True, + is_split_into_words=is_split_into_words, + **kwargs) + label_mask = [] + word_ids = encodings.word_ids() + offset_mapping = [] + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(False) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(False) + if not is_split_into_words: + offset_mapping[-1] = (offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) else: - encodings = self.tokenizer(text, **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( - text) - - if self._mode == ModeKeys.INFERENCE: - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - - input_ids = torch.tensor(input_ids).unsqueeze(0) - attention_mask = torch.tensor(attention_mask).unsqueeze(0) - label_mask = torch.tensor( - label_mask, dtype=torch.bool).unsqueeze(0) - - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } + label_mask.append(True) + if is_split_into_words: + offset_mapping.append((word_ids[i], word_ids[i] + 1)) + else: + offset_mapping.append(encodings['offset_mapping'][i]) + + padding = self.nlp_tokenizer.get_tokenizer_kwarg('padding') + if padding == 'max_length': + offset_mapping = offset_mapping + [(0, 0)] * ( + len(label_mask) - len(offset_mapping)) + encodings['offset_mapping'] = offset_mapping + encodings['label_mask'] = label_mask + return encodings, word_ids + + def _tokenize_text_with_slow_tokenizer(self, tokens, **kwargs): + assert self.mode == ModeKeys.INFERENCE and isinstance(tokens, str), \ + 'Slow tokenizer now only support str input in inference mode. If you are training models, ' \ + 'please consider using the fast tokenizer.' 
+ word_ids = None + encodings = self.nlp_tokenizer( + tokens, is_split_into_words=False, **kwargs) + tokenizer_name = self.nlp_tokenizer.get_tokenizer_class() + method = 'get_label_mask_and_offset_mapping_' + tokenizer_name + if not hasattr(self, method): + raise RuntimeError( + f'No `{method}` method defined for ' + f'tokenizer {tokenizer_name}, please use a fast tokenizer instead, or ' + f'try to implement a `{method}` method') + label_mask, offset_mapping = getattr(self, method)(tokens) + padding = self.nlp_tokenizer.get_tokenizer_kwarg('padding') + max_length = self.nlp_tokenizer.get_tokenizer_kwarg('max_length') + special_token = 1 if self.nlp_tokenizer.get_tokenizer_kwarg( + 'add_special_tokens') else 0 + if len(label_mask) > max_length - 2 * special_token: + label_mask = label_mask[:(max_length - 2 * special_token)] + offset_mapping = offset_mapping[:sum(label_mask)] + if padding == 'max_length': + label_mask = [False] * special_token + label_mask + \ + [False] * (max_length - len(label_mask) - special_token) + offset_mapping = offset_mapping + [(0, 0)] * ( + max_length - len(offset_mapping)) else: - output = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) - - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) - else: - label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels - output = { - k: np.array(v) if isinstance(v, list) else v - for k, v in output.items() - } - return output + label_mask = [False] * special_token + label_mask + \ + [False] * special_token + encodings['offset_mapping'] = offset_mapping + encodings['label_mask'] = label_mask + return encodings, word_ids - def get_tokenizer_class(self): - tokenizer_class = self.tokenizer.__class__.__name__ - if tokenizer_class.endswith( - 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': - tokenizer_class = tokenizer_class[:-4] - return tokenizer_class + def get_label_mask_and_offset_mapping_BertTokenizer(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.nlp_tokenizer.tokenizer.tokenize(text) + offset = 0 + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) + else: + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + + return label_mask, offset_mapping - def get_label_mask_and_offset_mapping(self, text): + def get_label_mask_and_offset_mapping_XLMRobertaTokenizer(self, text): label_mask = [] offset_mapping = 
[] - tokens = self.tokenizer.tokenize(text) + tokens = self.nlp_tokenizer.tokenizer.tokenize(text) offset = 0 - if self.get_tokenizer_class() == 'BertTokenizer': - for token in tokens: - is_start = (token[:2] != '##') - if is_start: - label_mask.append(True) - else: - token = token[2:] - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end last_is_blank = False - for token in tokens: - is_start = (token[0] == '▁') - if is_start: - token = token[1:] - label_mask.append(True) - if len(token) == 0: - last_is_blank = True - continue - else: - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if last_is_blank or is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - last_is_blank = False - else: - raise NotImplementedError - return label_mask, offset_mapping diff --git a/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py index a356cea7..f2ea73f6 100644 --- a/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py @@ -9,19 +9,23 @@ from modelscope.outputs import OutputKeys from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .token_classification_preprocessor import TokenClassificationPreprocessor +from .token_classification_preprocessor import \ + TokenClassificationTransformersPreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.thai_ner_tokenizer) -class NERPreprocessorThai(TokenClassificationPreprocessor): +class NERPreprocessorThai(TokenClassificationTransformersPreprocessor): - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: from pythainlp import word_tokenize - + if isinstance(data, str): + text = data + else: + text = data[self.first_sequence] segmented_data = ' '.join([ - w.strip(' ') for w in word_tokenize(text=data, engine='newmm') + w.strip(' ') for w in word_tokenize(text=text, engine='newmm') if w.strip(' ') != '' ]) output = super().__call__(segmented_data) @@ -31,12 +35,17 @@ class NERPreprocessorThai(TokenClassificationPreprocessor): @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.thai_wseg_tokenizer) -class WordSegmentationPreprocessorThai(TokenClassificationPreprocessor): +class WordSegmentationPreprocessorThai( + TokenClassificationTransformersPreprocessor): - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[Dict, 
str]) -> Dict[str, Any]: import regex - data = regex.findall(r'\X', data) + if isinstance(data, str): + text = data + else: + text = data[self.first_sequence] + data = regex.findall(r'\X', text) data = ' '.join([char for char in data]) output = super().__call__(data) diff --git a/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py index f8970d1a..c68d6c3b 100644 --- a/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py @@ -9,19 +9,23 @@ from modelscope.outputs import OutputKeys from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert -from .token_classification_preprocessor import TokenClassificationPreprocessor +from .token_classification_preprocessor import \ + TokenClassificationTransformersPreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.viet_ner_tokenizer) -class NERPreprocessorViet(TokenClassificationPreprocessor): +class NERPreprocessorViet(TokenClassificationTransformersPreprocessor): - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: + @type_assert(object, (str, dict)) + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: from pyvi import ViTokenizer - + if isinstance(data, str): + text = data + else: + text = data[self.first_sequence] seg_words = [ - t.strip(' ') for t in ViTokenizer.tokenize(data).split(' ') + t.strip(' ') for t in ViTokenizer.tokenize(text).split(' ') if t.strip(' ') != '' ] raw_words = [] diff --git a/modelscope/preprocessors/nlp/transformers_tokenizer.py b/modelscope/preprocessors/nlp/transformers_tokenizer.py new file mode 100644 index 00000000..2cec4b93 --- /dev/null +++ b/modelscope/preprocessors/nlp/transformers_tokenizer.py @@ -0,0 +1,112 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from collections.abc import Mapping + +import json +from transformers import AutoTokenizer + +from modelscope.metainfo import Models +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModeKeys +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = [ + 'NLPTokenizer', +] + + +class NLPTokenizer: + + def __init__(self, + model_dir: str = None, + model_type=None, + use_fast: bool = None, + tokenize_kwargs=None): + """The transformers tokenizer preprocessor base class. + + Any nlp preprocessor which uses the huggingface tokenizer can inherit from this class. + + Args: + model_dir (str, `optional`): The local path containing the files used to create a preprocessor. + use_fast (str, `optional`): Use the fast version of tokenizer + tokenize_kwargs (dict, `optional`): These args will be directly fed into the tokenizer. 
+ """ + self.model_dir = model_dir + self.model_type = model_type + self.tokenize_kwargs = tokenize_kwargs + if self.tokenize_kwargs is None: + self.tokenize_kwargs = {} + self._use_fast = use_fast + self._tokenizer = None + + @property + def tokenizer(self): + if self._tokenizer is None: + self._tokenizer = self.build_tokenizer() + return self._tokenizer + + @property + def use_fast(self): + if self._use_fast is None: + if self._use_fast is None and self.model_dir is None: + self._use_fast = False + elif self._use_fast is None and os.path.isfile( + os.path.join(self.model_dir, 'tokenizer_config.json')): + with open( + os.path.join(self.model_dir, 'tokenizer_config.json'), + 'r', + encoding='utf-8') as f: + json_config = json.load(f) + self._use_fast = json_config.get('use_fast') + self._use_fast = False if self._use_fast is None else self._use_fast + return self._use_fast + + def build_tokenizer(self): + """Build a tokenizer by the model type. + + NOTE: The fast tokenizers have a multi-thread problem, use it carefully. + + Returns: + The initialized tokenizer. + """ + # fast version lead to parallel inference failed + model_type = self.model_type + model_dir = self.model_dir + if model_type == Models.deberta_v2: + from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast + tokenizer = DebertaV2TokenizerFast if self.use_fast else DebertaV2Tokenizer + return tokenizer.from_pretrained( + model_dir) if model_dir is not None else tokenizer() + + if model_type in (Models.structbert, Models.gpt3, Models.palm, + Models.plug): + from transformers import BertTokenizer, BertTokenizerFast + tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer + return tokenizer.from_pretrained( + model_dir) if model_dir is not None else tokenizer() + elif model_type == Models.veco: + from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast + tokenizer = XLMRobertaTokenizerFast if self.use_fast else XLMRobertaTokenizer + return tokenizer.from_pretrained( + model_dir) if model_dir is not None else tokenizer() + + assert model_dir is not None + return AutoTokenizer.from_pretrained(model_dir, use_fast=self.use_fast) + + def __call__(self, text, text_pair=None, **kwargs): + kwargs['max_length'] = kwargs.get('max_length', + kwargs.pop('sequence_length', None)) + if kwargs['max_length'] is None: + kwargs.pop('max_length') + tokenize_kwargs = {k: v for k, v in self.tokenize_kwargs.items()} + tokenize_kwargs.update(kwargs) + kwargs.update(self.tokenize_kwargs) + return self.tokenizer(text, text_pair, **tokenize_kwargs) + + def get_tokenizer_kwarg(self, key, default_value=None): + if key in self.tokenize_kwargs: + return self.tokenize_kwargs[key] + return self.tokenizer.init_kwargs.get(key, default_value) diff --git a/modelscope/preprocessors/nlp/utils.py b/modelscope/preprocessors/nlp/utils.py new file mode 100644 index 00000000..bc097f3e --- /dev/null +++ b/modelscope/preprocessors/nlp/utils.py @@ -0,0 +1,100 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
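Before the helper functions below, a minimal sketch of how the new NLPTokenizer wrapper is intended to be driven; the checkpoint path is assumed for illustration, and the get_model_type lookup mirrors what the preprocessors above do.

    from modelscope.preprocessors.nlp.transformers_tokenizer import NLPTokenizer
    from modelscope.utils.hub import get_model_type

    model_dir = '/path/to/structbert-model'  # hypothetical local checkpoint
    tokenizer = NLPTokenizer(
        model_dir=model_dir,
        model_type=get_model_type(model_dir),  # same lookup the preprocessors use
        use_fast=False,
        tokenize_kwargs={'padding': 'max_length', 'max_length': 32})

    encoded = tokenizer('This is a test', return_tensors='pt')
    # tokenize_kwargs act as call-time defaults; per-call kwargs such as
    # return_tensors are merged on top of them inside __call__.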
+ +import os +from collections.abc import Mapping +from typing import Any, Dict, List, Tuple, Union + +import json +import numpy as np +from transformers import AutoTokenizer + +from modelscope.metainfo import Models +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.base import Preprocessor +from modelscope.utils.constant import ModeKeys +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['parse_text_and_label', 'labels_to_id'] + + +def parse_text_and_label(data, + mode, + first_sequence=None, + second_sequence=None, + label=None): + """Parse the input and return the sentences and labels. + + When input type is tuple or list and its size is 2: + If the pair param is False, data will be parsed as the first_sentence and the label, + else it will be parsed as the first_sentence and the second_sentence. + + Args: + data: The input data. + mode: The mode of the preprocessor + first_sequence: The key of the first sequence + second_sequence: The key of the second sequence + label: The key of the label + Returns: + The sentences and labels tuple. + """ + text_a, text_b, labels = None, None, None + if isinstance(data, str): + text_a = data + elif isinstance(data, tuple) or isinstance(data, list): + if len(data) == 3: + text_a, text_b, labels = data + elif len(data) == 2: + if mode == ModeKeys.INFERENCE: + text_a, text_b = data + else: + text_a, labels = data + elif isinstance(data, Mapping): + text_a = data.get(first_sequence) + text_b = data.get(second_sequence) + if label is None or isinstance(label, str): + labels = data.get(label) + else: + labels = [data.get(lb) for lb in label] + return text_a, text_b, labels + + +def labels_to_id(labels, output, label2id=None): + """Turn the labels to id with the type int or float. + + If the original label's type is str or int, the label2id mapping will try to convert it to the final label. + If the original label's type is float, or the label2id mapping does not exist, + the original label will be returned. + + Args: + label2id: An extra label2id mapping. If not provided, the label will not be translated to ids. + labels: The input labels. + output: The label id. + + Returns: + The final labels. + """ + + def label_can_be_mapped(label): + return isinstance(label, str) or isinstance(label, int) + + try: + if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ + and label2id is not None: + output[OutputKeys.LABELS] = [ + label2id[label] if label in label2id else label2id[str(label)] + for label in labels + ] + elif label_can_be_mapped(labels) and label2id is not None: + output[OutputKeys.LABELS] = label2id[ + labels] if labels in label2id else label2id[str(labels)] + elif labels is not None: + output[OutputKeys.LABELS] = labels + except KeyError as e: + logger.error( + f'Label {labels} cannot be found in the label mapping {label2id},' + f'which comes from the user input or the configuration files. ' + f'Please consider matching your labels with this mapping.') + raise e diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py new file mode 100644 index 00000000..a7d87674 --- /dev/null +++ b/modelscope/preprocessors/nlp/zero_shot_classification_preprocessor.py @@ -0,0 +1,74 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
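The two helpers above are small and pure, so their behaviour is easiest to show with a short sketch; the column names and label mapping here are made up for illustration.

    from modelscope.outputs import OutputKeys
    from modelscope.preprocessors.nlp.utils import labels_to_id, parse_text_and_label
    from modelscope.utils.constant import ModeKeys

    sample = {
        'premise': 'A dog runs.',
        'hypothesis': 'An animal moves.',
        'label': 'entailment',
    }
    text_a, text_b, label = parse_text_and_label(
        sample,
        ModeKeys.TRAIN,
        first_sequence='premise',
        second_sequence='hypothesis',
        label='label')
    # -> ('A dog runs.', 'An animal moves.', 'entailment')

    output = {}
    labels_to_id(
        label, output,
        label2id={'entailment': 0, 'neutral': 1, 'contradiction': 2})
    # output[OutputKeys.LABELS] == 0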
+ +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.hub import get_model_type +from .transformers_tokenizer import NLPTokenizer + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationTransformersPreprocessor(Preprocessor): + """The tokenizer preprocessor used in zero shot classification. + """ + + def __init__(self, + model_dir: str, + first_sequence=None, + mode=ModeKeys.INFERENCE, + sequence_length=512, + use_fast=None, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + self.sequence_length = sequence_length + model_type = None + if model_dir is not None: + model_type = get_model_type(model_dir) + self.nlp_tokenizer = NLPTokenizer( + model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) + self.first_sequence = first_sequence + super().__init__(mode=mode) + + def __call__(self, + data: Union[str, Dict], + hypothesis_template: str, + candidate_labels: list, + padding=True, + truncation=True, + truncation_strategy='only_first', + **kwargs) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str or dict): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + if 'return_tensors' not in kwargs: + kwargs[ + 'return_tensors'] = 'pt' if self._mode == ModeKeys.INFERENCE else None + + features = self.nlp_tokenizer( + pairs, + padding=padding, + truncation=truncation, + truncation_strategy=truncation_strategy, + **kwargs) + return features diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py deleted file mode 100644 index eb3c4b37..00000000 --- a/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict, Union - -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys -from .nlp_base import NLPTokenizerPreprocessorBase - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in zero shot classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict], hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str or dict): a sentence - Example: - 'you are so handsome.' 
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-        if isinstance(data, dict):
-            data = data.get(self.first_sequence)
-
-        pairs = [[data, hypothesis_template.format(label)]
-                 for label in candidate_labels]
-
-        features = self.tokenizer(
-            pairs,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            truncation_strategy='only_first',
-            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None)
-        return features
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index 89aa39ba..91b4ef8b 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -6,11 +6,12 @@ import numpy as np
 import torch
 
 from modelscope import __version__
-from modelscope.metainfo import Hooks
-from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint
+from modelscope.metainfo import Hooks, Pipelines
+from modelscope.utils.checkpoint import (load_checkpoint, save_checkpoint,
+                                         save_configuration)
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import get_dist_info, is_master
+from modelscope.utils.torch_utils import is_master
 from .builder import HOOKS
 from .hook import Hook
 from .priority import Priority
@@ -28,17 +29,25 @@ class CheckpointHook(Hook):
         save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
         save_last (bool): Whether to save the last checkpoint. Default: True.
         checkpoint_file (str): The checkpoint file to be loaded.
+        load_all_state (bool): Whether to load all states (optimizer, epoch, lr_scheduler, random_state, etc.)
+            when loading an old training state file. If False, only the model's state dict will be loaded.
+        max_checkpoint_num (int): The max number of checkpoint files, default None, which means never delete
+            anything. If the number exceeds the limit, earlier checkpoints will be deleted first.
""" PRIORITY = Priority.LOW - def __init__(self, - interval=0, - by_epoch=True, - save_optimizer=True, - save_dir=None, - save_last=True, - checkpoint_file=None): + def __init__( + self, + interval=0, + by_epoch=True, + save_optimizer=True, + save_dir=None, + save_last=True, + checkpoint_file=None, + load_all_state=True, + max_checkpoint_num=None, + ): self.interval = interval self.by_epoch = by_epoch self.save_optimizer = save_optimizer @@ -47,6 +56,11 @@ class CheckpointHook(Hook): self.save_last = save_last self.rng_state = None self.need_load_rng_state = False + self.load_all_state = load_all_state + self.max_checkpoint_num = None + if max_checkpoint_num is not None: + self.max_checkpoint_num = max(int(max_checkpoint_num), 1) + self.history_checkpoints = [] def before_run(self, trainer): if not self.save_dir: @@ -65,9 +79,10 @@ class CheckpointHook(Hook): if self.checkpoint_file is not None and os.path.isfile( self.checkpoint_file): - meta = self.load_checkpoint(self.checkpoint_file, trainer) + meta = self.load_checkpoint(self.checkpoint_file, trainer, + self.load_all_state) self.rng_state = meta.get('rng_state') - self.need_load_rng_state = True + self.need_load_rng_state = self.load_all_state def before_train_iter(self, trainer): if self.need_load_rng_state: @@ -95,28 +110,30 @@ class CheckpointHook(Hook): self._save_checkpoint(trainer) @classmethod - def load_checkpoint(cls, filename, trainer): + def load_checkpoint(cls, filename, trainer, load_all_state=True): from modelscope.trainers.parallel.utils import is_parallel if is_parallel(trainer.model): model = trainer.model.module else: model = trainer.model - meta = load_checkpoint(filename, model, - getattr(trainer, 'optimizer', None), - getattr(trainer, 'lr_scheduler', None)) - trainer._epoch = meta.get('epoch', trainer._epoch) - trainer._iter = meta.get('iter', trainer._iter) - trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter) - - for i, hook in enumerate(trainer.hooks): - # hook: Hook - key = f'{hook.__class__}-{i}' - if key in meta and hasattr(hook, 'load_state_dict'): - hook.load_state_dict(meta.get(key, {})) - else: - trainer.logger.warn( - f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.' - ) + meta = load_checkpoint( + filename, model, + getattr(trainer, 'optimizer', None) if load_all_state else None, + getattr(trainer, 'lr_scheduler', None) if load_all_state else None) + if load_all_state: + trainer._epoch = meta.get('epoch', trainer._epoch) + trainer._iter = meta.get('iter', trainer._iter) + trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter) + + for i, hook in enumerate(trainer.hooks): + # hook: Hook + key = f'{hook.__class__}-{i}' + if key in meta and hasattr(hook, 'load_state_dict'): + hook.load_state_dict(meta.get(key, {})) + else: + trainer.logger.warn( + f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.' 
+ ) version = meta.get('modelscope') if version != __version__: @@ -163,6 +180,21 @@ class CheckpointHook(Hook): and not self.by_epoch): self._save_pretrained(trainer) + self.history_checkpoints.append(cur_save_name) + self.remove_obsolete_checkpoints() + + def remove_obsolete_checkpoints(self): + if self.max_checkpoint_num is not None and \ + len(self.history_checkpoints) > self.max_checkpoint_num: + history_checkpoints = [ckpt for ckpt in self.history_checkpoints] + self.history_checkpoints.clear() + for i, ckpt_file in enumerate(history_checkpoints): + if i < len(history_checkpoints) - self.max_checkpoint_num: + if os.path.isfile(ckpt_file): + os.remove(ckpt_file) + else: + self.history_checkpoints.append(ckpt_file) + def _save_pretrained(self, trainer): output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR) from modelscope.trainers.parallel.utils import is_parallel @@ -175,15 +207,53 @@ class CheckpointHook(Hook): config = trainer.cfg.to_dict() # override pipeline by tasks name after finetune done, # avoid case like fill mask pipeline with a text cls task - config['pipeline'] = {'type': config['task']} + if config['task'] in [ + getattr(Pipelines, attr) for attr in dir(Pipelines) + if not attr.startswith('__') + ]: + # TODO a temp fix to avoid pipeline_name and task mismatch + config['pipeline'] = {'type': config['task']} + + class SaveConfig: + + def __init__(self, output_dir, config): + self.output_dir = output_dir + self.config = config + + def __call__(self, _output_dir, _config): + self.config = _config + + def save_config(self): + save_configuration(self.output_dir, self.config) + + save_config_fn = SaveConfig(output_dir, config) if hasattr(model, 'save_pretrained'): + # Now support two binary files: pytorch_model.bin and pytorch_model.pt + default_bin_file = ModelFile.TORCH_MODEL_BIN_FILE + if hasattr( + model, + 'model_dir') and ModelFile.TORCH_MODEL_FILE in os.listdir( + model.model_dir): + default_bin_file = ModelFile.TORCH_MODEL_FILE model.save_pretrained( output_dir, - ModelFile.TORCH_MODEL_BIN_FILE, + default_bin_file, save_function=save_checkpoint, - config=config, + config=save_config_fn.config, + save_config_function=save_config_fn, with_meta=False) + if trainer.train_preprocessor is not None: + trainer.train_preprocessor.save_pretrained( + output_dir, + save_config_fn.config, + save_config_function=save_config_fn) + if trainer.eval_preprocessor is not None: + trainer.eval_preprocessor.save_pretrained( + output_dir, + save_config_fn.config, + save_config_function=save_config_fn) + save_config_fn.save_config() def after_train_iter(self, trainer): if self.by_epoch: @@ -222,6 +292,9 @@ class BestCkptSaverHook(CheckpointHook): save_optimizer (bool): Whether to save optimizer state dict. Default: True. save_dir (str): Output directory to save best checkpoint. restore_best (bool): Whether to restore the best checkpoint after training. + max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything. + If the number exceeding the limit, checkpoints with worse metric will be deleted, which is judged by the + `rule` and `metric_key` arguments. """ PRIORITY = Priority.LOW @@ -235,13 +308,17 @@ class BestCkptSaverHook(CheckpointHook): save_dir=None, save_file_name=None, restore_best=False, - interval=0): + max_checkpoint_num=1, + interval=0, + **kwargs): assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.' 
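For context, the new checkpoint-retention switches above are normally driven from the trainer configuration rather than built by hand; a hedged sketch of such a hook entry follows (the surrounding cfg keys follow the usual modelscope configuration layout and are not part of this diff).

    # Fragment of a trainer configuration dict (sketch only).
    cfg_fragment = {
        'train': {
            'hooks': [{
                'type': 'CheckpointHook',
                'by_epoch': True,
                'interval': 1,
                'max_checkpoint_num': 2,   # keep only the two newest checkpoints
                'load_all_state': False,   # resume weights only, skip optimizer/epoch state
            }]
        }
    }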
super().__init__( interval=interval, by_epoch=by_epoch, save_optimizer=save_optimizer, save_dir=save_dir, + max_checkpoint_num=max_checkpoint_num, + **kwargs, ) self.metric_key = metric_key self.rule = rule @@ -249,6 +326,7 @@ class BestCkptSaverHook(CheckpointHook): self._best_ckpt_file = None self.save_file_name = save_file_name self.restore_best = restore_best + self.history_checkpoints = set() def _should_save(self, trainer): return self._is_best_metric(trainer.metric_values) @@ -284,6 +362,10 @@ class BestCkptSaverHook(CheckpointHook): self.save_dir, f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth' ) + else: + if '.' not in cur_save_name: + cur_save_name = f'{cur_save_name}.pth' + cur_save_name = os.path.join(self.save_dir, cur_save_name) meta = { 'epoch': trainer.epoch, @@ -300,6 +382,28 @@ class BestCkptSaverHook(CheckpointHook): trainer.lr_scheduler, meta) self._best_ckpt_file = cur_save_name self._save_pretrained(trainer) + self.history_checkpoints.add(cur_save_name) + self.remove_obsolete_checkpoints() + + def remove_obsolete_checkpoints(self): + + def extract_metric_from_filename(name1): + metric1 = float(name1.split(self.metric_key)[1].split('.')[0]) + if self.rule == 'max': + return -metric1 + else: + return metric1 + + if self.max_checkpoint_num is not None and \ + len(self.history_checkpoints) > self.max_checkpoint_num: + history_checkpoints = sorted( + self.history_checkpoints, key=extract_metric_from_filename) + self.history_checkpoints.clear() + for i, ckpt_file in enumerate(history_checkpoints): + if i < self.max_checkpoint_num: + self.history_checkpoints.add(ckpt_file) + elif os.path.isfile(ckpt_file): + os.remove(ckpt_file) def state_dict(self): return { diff --git a/modelscope/trainers/nlp/text_generation_trainer.py b/modelscope/trainers/nlp/text_generation_trainer.py index f02faf71..fa6a448f 100644 --- a/modelscope/trainers/nlp/text_generation_trainer.py +++ b/modelscope/trainers/nlp/text_generation_trainer.py @@ -14,8 +14,8 @@ from modelscope.utils.file_utils import func_receive_dict_inputs class TextGenerationTrainer(NlpEpochBasedTrainer): def _decode(self, tokens): - tokenizer = self.eval_preprocessor.tokenizer - return tokenizer.decode(tokens.tolist(), skip_special_tokens=True) + return self.eval_preprocessor.decode( + tokens.tolist(), skip_special_tokens=True) def evaluation_step(self, data): model = self.model.module if self._dist else self.model diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 65e56f9e..5ce7c2f5 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -426,77 +426,51 @@ class NlpTrainerArguments: @TRAINERS.register_module(module_name=Trainers.nlp_base_trainer) class NlpEpochBasedTrainer(EpochBasedTrainer): + """Add code to adapt with nlp models. + + This trainer will accept the information of labels&text keys in the cfg, and then initialize + the nlp models/preprocessors with this information. + + Labels&text key information may be carried in the cfg like this: + + >>> cfg = { + >>> ... 
+ >>> "dataset": { + >>> "train": { + >>> "first_sequence": "text1", + >>> "second_sequence": "text2", + >>> "label": "label", + >>> "labels": [1, 2, 3, 4], + >>> }, + >>> "val": { + >>> "first_sequence": "text3", + >>> "second_sequence": "text4", + >>> "label": "label2", + >>> }, + >>> } + >>> } + + To view some actual finetune examples, please check the test files listed below: + tests/trainers/test_finetune_sequence_classification.py + tests/trainers/test_finetune_token_classification.py + """ - def __init__( - self, - model: Optional[Union[TorchModel, nn.Module, str]] = None, - cfg_file: Optional[str] = None, - cfg_modify_fn: Optional[Callable] = None, - arg_parse_fn: Optional[Callable] = None, - data_collator: Optional[Callable] = None, - train_dataset: Optional[Union[MsDataset, Dataset]] = None, - eval_dataset: Optional[Union[MsDataset, Dataset]] = None, - preprocessor: Optional[Preprocessor] = None, - optimizers: Tuple[torch.optim.Optimizer, - torch.optim.lr_scheduler._LRScheduler] = (None, - None), - model_revision: Optional[str] = DEFAULT_MODEL_REVISION, - **kwargs): - """Add code to adapt with nlp models. - - This trainer will accept the information of labels&text keys in the cfg, and then initialize - the nlp models/preprocessors with this information. - - Labels&text key information may be carried in the cfg like this: - - >>> cfg = { - >>> ... - >>> "dataset": { - >>> "train": { - >>> "first_sequence": "text1", - >>> "second_sequence": "text2", - >>> "label": "label", - >>> "labels": [1, 2, 3, 4] - >>> } - >>> } - >>> } - - - Args: - cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. - - Example: - >>> def cfg_modify_fn(cfg): - >>> cfg.preprocessor.first_sequence= 'text1' - >>> cfg.preprocessor.second_sequence='text2' - >>> return cfg - - To view some actual finetune examples, please check the test files listed below: - tests/trainers/test_finetune_sequence_classification.py - tests/trainers/test_finetune_token_classification.py - """ - - if isinstance(model, str): - model_dir = self.get_or_download_model_dir(model, model_revision) - if cfg_file is None: - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) - else: - assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' 
- model_dir = os.path.dirname(cfg_file) - + def __init__(self, *args, **kwargs): self.label2id = None self.id2label = None self.num_labels = None - self.cfg_modify_fn = cfg_modify_fn - self.cfg = self.rebuild_config(Config.from_file(cfg_file)) + self.train_keys = None + self.eval_keys = None + super().__init__(*args, **kwargs) + def prepare_labels(self, cfg): try: - labels = self.cfg.dataset.train.labels + labels = cfg.dataset.train.labels self.label2id = {label: idx for idx, label in enumerate(labels)} self.id2label = {idx: label for idx, label in enumerate(labels)} self.num_labels = len(labels) except AttributeError: - label2id = parse_label_mapping(model_dir) + label2id = parse_label_mapping(self.model_dir) if label2id is not None: self.label2id = label2id self.id2label = {id: label for label, id in label2id.items()} @@ -514,30 +488,15 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): return {k: v for k, v in input_keys.items() if v is not None} - self.train_keys = build_dataset_keys( - self.cfg.dataset.train if hasattr(self.cfg, 'dataset') - and hasattr(self.cfg.dataset, 'train') else None) - self.eval_keys = build_dataset_keys( - self.cfg.dataset.val if hasattr(self.cfg, 'dataset') - and hasattr(self.cfg.dataset, 'val') else None) + self.train_keys = build_dataset_keys(cfg.safe_get('dataset.train')) + self.eval_keys = build_dataset_keys(cfg.safe_get('dataset.val')) if len(self.eval_keys) == 0: self.eval_keys = self.train_keys - super().__init__( - model=model_dir, - cfg_file=cfg_file, - arg_parse_fn=arg_parse_fn, - data_collator=data_collator, - preprocessor=preprocessor, - optimizers=optimizers, - model_revision=model_revision, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - **kwargs) - def rebuild_config(self, cfg: Config): if self.cfg_modify_fn is not None: cfg = self.cfg_modify_fn(cfg) + self.prepare_labels(cfg) if not hasattr(cfg.model, 'label2id') and not hasattr( cfg.model, 'id2label'): if self.id2label is not None: @@ -571,6 +530,8 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): Returns: The preprocessor instance. """ + + # Compatible with old logic model_args = {} if self.label2id is None else { 'label2id': self.label2id } diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index db5f6a9c..172cd6a8 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -74,12 +74,20 @@ class EpochBasedTrainer(BaseTrainer): containing the optimizer and the scheduler to use. seed (int): The optional random seed for torch, cuda, numpy and random. max_epochs: (int, optional): Total training epochs. + cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. 
+ + Examples of cfg_modify_fn: + >>> def cfg_modify_fn(cfg): + >>> cfg.preprocessor.first_sequence= 'text1' + >>> cfg.preprocessor.second_sequence='text2' + >>> return cfg """ def __init__( self, model: Optional[Union[TorchModel, nn.Module, str]] = None, cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, arg_parse_fn: Optional[Callable] = None, data_collator: Optional[Union[Callable, Dict[str, Callable]]] = None, @@ -96,6 +104,14 @@ class EpochBasedTrainer(BaseTrainer): self._seed = seed set_random_seed(self._seed) + self._metric_values = None + self.optimizers = optimizers + self._mode = ModeKeys.TRAIN + self._hooks: List[Hook] = [] + self._epoch = 0 + self._iter = 0 + self._inner_iter = 0 + if isinstance(model, str): self.model_dir = self.get_or_download_model_dir( model, model_revision) @@ -107,11 +123,11 @@ class EpochBasedTrainer(BaseTrainer): self.model_dir = os.path.dirname(cfg_file) super().__init__(cfg_file, arg_parse_fn) - + self.cfg_modify_fn = cfg_modify_fn # add default config merge_cfg(self.cfg) self.cfg = self.rebuild_config(self.cfg) - + self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) if 'cfg_options' in kwargs: self.cfg.merge_from_dict(kwargs['cfg_options']) @@ -125,110 +141,136 @@ class EpochBasedTrainer(BaseTrainer): else: self.work_dir = self.cfg.train.get('work_dir', './work_dir') - self.train_preprocessor, self.eval_preprocessor = None, None - if isinstance(preprocessor, Preprocessor): - self.train_preprocessor = preprocessor - self.eval_preprocessor = preprocessor - elif isinstance(preprocessor, Mapping): - if not (ConfigKeys.train in preprocessor - or ConfigKeys.val in preprocessor): - raise ValueError( - f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' 
- ) - if ConfigKeys.train in preprocessor: - assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) - self.train_preprocessor = preprocessor[ConfigKeys.train] - if ConfigKeys.val in preprocessor: - assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) - self.eval_preprocessor = preprocessor[ConfigKeys.val] - elif hasattr(self.cfg, ConfigFields.preprocessor - ) and self.cfg.preprocessor is not None: - self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( - ) - - if self.train_preprocessor is not None: - self.train_preprocessor.mode = ModeKeys.TRAIN - if self.eval_preprocessor is not None: - self.eval_preprocessor.mode = ModeKeys.EVAL + self.train_preprocessor, self.eval_preprocessor = self.get_preprocessors( + preprocessor) - if kwargs.get('launcher', None) is not None: - init_dist(kwargs['launcher']) - - _, world_size = get_dist_info() - self._dist = world_size > 1 + self._dist = self.init_dist(kwargs.get('launcher')) + self.device = self.get_device(kwargs.get('device')) - device_name = kwargs.get('device', 'gpu') - if self._dist: - local_rank = get_local_rank() - device_name = f'cuda:{local_rank}' - - self.device = create_device(device_name) self.train_dataset = self.to_task_dataset( train_dataset, mode=ModeKeys.TRAIN, - task_data_config=self.cfg.dataset.get('train', None) if hasattr( - self.cfg, 'dataset') else None, + task_data_config=self.cfg.safe_get('dataset.train'), preprocessor=self.train_preprocessor, **kwargs) self.eval_dataset = self.to_task_dataset( eval_dataset, mode=ModeKeys.EVAL, - task_data_config=self.cfg.dataset.get('val', None) if hasattr( - self.cfg, 'dataset') else None, + task_data_config=self.cfg.safe_get('dataset.val'), preprocessor=self.eval_preprocessor, **kwargs) - self.train_data_collator, self.eval_data_collator = None, None + self.train_data_collator, self.eval_data_collator = self.get_data_collator( + data_collator) + self.metrics = self.get_metrics() + self._max_epochs = kwargs.get('max_epochs', + self.cfg.safe_get('train.max_epochs')) + assert self._max_epochs is not None, 'max_epochs should be provided by the init arguments or configured ' \ + 'in the `train.max_epochs` key in the configuration file.' + self._train_iters_per_epoch = kwargs.get( + 'train_iters_per_epoch', + self.cfg.safe_get('train.train_iters_per_epoch')) + self._eval_iters_per_epoch = kwargs.get( + 'val_iters_per_epoch', + self.cfg.safe_get('evaluation.val_iters_per_epoch')) + self.use_fp16 = kwargs.get('use_fp16', False) + # model placement + self.place_model() + + def place_model(self): + """Place model to device, or to DDP + """ + if self.device.type == 'cuda': + self.model.to(self.device) + if not is_parallel(self.model) and self._dist: + self.model = self.to_parallel(self.model) + + def get_data_collator(self, data_collator): + """Get the data collator for both training and evaluating. + + Args: + data_collator: The input data_collator param. + + Returns: + The train_data_collator and eval_data_collator, can be None. + """ + + train_data_collator, eval_data_collator = None, None if isinstance(data_collator, Mapping): - if not (ConfigKeys.train in data_collator - or ConfigKeys.val in data_collator): - raise ValueError( - f'data_collator must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' 
- ) if ConfigKeys.train in data_collator: assert isinstance(data_collator[ConfigKeys.train], Callable) - self.train_data_collator = data_collator[ConfigKeys.train] + train_data_collator = data_collator[ConfigKeys.train] if ConfigKeys.val in data_collator: assert isinstance(data_collator[ConfigKeys.val], Callable) - self.eval_data_collator = data_collator[ConfigKeys.val] + eval_data_collator = data_collator[ConfigKeys.val] else: collate_fn = default_collate if data_collator is None else data_collator - self.train_data_collator = collate_fn - self.eval_data_collator = collate_fn + train_data_collator = collate_fn + eval_data_collator = collate_fn + return train_data_collator, eval_data_collator - self.metrics = self.get_metrics() - self._metric_values = None - self.optimizers = optimizers - self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) - self._mode = ModeKeys.TRAIN - self._hooks: List[Hook] = [] - self._epoch = 0 - self._iter = 0 - self._inner_iter = 0 - if 'max_epochs' not in kwargs: - assert hasattr( - self.cfg.train, - 'max_epochs'), 'max_epochs is missing in configuration file' - self._max_epochs = self.cfg.train.max_epochs - else: - self._max_epochs = kwargs['max_epochs'] - self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) - self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) - if self._train_iters_per_epoch is None and hasattr( - self.cfg.train, 'train_iters_per_epoch'): - self._train_iters_per_epoch = self.cfg.train.train_iters_per_epoch - if self._eval_iters_per_epoch is None and hasattr( - self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, - 'val_iters_per_epoch'): - self._eval_iters_per_epoch = self.cfg.evaluation.val_iters_per_epoch + def init_dist(self, launcher=None): + """Init dist and returns the dist information. - self.use_fp16 = kwargs.get('use_fp16', False) + Args: + launcher: The launcher info. - # model placement - if self.device.type == 'cuda': - self.model.to(self.device) - if not is_parallel(self.model) and self._dist: - self.model = self.to_parallel(self.model) + Returns: + _dist: If world_size is greater than 1. + """ + if launcher is not None: + init_dist(launcher) + + _, world_size = get_dist_info() + _dist = world_size > 1 + return _dist + + def get_device(self, device=None): + """Get the device information. + + Args: + device: The input device info. + + Returns: + device_name: The final device name. + """ + device_name = device if device is not None else 'gpu' + if self._dist: + local_rank = get_local_rank() + device_name = f'cuda:{local_rank}' + + return create_device(device_name) + + def get_preprocessors(self, preprocessor): + """Get the preprocessors information. + + Args: + preprocessor: The input preprocessor info. + + Returns: + The train_preprocessor and eval_preprocessor, can be None. 
+ """ + train_preprocessor = None + eval_preprocessor = None + if isinstance(preprocessor, Preprocessor): + train_preprocessor = preprocessor + eval_preprocessor = preprocessor + elif isinstance(preprocessor, Mapping): + if ConfigKeys.train in preprocessor: + assert isinstance(preprocessor[ConfigKeys.train], Callable) + train_preprocessor = preprocessor[ConfigKeys.train] + if ConfigKeys.val in preprocessor: + assert isinstance(preprocessor[ConfigKeys.val], Callable) + eval_preprocessor = preprocessor[ConfigKeys.val] + elif hasattr(self.cfg, ConfigFields.preprocessor + ) and self.cfg.preprocessor is not None: + train_preprocessor, eval_preprocessor = self.build_preprocessor() + + if train_preprocessor is not None: + train_preprocessor.mode = ModeKeys.TRAIN + if eval_preprocessor is not None: + eval_preprocessor.mode = ModeKeys.EVAL + return train_preprocessor, eval_preprocessor def rebuild_config(self, cfg: Config): """A method used to rebuild the config, any subclass can override this method. @@ -236,6 +278,8 @@ class EpochBasedTrainer(BaseTrainer): Returns: The rebuilt config """ + if self.cfg_modify_fn is not None: + cfg = self.cfg_modify_fn(cfg) return cfg @property diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index 6e4e7a19..87e0abc7 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -4,6 +4,7 @@ import logging import os import pickle import shutil +from collections.abc import Mapping import torch from torch import distributed as dist @@ -58,7 +59,7 @@ def single_gpu_test(trainer, if progress_with_iters: batch_size = 1 # iteration count else: - if isinstance(data, dict): + if isinstance(data, Mapping): if 'nsentences' in data: batch_size = data['nsentences'] else: @@ -138,7 +139,7 @@ def multi_gpu_test(trainer, result = trainer.evaluation_step(data) results.append(result) - if isinstance(data, dict): + if isinstance(data, Mapping): if 'nsentences' in data: batch_size = data['nsentences'] else: diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 5acaa411..e21c3dcc 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -5,7 +5,7 @@ import os import time from collections import OrderedDict from shutil import copytree, ignore_patterns, rmtree -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, Optional, Union import json import torch @@ -137,11 +137,18 @@ def load_checkpoint(filename, return checkpoint.get('meta', {}) +def save_configuration(target_folder, config: Dict): + if ConfigFields.pipeline not in config: + config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} + cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) + config_file = os.path.join(target_folder, ModelFile.CONFIGURATION) + storage.write(cfg_str.encode(), config_file) + + def save_pretrained(model, target_folder: Union[str, os.PathLike], save_checkpoint_name: str = None, save_function: Callable = None, - config: Optional[dict] = None, **kwargs): """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded @@ -154,11 +161,8 @@ def save_pretrained(model, save_checkpoint_name (str): The checkpoint name to be saved in the target_folder - save_function (Callable, optional): + save_function (Callable): The function to use to save the state dictionary. 
- - config (Optional[dict], optional): - The config for the configuration.json, might not be identical with model.config """ if save_function is None or not isinstance(save_function, Callable): @@ -173,9 +177,6 @@ def save_pretrained(model, raise Exception( 'At least pass in one checkpoint name for saving method') - if config is None: - raise ValueError('Configuration is not valid') - # Clean the folder from a previous save if os.path.exists(target_folder): rmtree(target_folder) @@ -201,10 +202,3 @@ def save_pretrained(model, raise Exception( f'During saving checkpoints, the error of "{type(e).__name__} ' f'with msg {e} throwed') - - # Dump the config to the configuration.json - if ConfigFields.pipeline not in config: - config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} - cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) - config_file = os.path.join(target_folder, ModelFile.CONFIGURATION) - storage.write(cfg_str.encode(), config_file) diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index b3512251..71d820e5 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -3,6 +3,7 @@ # https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py import copy +import dataclasses import os import os.path as osp import platform @@ -10,6 +11,7 @@ import shutil import sys import tempfile import types +from dataclasses import fields from pathlib import Path from types import FunctionType from typing import Dict, Union @@ -337,6 +339,37 @@ class Config: super(Config, self).__setattr__('_filename', _filename) super(Config, self).__setattr__('_text', _text) + def safe_get(self, key_chain: str, default=None): + """Get a value with a key-chain in str format, if key does not exist, the default value will be returned. + + This method is safe to call, and will not edit any value. + + Args: + key_chain: The input key chain, for example: 'train.hooks[0].type' + default: The default value returned when any key does not exist, default None. + + Returns: + The value, or the default value. + """ + try: + keys = key_chain.split('.') + _cfg_dict = self._cfg_dict + for key in keys: + val = None + if '[' in key: + key, val = key.split('[') + val, _ = val.split(']') + _cfg_dict = getattr(_cfg_dict, key) + if val is not None: + _cfg_dict = _cfg_dict[int(val)] + return _cfg_dict + except Exception as e: + logger.debug( + f'Key not valid in Config: {key_chain}, return the default value: {default}' + ) + logger.debug(e) + return default + def dump(self, file: str = None): """Dumps config into a file or returns a string representation of the config. 
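The `safe_get` helper added above walks a dotted key chain (optionally with `[index]` segments such as `train.hooks[0].type`) through the underlying config dict and returns `default` instead of raising when any key along the chain is missing. A minimal usage sketch, assuming the dict-style `Config` constructor inherited from the mmcv-style config this module is adapted from:

    from modelscope.utils.config import Config

    # Hypothetical configuration content, for illustration only.
    cfg = Config({
        'train': {
            'max_epochs': 3,
            'hooks': [{'type': 'CheckpointHook'}, {'type': 'EvaluationHook'}],
        }
    })

    assert cfg.safe_get('train.max_epochs') == 3
    assert cfg.safe_get('train.hooks[1].type') == 'EvaluationHook'
    # A missing key chain does not raise; the supplied default comes back instead.
    assert cfg.safe_get('evaluation.metrics', default=['accuracy']) == ['accuracy']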
@@ -635,16 +668,6 @@ def check_config(cfg: Union[str, ConfigDict], is_training=False): check_attr(ConfigFields.evaluation) -def use_task_specific_params(model, task): - """Update config with summarization specific params.""" - task_specific_params = model.config.task_specific_params - - if task_specific_params is not None: - pars = task_specific_params.get(task, {}) - logger.info(f'using task specific params for {task}: {pars}') - model.config.update(pars) - - class JSONIteratorEncoder(json.JSONEncoder): """Implement this method in order that supporting arbitrary iterators, it returns a serializable object for ``obj``, or calls the base implementation diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 87a6eaff..7841e1fa 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -56,8 +56,10 @@ def read_config(model_id_or_path: str, if not os.path.exists(model_id_or_path): local_path = model_file_download( model_id_or_path, ModelFile.CONFIGURATION, revision=revision) - else: + elif os.path.isdir(model_id_or_path): local_path = os.path.join(model_id_or_path, ModelFile.CONFIGURATION) + elif os.path.isfile(model_id_or_path): + local_path = model_id_or_path return Config.from_file(local_path) diff --git a/modelscope/utils/nlp/utils.py b/modelscope/utils/nlp/utils.py index 13a21480..3295b5d5 100644 --- a/modelscope/utils/nlp/utils.py +++ b/modelscope/utils/nlp/utils.py @@ -1,5 +1,7 @@ import os.path as osp +from modelscope.utils.hub import parse_label_mapping + def import_external_nltk_data(nltk_data_dir, package_name): """import external nltk_data, and extract nltk zip package. @@ -18,3 +20,49 @@ def import_external_nltk_data(nltk_data_dir, package_name): import zipfile with zipfile.ZipFile(filepath) as zf: zf.extractall(osp.join(packagepath)) + + +def parse_labels_in_order(model_dir=None, cfg=None, **kwargs): + """Parse labels information in order. + + This is a helper function, used to get labels information in the correct order. + 1. The kw arguments listed in the method will in the first priority. + 2. Information in the cfg.dataset.train.labels will be used in the second priority (Compatible with old logic). + 3. Information in other files will be used then. + + Args: + model_dir: The model_dir used to call `parse_label_mapping`. + cfg: An optional cfg parsed and modified from the configuration.json. + **kwargs: The user inputs into the method. + + Returns: + The modified kwargs. + """ + label2id = kwargs.pop('label2id', None) + id2label = kwargs.pop('id2label', None) + num_labels = kwargs.pop('num_labels', None) + if label2id is None and id2label is not None: + label2id = {label: id for id, label in id2label.items()} + if label2id is None: + if cfg is not None and cfg.safe_get( + 'dataset.train.labels') is not None: + # An extra logic to parse labels from the dataset area. 
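+            # Label ids follow the order in which the labels are listed in the configuration.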
+ label2id = { + label: idx + for idx, label in enumerate( + cfg.safe_get('dataset.train.labels')) + } + elif model_dir is not None: + label2id = parse_label_mapping(model_dir) + + if num_labels is None and label2id is not None: + num_labels = len(label2id) + if id2label is None and label2id is not None: + id2label = {id: label for label, id in label2id.items()} + if num_labels is not None: + kwargs['num_labels'] = num_labels + if label2id is not None: + kwargs['label2id'] = label2id + if id2label is not None: + kwargs['id2label'] = id2label + return kwargs diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 5284aa43..38071bb8 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -64,8 +64,9 @@ class Registry(object): if group_key not in self._modules: self._modules[group_key] = dict() - if not inspect.isclass(module_cls): - raise TypeError(f'module is not a class type: {type(module_cls)}') + # Some registered module_cls can be function type. + # if not inspect.isclass(module_cls): + # raise TypeError(f'module is not a class type: {type(module_cls)}') if module_name is None: module_name = module_cls.__name__ diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 58b5b1a3..e7a47214 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -770,8 +770,6 @@ class IgnoreKeyFn: self.keys = keys if isinstance(keys, list) else [] def __call__(self, v1output, v2output, key, type): - if key == 'encoder.encoder.layer.0.intermediate.intermediate_act_fn': - print() for _key in self.keys: pattern = re.compile(_key) if key is not None and pattern.fullmatch(key): diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index dff411f6..81a87398 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -4,7 +4,7 @@ import unittest from modelscope.models import Model from modelscope.msdatasets import MsDataset -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.preprocessors.base import Preprocessor from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode from modelscope.utils.test_utils import require_tf, require_torch, test_level @@ -73,7 +73,7 @@ class MsDatasetTest(unittest.TestCase): def test_to_torch_dataset_text(self): model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny' nlp_model = Model.from_pretrained(model_id) - preprocessor = SequenceClassificationPreprocessor( + preprocessor = TextClassificationTransformersPreprocessor( nlp_model.model_dir, first_sequence='premise', second_sequence=None, @@ -95,7 +95,7 @@ class MsDatasetTest(unittest.TestCase): tf.compat.v1.enable_eager_execution() model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny' nlp_model = Model.from_pretrained(model_id) - preprocessor = SequenceClassificationPreprocessor( + preprocessor = TextClassificationTransformersPreprocessor( nlp_model.model_dir, first_sequence='premise', second_sequence=None) diff --git a/tests/pipelines/test_addr_similarity.py b/tests/pipelines/test_addr_similarity.py index 57c47b09..8c1f93c9 100644 --- a/tests/pipelines/test_addr_similarity.py +++ b/tests/pipelines/test_addr_similarity.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from 
modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -22,7 +22,8 @@ class AddrSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - preprocessor = SequenceClassificationPreprocessor(model.model_dir) + preprocessor = TextClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.text_classification, diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py index 549d2cb3..9ed5cd2b 100644 --- a/tests/pipelines/test_deberta_tasks.py +++ b/tests/pipelines/test_deberta_tasks.py @@ -8,7 +8,7 @@ from modelscope.models import Model from modelscope.models.nlp import DebertaV2ForMaskedLM from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import NLPPreprocessor +from modelscope.preprocessors import FillMaskTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -22,7 +22,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id_deberta) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = DebertaV2ForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -38,7 +38,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): # sbert print(self.model_id_deberta) model = Model.from_pretrained(self.model_id_deberta) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( task=Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py index 2f66f516..20c21755 100644 --- a/tests/pipelines/test_faq_question_answering.py +++ b/tests/pipelines/test_faq_question_answering.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForFaqQuestionAnswering from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FaqQuestionAnsweringPipeline -from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor +from modelscope.preprocessors import \ + FaqQuestionAnsweringTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -47,7 +48,7 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained( + preprocessor = FaqQuestionAnsweringTransformersPreprocessor.from_pretrained( 
cache_path) model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) pipeline_ins = FaqQuestionAnsweringPipeline( @@ -58,7 +59,8 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - preprocessor = FaqQuestionAnsweringPreprocessor(model.model_dir) + preprocessor = FaqQuestionAnsweringTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.faq_question_answering, model=model, diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py index 39291e76..6bad602a 100644 --- a/tests/pipelines/test_feature_extraction.py +++ b/tests/pipelines/test_feature_extraction.py @@ -9,7 +9,7 @@ from modelscope.models.nlp import FeatureExtractionModel from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FeatureExtractionPipeline -from modelscope.preprocessors import NLPPreprocessor +from modelscope.preprocessors import FillMaskTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -27,7 +27,7 @@ class FeatureExtractionTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = NLPPreprocessor(cache_path, padding=False) + tokenizer = FillMaskTransformersPreprocessor(cache_path, padding=False) model = FeatureExtractionModel.from_pretrained(self.model_id) pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -43,7 +43,8 @@ class FeatureExtractionTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = NLPPreprocessor(model.model_dir, padding=False) + tokenizer = FillMaskTransformersPreprocessor( + model.model_dir, padding=False) pipeline_ins = pipeline( task=Tasks.feature_extraction, model=model, preprocessor=tokenizer) result = pipeline_ins(input=self.sentence1) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 64833026..bc244826 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -8,7 +8,7 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForMaskedLM, VecoForMaskedLM from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import NLPPreprocessor +from modelscope.preprocessors import FillMaskTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -52,7 +52,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # sbert for language in ['zh']: model_dir = snapshot_download(self.model_id_sbert[language]) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = SbertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -67,7 +67,7 @@ class 
FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model_dir = snapshot_download(self.model_id_veco) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -84,7 +84,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # bert language = 'zh' model_dir = snapshot_download(self.model_id_bert) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = Model.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -102,7 +102,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): for language in ['zh']: print(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language]) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) @@ -118,7 +118,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model = Model.from_pretrained(self.model_id_veco) - preprocessor = NLPPreprocessor( + preprocessor = FillMaskTransformersPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_multilingual_named_entity_recognition.py b/tests/pipelines/test_multilingual_named_entity_recognition.py index cb2b32d6..5ed019d9 100644 --- a/tests/pipelines/test_multilingual_named_entity_recognition.py +++ b/tests/pipelines/test_multilingual_named_entity_recognition.py @@ -6,8 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import (NamedEntityRecognitionThaiPipeline, - NamedEntityRecognitionVietPipeline) +from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline from modelscope.preprocessors import NERPreprocessorThai, NERPreprocessorViet from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -36,7 +35,7 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, tokenizer = NERPreprocessorThai(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) - pipeline1 = NamedEntityRecognitionThaiPipeline( + pipeline1 = NamedEntityRecognitionPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.named_entity_recognition, @@ -76,7 +75,7 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, tokenizer = NERPreprocessorViet(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) - pipeline1 = NamedEntityRecognitionVietPipeline( + pipeline1 = NamedEntityRecognitionPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.named_entity_recognition, @@ -103,6 +102,30 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, task=Tasks.named_entity_recognition, model=self.viet_tcrf_model_id) print(pipeline_ins(input=self.viet_sentence)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_tcrf_with_model_name_viet_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, 
model=self.viet_tcrf_model_id) + print( + pipeline_ins( + input=[ + self.viet_sentence, self.viet_sentence[:10], + self.viet_sentence[5:] + ], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_tcrf_with_model_name_viet_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.viet_tcrf_model_id, + padding=False) + print( + pipeline_ins(input=[ + self.viet_sentence, self.viet_sentence[:10], + self.viet_sentence[5:] + ])) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_multilingual_word_segmentation.py b/tests/pipelines/test_multilingual_word_segmentation.py index 25b4b241..da54fe02 100644 --- a/tests/pipelines/test_multilingual_word_segmentation.py +++ b/tests/pipelines/test_multilingual_word_segmentation.py @@ -48,6 +48,23 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.word_segmentation, model=self.model_id) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=self.model_id) + print( + pipeline_ins( + input=[self.sentence, self.sentence[:10], self.sentence[6:]], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=self.model_id, padding=False) + print( + pipeline_ins( + input=[self.sentence, self.sentence[:10], self.sentence[6:]])) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 3317c604..c4bcdfec 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -7,7 +7,8 @@ from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline -from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.preprocessors import \ + TokenClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -24,15 +25,19 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' addr_model_id = 'damo/nlp_structbert_address-parsing_chinese_base' + lstm_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-generic' sentence = '这与温岭市新河镇的一个神秘的传说有关。' sentence_en = 'pizza shovel' sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。' addr = '浙江省杭州市余杭区文一西路969号亲橙里' + addr1 = '浙江省西湖区灵隐隧道' + addr2 = '内蒙古自治区巴彦淖尔市' + ecom = '欧美单 秋季女装时尚百搭休闲修身 亚麻混纺短款 外套西装' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): cache_path = snapshot_download(self.tcrf_model_id) - tokenizer = TokenClassificationPreprocessor(cache_path) + 
tokenizer = TokenClassificationTransformersPreprocessor(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -49,7 +54,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_by_direct_model_download(self): cache_path = snapshot_download(self.lcrf_model_id) - tokenizer = TokenClassificationPreprocessor(cache_path) + tokenizer = TokenClassificationTransformersPreprocessor(cache_path) model = LSTMCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -66,7 +71,8 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_tcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.tcrf_model_id) - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -77,7 +83,8 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): def test_run_addrst_with_model_from_modelhub(self): model = Model.from_pretrained( 'damo/nlp_structbert_address-parsing_chinese_base') - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -90,10 +97,27 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.addr_model_id) print(pipeline_ins(input=self.addr)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_addrst_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.addr_model_id) + print( + pipeline_ins( + input=[self.addr, self.addr1, self.addr2], batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_addrst_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.addr_model_id, + padding=False) + print(pipeline_ins(input=[self.addr, self.addr1, self.addr2])) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.lcrf_model_id) - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -112,18 +136,87 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_lcrf_with_chinese_model_name(self): pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=self.chinese_model_id) print(pipeline_ins(input=self.sentence_zh)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lcrf_with_chinese_model_name_batch_iter(self): + pipeline_ins = 
pipeline( + task=Tasks.named_entity_recognition, + model=self.chinese_model_id, + padding=False) + print( + pipeline_ins(input=[ + self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:] + ])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lcrf_with_chinese_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.chinese_model_id) + print( + pipeline_ins( + input=[ + self.sentence_zh, self.sentence_zh[:20], + self.sentence_zh[10:] + ], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lstm_with_chinese_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.lstm_model_id) + print(pipeline_ins(input=self.sentence_zh)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lstm_with_chinese_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.lstm_model_id, + padding=False) + print( + pipeline_ins(input=[ + self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:] + ])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_lstm_with_chinese_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.lstm_model_id) + print( + pipeline_ins( + input=[ + self.sentence_zh, self.sentence_zh[:20], + self.sentence_zh[10:] + ], + batch_size=2)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_english_with_model_name(self): pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=self.english_model_id) print(pipeline_ins(input=self.sentence_en)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_english_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.english_model_id) + print( + pipeline_ins( + input=[self.ecom, self.sentence_zh, self.sentence], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_english_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.english_model_id, + padding=False) + print(pipeline_ins(input=[self.ecom, self.sentence_zh, self.sentence])) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.named_entity_recognition) diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 9e9fefea..94689e96 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -25,7 +25,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = 
snapshot_download(self.model_id) - tokenizer = SequenceClassificationPreprocessor(cache_path) + tokenizer = TextClassificationTransformersPreprocessor(cache_path) model = Model.from_pretrained(cache_path) pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) @@ -38,7 +38,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SequenceClassificationPreprocessor(model.model_dir) + tokenizer = TextClassificationTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py index 038a90f0..5e4b20dc 100644 --- a/tests/pipelines/test_part_of_speech.py +++ b/tests/pipelines/test_part_of_speech.py @@ -7,7 +7,8 @@ from modelscope.models import Model from modelscope.models.nlp import TokenClassificationModel from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TokenClassificationPipeline -from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.preprocessors import \ + TokenClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,7 +20,7 @@ class PartOfSpeechTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = TokenClassificationPreprocessor(cache_path) + tokenizer = TokenClassificationTransformersPreprocessor(cache_path) model = TokenClassificationModel.from_pretrained(cache_path) pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -32,7 +33,8 @@ class PartOfSpeechTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = TokenClassificationPreprocessor(model.model_dir) + tokenizer = TokenClassificationTransformersPreprocessor( + model.model_dir) pipeline_ins = pipeline( task=Tasks.part_of_speech, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence)) diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py index 561eaf21..b7bbe131 100644 --- a/tests/pipelines/test_relation_extraction.py +++ b/tests/pipelines/test_relation_extraction.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import InformationExtractionModel from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import InformationExtractionPipeline -from modelscope.preprocessors import RelationExtractionPreprocessor +from modelscope.preprocessors import RelationExtractionTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -23,7 +23,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = 
snapshot_download(self.model_id) - tokenizer = RelationExtractionPreprocessor(cache_path) + tokenizer = RelationExtractionTransformersPreprocessor(cache_path) model = InformationExtractionModel.from_pretrained(cache_path) pipeline1 = InformationExtractionPipeline( model, preprocessor=tokenizer) @@ -37,7 +37,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = RelationExtractionPreprocessor(model.model_dir) + tokenizer = RelationExtractionTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.relation_extraction, model=model, diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index e96724a8..4132f965 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -7,7 +7,7 @@ from modelscope.models import Model from modelscope.models.nlp import BertForSentenceEmbedding from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import SentenceEmbeddingPipeline -from modelscope.preprocessors import SentenceEmbeddingPreprocessor +from modelscope.preprocessors import SentenceEmbeddingTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -39,7 +39,7 @@ class SentenceEmbeddingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentenceEmbeddingPreprocessor(cache_path) + tokenizer = SentenceEmbeddingTransformersPreprocessor(cache_path) model = BertForSentenceEmbedding.from_pretrained(cache_path) pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -61,7 +61,7 @@ class SentenceEmbeddingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentenceEmbeddingPreprocessor(model.model_dir) + tokenizer = SentenceEmbeddingTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_embedding, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.inputs)) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 904caea3..486fadfa 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool @@ -26,7 +26,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = SequenceClassificationPreprocessor(cache_path) + tokenizer = 
TextClassificationTransformersPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( @@ -42,13 +42,35 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SequenceClassificationPreprocessor(model.model_dir) + tokenizer = TextClassificationTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch(self): + pipeline_ins = pipeline( + task=Tasks.sentence_similarity, model=self.model_id) + print( + pipeline_ins( + input=[(self.sentence1, self.sentence2), + (self.sentence1[:4], self.sentence2[5:]), + (self.sentence1[2:], self.sentence2[:8])], + batch_size=2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch_iter(self): + pipeline_ins = pipeline( + task=Tasks.sentence_similarity, model=self.model_id, padding=False) + print( + pipeline_ins(input=[( + self.sentence1, + self.sentence2), (self.sentence1[:4], self.sentence2[5:] + ), (self.sentence1[2:], + self.sentence2[:8])])) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 5c8d4e93..e0f823be 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -7,7 +7,7 @@ from modelscope.models.nlp.task_models.sequence_classification import \ SequenceClassificationModel from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -25,7 +25,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = SequenceClassificationPreprocessor(cache_path) + tokenizer = TextClassificationTransformersPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( self.model_id, num_labels=2) pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) @@ -39,7 +39,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SequenceClassificationPreprocessor(model.model_dir) + tokenizer = TextClassificationTransformersPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.text_classification, model=model, diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index 
d90263c4..6ce6a9b3 100644
--- a/tests/pipelines/test_text2text_generation.py
+++ b/tests/pipelines/test_text2text_generation.py
@@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import T5ForConditionalGeneration
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import Text2TextGenerationPipeline
-from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.pipelines.nlp import TextGenerationT5Pipeline
+from modelscope.preprocessors import TextGenerationT5Preprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -24,8 +24,8 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_T5(self):
         cache_path = snapshot_download(self.model_id_generate)
         model = T5ForConditionalGeneration.from_pretrained(cache_path)
-        preprocessor = Text2TextGenerationPreprocessor(cache_path)
-        pipeline1 = Text2TextGenerationPipeline(model, preprocessor)
+        preprocessor = TextGenerationT5Preprocessor(cache_path)
+        pipeline1 = TextGenerationT5Pipeline(model, preprocessor)
         pipeline2 = pipeline(
             Tasks.text2text_generation, model=model, preprocessor=preprocessor)
         print(
@@ -35,7 +35,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_pipeline_with_model_instance(self):
         model = Model.from_pretrained(self.model_id_translate)
-        preprocessor = Text2TextGenerationPreprocessor(model.model_dir)
+        preprocessor = TextGenerationT5Preprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.text2text_generation,
             model=model,
@@ -48,6 +48,28 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
             task=Tasks.text2text_generation, model=self.model_id_translate)
         print(pipeline_ins(self.input_translate))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline_with_model_id_batch(self):
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation, model=self.model_id_translate)
+        inputs = [
+            self.input_translate, self.input_translate[:8],
+            self.input_translate[8:]
+        ]
+        print(pipeline_ins(inputs, batch_size=2))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline_with_model_id_batch_iter(self):
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation,
+            model=self.model_id_translate,
+            padding=False)
+        print(
+            pipeline_ins([
+                self.input_translate, self.input_translate[:8],
+                self.input_translate[8:]
+            ]))
+
     @unittest.skip(
         'only for test cases, there is no default official model yet')
     def test_run_pipeline_without_model_id(self):
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index 5b38e116..d07ddbb8 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -5,7 +5,7 @@ from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextClassificationPipeline
-from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.preprocessors import TextClassificationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -41,7 +41,7 @@ class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skip('nlp model does not support tensor input, skipped')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        preprocessor = SequenceClassificationPreprocessor(
+        preprocessor = TextClassificationTransformersPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         pipeline_ins = pipeline(
             task=Tasks.text_classification,
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index ddb77eeb..1ce6695f 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -6,7 +6,7 @@ from modelscope.models import Model
 from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextGenerationPipeline
-from modelscope.preprocessors import TextGenerationPreprocessor
+from modelscope.preprocessors import TextGenerationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
@@ -44,7 +44,7 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def run_pipeline_with_model_instance(self, model_id, input):
         model = Model.from_pretrained(model_id)
-        preprocessor = TextGenerationPreprocessor(
+        preprocessor = TextGenerationTransformersPreprocessor(
             model.model_dir,
             model.tokenizer,
             first_sequence='sentence',
@@ -53,15 +53,38 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
             task=Tasks.text_generation, model=model, preprocessor=preprocessor)
         print(pipeline_ins(input))
 
-    def run_pipeline_with_model_id(self, model_id, input):
-        pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
-        print(pipeline_ins(input))
+    def run_pipeline_with_model_id(self,
+                                   model_id,
+                                   input,
+                                   init_kwargs={},
+                                   run_kwargs={}):
+        pipeline_ins = pipeline(
+            task=Tasks.text_generation, model=model_id, **init_kwargs)
+        print(pipeline_ins(input, **run_kwargs))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_palm_zh_base_with_model_name(self):
         self.run_pipeline_with_model_id(self.palm_model_id_zh_base,
                                         self.palm_input_zh)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_palm_zh_base_with_model_name_batch(self):
+        self.run_pipeline_with_model_id(
+            self.palm_model_id_zh_base, [
+                self.palm_input_zh, self.palm_input_zh[:10],
+                self.palm_input_zh[10:]
+            ],
+            run_kwargs={'batch_size': 2})
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_palm_zh_base_with_model_name_batch_iter(self):
+        self.run_pipeline_with_model_id(
+            self.palm_model_id_zh_base, [
+                self.palm_input_zh, self.palm_input_zh[:10],
+                self.palm_input_zh[10:]
+            ],
+            init_kwargs={'padding': False})
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_palm_en_with_model_name(self):
         self.run_pipeline_with_model_id(self.palm_model_id_en,
@@ -144,11 +167,8 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
                                      self.palm_input_en)):
             cache_path = snapshot_download(model_id)
             model = PalmForTextGeneration.from_pretrained(cache_path)
-            preprocessor = TextGenerationPreprocessor(
-                cache_path,
-                model.tokenizer,
-                first_sequence='sentence',
-                second_sequence=None)
+            preprocessor = TextGenerationTransformersPreprocessor(
+                cache_path, first_sequence='sentence', second_sequence=None)
             pipeline1 = TextGenerationPipeline(model, preprocessor)
             pipeline2 = pipeline(
                 Tasks.text_generation, model=model, preprocessor=preprocessor)
@@ -160,7 +180,7 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_gpt3(self):
         cache_path = snapshot_download(self.gpt3_base_model_id)
         model = GPT3ForTextGeneration(cache_path)
-        preprocessor = TextGenerationPreprocessor(
+        preprocessor = TextGenerationTransformersPreprocessor(
             cache_path,
             model.tokenizer,
             first_sequence='sentence',
@@ -175,7 +195,10 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.text_generation)
-        print(pipeline_ins(self.palm_input_zh))
+        print(
+            pipeline_ins(
+                [self.palm_input_zh, self.palm_input_zh, self.palm_input_zh],
+                batch_size=2))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_bloom(self):
diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py
index 0b43e8b4..01f1887f 100644
--- a/tests/pipelines/test_text_ranking.py
+++ b/tests/pipelines/test_text_ranking.py
@@ -7,7 +7,7 @@ from modelscope.models import Model
 from modelscope.models.nlp import BertForTextRanking
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextRankingPipeline
-from modelscope.preprocessors import TextRankingPreprocessor
+from modelscope.preprocessors import TextRankingTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
 
@@ -32,7 +32,7 @@ class TextRankingTest(unittest.TestCase):
     def test_run_by_direct_model_download(self):
         for model_id in self.models:
             cache_path = snapshot_download(model_id)
-            tokenizer = TextRankingPreprocessor(cache_path)
+            tokenizer = TextRankingTransformersPreprocessor(cache_path)
             model = BertForTextRanking.from_pretrained(cache_path)
             pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer)
             pipeline2 = pipeline(
@@ -46,7 +46,7 @@ class TextRankingTest(unittest.TestCase):
     def test_run_with_model_from_modelhub(self):
         for model_id in self.models:
             model = Model.from_pretrained(model_id)
-            tokenizer = TextRankingPreprocessor(model.model_dir)
+            tokenizer = TextRankingTransformersPreprocessor(model.model_dir)
             pipeline_ins = pipeline(
                 task=Tasks.text_ranking, model=model, preprocessor=tokenizer)
             print(pipeline_ins(input=self.inputs))
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index 6969c0e6..ffaf0155 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -6,7 +6,8 @@ from modelscope.models import Model
 from modelscope.models.nlp import SbertForTokenClassification
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationPipeline
-from modelscope.preprocessors import TokenClassificationPreprocessor
+from modelscope.preprocessors import \
+    TokenClassificationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
@@ -26,7 +27,7 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = TokenClassificationPreprocessor(cache_path)
+        tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
         model = SbertForTokenClassification.from_pretrained(cache_path)
         pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
@@ -38,7 +39,8 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = TokenClassificationPreprocessor(model.model_dir)
+        tokenizer = TokenClassificationTransformersPreprocessor(
+            model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence))
@@ -52,11 +54,24 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
                 'sbert_ws_zh',
                 compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(pipeline_ins(input=self.sentence))
-        with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model,
-                'sbert_ws_en',
-                compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
-            print(pipeline_ins(input=self.sentence_eng))
+        print(pipeline_ins(input=self.sentence_eng))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_batch(self):
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=self.model_id)
+        print(
+            pipeline_ins(
+                input=[self.sentence, self.sentence[:5], self.sentence[5:]],
+                batch_size=2))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_batch_iter(self):
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=self.model_id, padding=False)
+        print(
+            pipeline_ins(
+                input=[self.sentence, self.sentence[:5], self.sentence[5:]]))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index 00789707..f9a52b42 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -6,7 +6,8 @@ from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ZeroShotClassificationPipeline
-from modelscope.preprocessors import ZeroShotClassificationPreprocessor
+from modelscope.preprocessors import \
+    ZeroShotClassificationTransformersPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
@@ -28,7 +29,7 @@ class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = ZeroShotClassificationPreprocessor(cache_path)
+        tokenizer = ZeroShotClassificationTransformersPreprocessor(cache_path)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
         pipeline1 = ZeroShotClassificationPipeline(
             model, preprocessor=tokenizer)
@@ -53,7 +54,8 @@
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = ZeroShotClassificationPreprocessor(model.model_dir)
+        tokenizer = ZeroShotClassificationTransformersPreprocessor(
+            model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.zero_shot_classification,
             model=model,
diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py
index f9f4d93f..9a31cc91 100644
--- a/tests/preprocessors/test_nlp.py
+++ b/tests/preprocessors/test_nlp.py
@@ -32,81 +32,74 @@ class NLPPreprocessorTest(unittest.TestCase):
             output['attention_mask'],
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 
-    def test_token_classification_tokenize(self):
-        with self.subTest(tokenizer_type='bert'):
-            cfg = dict(
-                type='token-cls-tokenizer',
-                model_dir='bert-base-cased',
-                label2id={
-                    'O': 0,
-                    'B': 1,
-                    'I': 2
-                })
-            preprocessor = build_preprocessor(cfg, Fields.nlp)
-            input = 'Do not meddle in the affairs of wizards, ' \
-                    'for they are subtle and quick to anger.'
-            output = preprocessor(input)
-            self.assertTrue(InputFields.text in output)
-            self.assertEqual(output['input_ids'].tolist()[0], [
-                101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678,
-                1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470,
-                119, 102
-            ])
-            self.assertEqual(output['attention_mask'].tolist()[0], [
-                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                1
-            ])
-            self.assertEqual(output['label_mask'].tolist()[0], [
-                False, True, True, True, False, True, True, True, True, True,
-                False, True, True, True, True, True, True, True, True, True,
-                True, False
-            ])
-            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
-                                                        (7, 13), (14, 16),
-                                                        (17, 20), (21, 28),
-                                                        (29, 31), (32, 39),
-                                                        (39, 40), (41, 44),
-                                                        (45, 49), (50, 53),
-                                                        (54, 60), (61, 64),
-                                                        (65, 70), (71, 73),
-                                                        (74, 79), (79, 80)])
+    def test_token_classification_tokenize_bert(self):
+        cfg = dict(
+            type='token-cls-tokenizer',
+            padding=False,
+            label_all_tokens=False,
+            model_dir='bert-base-cased',
+            label2id={
+                'O': 0,
+                'B': 1,
+                'I': 2
+            })
+        preprocessor = build_preprocessor(cfg, Fields.nlp)
+        input = 'Do not meddle in the affairs of wizards, ' \
+                'for they are subtle and quick to anger.'
+        output = preprocessor(input)
+        self.assertTrue(InputFields.text in output)
+        self.assertEqual(output['input_ids'].tolist()[0], [
+            101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116,
+            117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102
+        ])
+        self.assertEqual(
+            output['attention_mask'].tolist()[0],
+            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+        self.assertEqual(output['label_mask'].tolist()[0], [
+            False, True, True, True, False, True, True, True, True, True,
+            False, True, True, True, True, True, True, True, True, True, True,
+            False
+        ])
+        self.assertEqual(
+            output['offset_mapping'].tolist()[0],
+            [[0, 2], [3, 6], [7, 13], [14, 16], [17, 20], [21, 28], [29, 31],
+             [32, 39], [39, 40], [41, 44], [45, 49], [50, 53], [54, 60],
+             [61, 64], [65, 70], [71, 73], [74, 79], [79, 80]])
 
-        with self.subTest(tokenizer_type='roberta'):
-            cfg = dict(
-                type='token-cls-tokenizer',
-                model_dir='xlm-roberta-base',
-                label2id={
-                    'O': 0,
-                    'B': 1,
-                    'I': 2
-                })
-            preprocessor = build_preprocessor(cfg, Fields.nlp)
-            input = 'Do not meddle in the affairs of wizards, ' \
-                    'for they are subtle and quick to anger.'
-            output = preprocessor(input)
-            self.assertTrue(InputFields.text in output)
-            self.assertEqual(output['input_ids'].tolist()[0], [
-                0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239,
-                99397, 4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56,
-                5, 2
-            ])
-            self.assertEqual(output['attention_mask'].tolist()[0], [
-                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                1, 1, 1, 1, 1
-            ])
-            self.assertEqual(output['label_mask'].tolist()[0], [
-                False, True, True, True, False, True, True, True, False, True,
-                True, False, False, False, True, True, True, True, False, True,
-                True, True, True, False, False, False
-            ])
-            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
-                                                        (7, 13), (14, 16),
-                                                        (17, 20), (21, 28),
-                                                        (29, 31), (32, 40),
-                                                        (41, 44), (45, 49),
-                                                        (50, 53), (54, 60),
-                                                        (61, 64), (65, 70),
-                                                        (71, 73), (74, 80)])
+    def test_token_classification_tokenize_roberta(self):
+        cfg = dict(
+            type='token-cls-tokenizer',
+            padding=False,
+            label_all_tokens=False,
+            model_dir='xlm-roberta-base',
+            label2id={
+                'O': 0,
+                'B': 1,
+                'I': 2
+            })
+        preprocessor = build_preprocessor(cfg, Fields.nlp)
+        input = 'Do not meddle in the affairs of wizards, ' \
+                'for they are subtle and quick to anger.'
+        output = preprocessor(input)
+        self.assertTrue(InputFields.text in output)
+        self.assertEqual(output['input_ids'].tolist()[0], [
+            0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239, 99397,
+            4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56, 5, 2
+        ])
+        self.assertEqual(output['attention_mask'].tolist()[0], [
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1
+        ])
+        self.assertEqual(output['label_mask'].tolist()[0], [
+            False, True, True, True, False, True, True, True, False, True,
+            True, False, False, False, True, True, True, True, False, True,
+            True, True, True, False, False, False
+        ])
+        self.assertEqual(
+            output['offset_mapping'].tolist()[0],
+            [[0, 2], [3, 6], [7, 13], [14, 16], [17, 20], [21, 28], [29, 31],
+             [32, 40], [41, 44], [45, 49], [50, 53], [54, 60], [61, 64],
+             [65, 70], [71, 73], [74, 80]])
 
 
 if __name__ == '__main__':
diff --git a/tests/run.py b/tests/run.py
index 1b252756..e7fae5a2 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -555,7 +555,7 @@ if __name__ == '__main__':
         nargs='*',
         help='Run specified test suites(test suite files list split by space)')
     args = parser.parse_args()
-    set_test_level(args.level)
+    set_test_level(2)
    os.environ['REGRESSION_BASELINE'] = '1'
    logger.info(f'TEST LEVEL: {test_level()}')
    if not args.disable_profile:
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index 061d37d3..f5632b63 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -340,21 +340,16 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         User can train a custom dataset by modifying this piece of code
         and comment the @unittest.skip.
         """
-        from datasets import load_dataset
         langs = ['en']
         langs_eval = ['en']
         train_datasets = []
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = False
         for lang in langs:
             train_datasets.append(
-                load_dataset('xnli', lang, split='train', download_config=dc))
+                MsDataset.load('xnli', subset_name=lang, split='train'))
         eval_datasets = []
         for lang in langs_eval:
             eval_datasets.append(
-                load_dataset(
-                    'xnli', lang, split='validation', download_config=dc))
+                MsDataset.load('xnli', subset_name=lang, split='validation'))
         train_len = sum([len(dataset) for dataset in train_datasets])
         labels = ['0', '1', '2']
 
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index a92cee7b..a1480d38 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -91,8 +91,13 @@ class TestFinetuneTokenClassification(unittest.TestCase):
                 'label': 'labels',
             }
         }
-        cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
+        cfg['preprocessor'] = {
+            'type': 'token-cls-tokenizer',
+            'padding': 'max_length'
+        }
         cfg.train.max_epochs = 2
+        cfg.train.dataloader.workers_per_gpu = 0
+        cfg.evaluation.dataloader.workers_per_gpu = 0
         cfg.train.lr_scheduler = {
             'type': 'LinearLR',
             'start_factor': 1.0,
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index f1d9e414..5e9850a7 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -119,6 +119,85 @@ class TestTrainerWithNlp(unittest.TestCase):
             checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
         self.assertTrue(Metrics.accuracy in eval_results)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_save_best_ckpt(self):
+
+        class MockTrainer(EpochBasedTrainer):
+
+            def evaluation_loop(self, data_loader, metric_classes):
+                return {'accuracy': 10 + (-1)**self.iter * 1 * self.iter}
+
+        from modelscope.utils.regress_test_utils import MsRegressTool
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+        cfg: Config = read_config(model_id)
+        cfg.train.max_epochs = 10
+        cfg.preprocessor.first_sequence = 'sentence1'
+        cfg.preprocessor.second_sequence = 'sentence2'
+        cfg.preprocessor.label = 'label'
+        cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1}
+        cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1}
+        cfg.train.dataloader.batch_size_per_gpu = 2
+        cfg.train.hooks = [{
+            'type': 'BestCkptSaverHook',
+            'interval': 1,
+            'by_epoch': False,
+            'metric_key': 'accuracy',
+            'max_checkpoint_num': 4,
+        }, {
+            'type': 'TextLoggerHook',
+            'interval': 1
+        }, {
+            'type': 'IterTimerHook'
+        }, {
+            'type': 'EvaluationHook',
+            'by_epoch': False,
+            'interval': 1
+        }]
+        cfg.train.work_dir = self.tmp_dir
+        cfg_file = os.path.join(self.tmp_dir, 'config.json')
+        cfg.dump(cfg_file)
+        dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
+        dataset = dataset.to_hf_dataset().select(range(4))
+        kwargs = dict(
+            model=model_id,
+            train_dataset=dataset,
+            eval_dataset=dataset,
+            cfg_file=cfg_file)
+
+        regress_tool = MsRegressTool(baseline=True)
+        trainer: MockTrainer = MockTrainer(**kwargs)
+
+        def lazy_stop_callback():
+            from modelscope.trainers.hooks.hook import Hook, Priority
+
+            class EarlyStopHook(Hook):
+                PRIORITY = Priority.VERY_LOW
+
+                def after_iter(self, trainer):
+                    if trainer.iter == 10:
+                        raise MsRegressTool.EarlyStopError('Test finished.')
+
+            if 'EarlyStopHook' not in [
+                    hook.__class__.__name__ for hook in trainer.hooks
+            ]:
+                trainer.register_hook(EarlyStopHook())
+
+        with regress_tool.monitor_ms_train(
+                trainer,
+                'trainer_continue_train',
+                level='strict',
+                lazy_stop_callback=lazy_stop_callback):
+            trainer.train()
+
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in [22, 24, 26, 28]:
+            self.assertTrue(
+                any([
+                    f'accuracy{i}.pth' in filename
+                    for filename in results_files
+                ]))
+
     @unittest.skip('skip for now before test is re-configured')
     def test_trainer_with_configured_datasets(self):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py
index 0243053e..2db61637 100644
--- a/tests/utils/test_ast.py
+++ b/tests/utils/test_ast.py
@@ -40,12 +40,18 @@ class AstScaningTest(unittest.TestCase):
         self.assertIsInstance(imports, dict)
         self.assertIsInstance(from_imports, dict)
         self.assertIsInstance(decorators, list)
-        self.assertListEqual(list(set(imports.keys()) - set(['torch'])), [])
+        self.assertListEqual(
+            list(set(imports.keys()) - set(['torch', 'os'])), [])
         self.assertEqual(len(from_imports.keys()), 10)
         self.assertTrue(from_imports['modelscope.metainfo'] is not None)
         self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines'])
-        self.assertEqual(decorators,
-                         [('PIPELINES', 'text-generation', 'text-generation')])
+        self.assertEqual(
+            decorators,
+            [('PIPELINES', 'text-generation', 'text-generation'),
+             ('PIPELINES', 'text2text-generation', 'translation_en_to_de'),
+             ('PIPELINES', 'text2text-generation', 'translation_en_to_ro'),
+             ('PIPELINES', 'text2text-generation', 'translation_en_to_fr'),
+             ('PIPELINES', 'text2text-generation', 'text2text-generation')])
 
     def test_files_scaning_method(self):
         fileScaner = FilesAstScaning()