From 605cd7f44a19b083b8fe4a7ca78d749ab8b52574 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Tue, 25 Oct 2022 12:26:25 +0800
Subject: [PATCH] [to #42322933] NLP 1030 Refactor

Features:
1. Refactor the directory structure of NLP models: every model file now lives in either its model folder or the task_models folder.
2. Refactor all comments to Google style.
3. Add detailed comments to important tasks and NLP models, describing each model along with its preprocessor and trainer.
4. Model exporting now supports a direct call to TorchModelExporter (no need to derive from it).
5. Refactor the model save_pretrained method so it runs standalone (independent of the trainer).
6. Remove the Model type check in the pipeline base class, so externally registered models can run in our pipelines.
7. The NLP trainer now has an NLPTrainingArguments class: users can pass arguments into the dataclass and use it as a normal cfg_modify_fn, which simplifies modifying the cfg.
8. Merge the BACKBONES and MODELS registries, so users can get a backbone with a Model.from_pretrained call.
9. Model.from_pretrained now supports a task argument, so users can load a backbone with a specific task class.
10. Support the Preprocessor.from_pretrained method.
11. Add standard output classes for important NLP tasks, decoupling several pipelines from their models: model outputs are always tensors, and the pipelines take care of the conversion to numpy and the subsequent post-processing.
12. Split the NLP preprocessor files to make the directory structure clearer.

Bug fixes:
1. Fix a bug where lr_scheduler could be called before the optimizer's step.
2. Fix a bug where calling a Pipeline directly (not built via pipeline(xxx)) threw an error.
3. Fix a bug where the trainer did not use the correct TaskDataset class.
4. Fix a bug where the internal dataset loading in the trainer class threw an error.
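A minimal sketch of the direct exporter call described in feature 4 above. The model id and output directory are placeholders, and the import path follows this patch's directory layout; this is an illustration of the intended usage, not code from the patch:

```python
from modelscope.exporters import TorchModelExporter
from modelscope.models import Model

# Placeholder model id; any exportable NLP checkpoint would do.
model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')

# The exporter is resolved from the checkpoint's configuration.json; deriving
# a custom exporter class is no longer required. export_onnx now also accepts
# `model`, `dummy_inputs`, `inputs` and `outputs` keyword overrides.
exporter = TorchModelExporter.from_model(model)

# Returns a dict mapping output names to generated file paths,
# e.g. {'model': '/tmp/structbert_onnx/model.onnx'}.
files = exporter.export_onnx(output_dir='/tmp/structbert_onnx', opset=13)
print(files)
```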
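A sketch of the task-aware loading and standalone saving from features 5, 8 and 9. The backbone id and the num_labels value are placeholders used only for illustration:

```python
from modelscope.models import Model
from modelscope.utils.constant import Tasks

# Placeholder backbone id. Passing `task` overrides the task recorded in
# configuration.json, so the backbone weights are loaded through a
# task-specific model class; extra kwargs such as num_labels are forwarded
# into the model config.
model = Model.from_pretrained(
    'damo/nlp_structbert_backbone_base_std',
    task=Tasks.text_classification,
    num_labels=2)

# save_pretrained no longer depends on the trainer; when no explicit config
# is given it falls back to the cfg attached to the model at load time.
model.save_pretrained('./structbert_for_text_classification')
```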
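Feature 7 lets an NLPTrainingArguments instance stand in for a hand-written cfg_modify_fn. The sketch below is an assumption about the intended call pattern: the field names (max_epochs, lr), the import path, the model id and the dataset are all illustrative and not taken from this patch:

```python
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.trainers.nlp_trainer import NLPTrainingArguments  # assumed import path

# Hypothetical fields: the dataclass maps whatever values it holds onto the
# trainer configuration, the same way a cfg_modify_fn callback would.
training_args = NLPTrainingArguments(max_epochs=3, lr=2e-5)

train_dataset = MsDataset.load('clue', subset_name='afqmc', split='train')  # illustrative dataset

trainer = build_trainer(
    default_args=dict(
        model='damo/nlp_structbert_backbone_base_std',  # placeholder model id
        train_dataset=train_dataset,
        cfg_modify_fn=training_args))  # the arguments object is used as cfg_modify_fn
trainer.train()
```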
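Feature 10 adds Preprocessor.from_pretrained. The sketch below assumes it mirrors Model.from_pretrained and resolves the concrete preprocessor from the checkpoint's configuration.json; the model id is a placeholder:

```python
from modelscope.preprocessors import Preprocessor

# Placeholder model id; the concrete preprocessor class and its settings are
# assumed to be read from the checkpoint's configuration.json.
preprocessor = Preprocessor.from_pretrained(
    'damo/nlp_structbert_sentiment-classification_chinese-base')

# The accepted input format depends on the concrete preprocessor; a single
# sentence is shown here for illustration.
inputs = preprocessor('这家餐厅的味道非常好')
```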
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10490585 --- data/test/regression/fill_mask_sbert_zh.bin | 4 +- data/test/regression/fill_mask_veco_en.bin | 4 +- data/test/regression/fill_mask_veco_zh.bin | 4 +- data/test/regression/sbert-base-tnews.bin | 4 +- data/test/regression/sbert_nli.bin | 4 +- data/test/regression/sbert_sen_sim.bin | 4 +- data/test/regression/sbert_ws_en.bin | 4 +- data/test/regression/sbert_ws_zh.bin | 4 +- data/test/regression/sbert_zero_shot.bin | 4 +- modelscope/exporters/base.py | 22 +- ...rt_for_sequence_classification_exporter.py | 13 +- modelscope/exporters/torch_model_exporter.py | 115 +- modelscope/metainfo.py | 1 - modelscope/metrics/base.py | 3 - .../metrics/token_classification_metric.py | 17 +- modelscope/models/base/base_model.py | 26 +- modelscope/models/builder.py | 17 +- modelscope/models/nlp/T5/__init__.py | 8 +- .../nlp/T5/{modeling_t5.py => backbone.py} | 1014 +++------ .../{configuration_t5.py => configuration.py} | 1 + .../models/nlp/T5/t5_for_text_generation.py | 56 - .../models/nlp/T5/text2text_generation.py | 455 ++++ modelscope/models/nlp/__init__.py | 125 +- modelscope/models/nlp/backbones/bert.py | 7 - modelscope/models/nlp/backbones/structbert.py | 52 - modelscope/models/nlp/bart/__init__.py | 2 + .../text_error_correction.py} | 0 modelscope/models/nlp/bert/__init__.py | 48 +- modelscope/models/nlp/bert/backbone.py | 952 ++++++++ ...configuration_bert.py => configuration.py} | 1 + .../document_segmentation.py} | 1 + modelscope/models/nlp/bert/fill_mask.py | 299 +++ modelscope/models/nlp/bert/modeling_bert.py | 1961 ---------------- .../models/nlp/bert/sentence_embedding.py | 113 + .../models/nlp/bert/text_classification.py | 208 ++ modelscope/models/nlp/bert/text_ranking.py | 89 + .../models/nlp/bert/token_classification.py | 225 ++ modelscope/models/nlp/csanmt/__init__.py | 2 + .../translation.py} | 0 modelscope/models/nlp/deberta_v2/__init__.py | 40 +- .../{modeling_deberta_v2.py => backbone.py} | 729 +----- ...uration_deberta_v2.py => configuration.py} | 2 - modelscope/models/nlp/deberta_v2/fill_mask.py | 230 ++ ...nization_deberta_v2.py => tokenization.py} | 0 ...eberta_v2_fast.py => tokenization_fast.py} | 2 +- modelscope/models/nlp/gpt3/__init__.py | 16 +- .../gpt3/{modeling_gpt3.py => backbone.py} | 2 +- ...configuration_gpt3.py => configuration.py} | 0 ..._text_generation.py => text_generation.py} | 0 .../gpt3/{tokenizer_gpt3.py => tokenizer.py} | 0 .../nlp/{backbones => gpt_neo}/__init__.py | 6 +- .../gpt_neo.py => gpt_neo/backbone.py} | 5 +- .../nlp/heads/token_classification_head.py | 4 +- modelscope/models/nlp/masked_language.py | 164 -- modelscope/models/nlp/palm_v2/__init__.py | 12 +- .../palm_v2/{modeling_palm.py => backbone.py} | 2 +- ...configuration_palm.py => configuration.py} | 0 ..._text_generation.py => text_generation.py} | 0 modelscope/models/nlp/plug/__init__.py | 8 +- .../plug/{modeling_plug.py => backbone.py} | 2 +- ...configuration_plug.py => configuration.py} | 0 .../models/nlp/plug/distributed_plug.py | 3 +- modelscope/models/nlp/ponet/__init__.py | 16 +- .../ponet/{modeling_ponet.py => backbone.py} | 857 +------ ...onfiguration_ponet.py => configuration.py} | 6 +- modelscope/models/nlp/ponet/fill_mask.py | 252 +++ ...{tokenization_ponet.py => tokenization.py} | 1 + .../models/nlp/ponet_for_masked_language.py | 53 - modelscope/models/nlp/sentence_embedding.py | 74 - 
.../models/nlp/sequence_classification.py | 287 --- modelscope/models/nlp/space/__init__.py | 20 +- ...onfiguration_space.py => configuration.py} | 0 ...diction.py => dialog_intent_prediction.py} | 23 +- ..._dialog_modeling.py => dialog_modeling.py} | 22 +- ...ling_space.py => dialog_state_tracking.py} | 224 +- modelscope/models/nlp/space/model/__init__.py | 4 +- .../models/nlp/space/model/generator.py | 19 +- .../models/nlp/space/model/model_base.py | 11 +- .../nlp/space/model/tokenization_space.py | 2 +- .../nlp/space/model/unified_transformer.py | 22 +- .../nlp/space/modules/transformer_block.py | 17 +- .../space/space_for_dialog_state_tracking.py | 101 - modelscope/models/nlp/space_T_cn/__init__.py | 21 + .../{modeling_space_T_cn.py => backbone.py} | 5 +- ...uration_space_T_cn.py => configuration.py} | 2 +- .../table_question_answering.py | 38 +- modelscope/models/nlp/space_T_en/__init__.py | 21 + .../text_to_sql.py} | 34 +- modelscope/models/nlp/structbert/__init__.py | 28 +- modelscope/models/nlp/structbert/backbone.py | 932 ++++++++ ...onfiguration_sbert.py => configuration.py} | 12 +- .../faq_question_answering.py} | 238 +- modelscope/models/nlp/structbert/fill_mask.py | 284 +++ .../models/nlp/structbert/modeling_sbert.py | 1963 ----------------- .../nlp/structbert/text_classification.py | 235 ++ .../nlp/structbert/token_classification.py | 229 ++ ...{tokenization_sbert.py => tokenization.py} | 0 ...ion_sbert_fast.py => tokenization_fast.py} | 2 +- modelscope/models/nlp/task_models/__init__.py | 7 + .../nlp/task_models/feature_extraction.py | 10 +- .../models/nlp/task_models/fill_mask.py | 3 +- .../nlp/task_models/information_extraction.py | 2 +- .../nncrf_for_named_entity_recognition.py | 121 +- .../task_models/sequence_classification.py | 24 +- .../models/nlp/task_models/task_model.py | 4 +- .../nlp/task_models/token_classification.py | 31 +- modelscope/models/nlp/text_ranking.py | 80 - modelscope/models/nlp/token_classification.py | 245 -- modelscope/models/nlp/veco/__init__.py | 24 +- modelscope/models/nlp/veco/backbone.py | 96 + ...configuration_veco.py => configuration.py} | 0 modelscope/models/nlp/veco/fill_mask.py | 99 + modelscope/models/nlp/veco/modeling_veco.py | 143 -- .../models/nlp/veco/text_classification.py | 150 ++ .../models/nlp/veco/token_classification.py | 107 + .../{tokenization_veco.py => tokenization.py} | 0 ...tion_veco_fast.py => tokenization_fast.py} | 2 +- .../task_datasets/torch_base_dataset.py | 1 + modelscope/outputs/__init__.py | 2 + .../fields => outputs/nlp}/__init__.py | 0 modelscope/outputs/nlp/model_outputs.py | 543 +++++ modelscope/{ => outputs}/outputs.py | 61 +- modelscope/pipelines/base.py | 36 +- modelscope/pipelines/builder.py | 6 +- modelscope/pipelines/nlp/__init__.py | 8 +- .../conversational_text_to_sql_pipeline.py | 13 - .../nlp/dialog_state_tracking_pipeline.py | 7 +- .../nlp/distributed_plug_pipeline.py | 11 +- .../nlp/faq_question_answering_pipeline.py | 28 +- .../pipelines/nlp/fill_mask_pipeline.py | 118 +- .../pipelines/nlp/fill_mask_ponet_pipeline.py | 136 -- .../nlp/named_entity_recognition_pipeline.py | 69 +- .../nlp/sentence_embedding_pipeline.py | 24 +- .../nlp/sequence_classification_pipeline.py | 84 - .../nlp/table_question_answering_pipeline.py | 6 +- .../nlp/text_classification_pipeline.py | 125 +- .../pipelines/nlp/text_ranking_pipeline.py | 20 +- .../nlp/token_classification_pipeline.py | 104 +- .../pipelines/nlp/translation_pipeline.py | 3 +- .../nlp/word_segmentation_pipeline.py | 66 +- 
.../nlp/zero_shot_classification_pipeline.py | 5 +- modelscope/preprocessors/__init__.py | 61 +- modelscope/preprocessors/base.py | 73 +- modelscope/preprocessors/nlp/__init__.py | 88 +- .../nlp/bert_seq_cls_tokenizer.py | 23 + .../nlp/document_segmentation_preprocessor.py | 220 ++ .../faq_question_answering_preprocessor.py | 90 + .../nlp/fill_mask_preprocessor.py | 142 ++ modelscope/preprocessors/nlp/nlp_base.py | 1178 ++-------- .../nlp/relation_extraction_preprocessor.py | 55 + .../sentence_classification_preprocessor.py | 25 + .../nlp/sentence_embedding_preprocessor.py | 52 + .../nlp/sentence_piece_preprocessor.py | 32 + .../preprocessors/{ => nlp}/space/__init__.py | 0 .../preprocessors/{ => nlp}/space/args.py | 5 +- .../preprocessors/{ => nlp}/space/batch.py | 3 + .../{ => nlp}/space/data_loader.py | 16 +- .../dialog_intent_prediction_preprocessor.py | 20 +- .../space/dialog_modeling_preprocessor.py | 17 +- .../dialog_state_tracking_preprocessor.py | 10 +- .../{ => nlp}/space/dst_processors.py | 0 .../nlp/space/fields/__init__.py | 23 + .../{ => nlp}/space/fields/gen_field.py | 2 +- .../{ => nlp}/space/fields/intent_field.py | 2 +- .../{ => nlp}/space/lazy_dataset.py | 7 +- .../{ => nlp}/space/preprocess.py | 7 +- .../preprocessors/{ => nlp}/space/sampler.py | 4 +- .../{ => nlp}/space/tensorlistdataset.py | 0 .../{ => nlp}/space/tokenizer.py | 2 + .../{ => nlp}/space_T_cn/__init__.py | 0 .../nlp/space_T_cn/fields/__init__.py | 0 .../{ => nlp}/space_T_cn/fields/database.py | 2 +- .../space_T_cn/fields/schema_link.py | 2 +- .../{ => nlp}/space_T_cn/fields/struct.py | 0 .../table_question_answering_preprocessor.py | 5 +- .../{star => nlp/space_T_en}/__init__.py | 0 ...conversational_text_to_sql_preprocessor.py | 17 +- .../space_T_en}/fields/__init__.py | 0 .../space_T_en}/fields/common_utils.py | 0 .../{star => nlp/space_T_en}/fields/parse.py | 0 .../space_T_en}/fields/preprocess_dataset.py | 2 +- .../space_T_en}/fields/process_dataset.py | 5 - .../nlp/text2text_generation_preprocessor.py | 40 + .../nlp/text_error_correction.py | 5 +- .../nlp/text_generation_jieba_preprocessor.py | 44 + .../nlp/text_generation_preprocessor.py | 62 + .../nlp/text_ranking_preprocessor.py | 67 + .../nlp/token_classification_preprocessor.py | 261 +++ .../zero_shot_classification_reprocessor.py | 51 + .../preprocessors/space/fields/__init__.py | 2 - .../space/fields/dst_processors.py | 1523 ------------- modelscope/trainers/__init__.py | 5 +- modelscope/trainers/default_config.py | 3 +- .../trainers/hooks/lr_scheduler_hook.py | 3 +- modelscope/trainers/hooks/optimizer/base.py | 1 + modelscope/trainers/nlp/__init__.py | 2 +- .../nlp/space/dialog_intent_trainer.py | 73 +- .../nlp/space/dialog_modeling_trainer.py | 3 +- .../trainers/nlp/space/trainer/gen_trainer.py | 9 +- .../nlp/space/trainer/intent_trainer.py | 151 +- .../trainers/nlp/text_ranking_trainer.py | 14 +- modelscope/trainers/nlp_trainer.py | 491 ++++- modelscope/trainers/trainer.py | 74 +- modelscope/utils/checkpoint.py | 2 +- modelscope/utils/constant.py | 1 - modelscope/utils/hub.py | 10 +- modelscope/utils/nlp/space/args.py | 4 +- modelscope/utils/nlp/space/clean_dataset.py | 2 + modelscope/utils/nlp/space/criterions.py | 2 + modelscope/utils/nlp/space/db_ops.py | 2 + modelscope/utils/nlp/space/ontology.py | 2 + modelscope/utils/nlp/space/scores.py | 3 + modelscope/utils/nlp/space/utils.py | 2 + modelscope/utils/nlp/space/utils_dst.py | 26 + modelscope/utils/nlp/space_T_en/__init__.py | 0 .../nlp/{nlp_utils.py => space_T_en/utils.py} | 24 +- 
modelscope/utils/registry.py | 1 + modelscope/utils/regress_test_utils.py | 70 +- modelscope/utils/tensor_utils.py | 14 +- ...st_export_sbert_sequence_classification.py | 39 +- tests/hub/test_download_dataset.py | 709 ++++++ tests/models/test_deberta_v2_backbone.py | 23 + tests/outputs/__init__.py | 0 tests/outputs/test_model_outputs.py | 30 + tests/pipelines/nlp/test_faq.py | 59 + .../test_conversational_text_to_sql.py | 3 +- .../test_dialog_intent_prediction.py | 9 +- tests/pipelines/test_dialog_modeling.py | 18 +- tests/pipelines/test_dialog_state_tracking.py | 16 +- .../pipelines/test_faq_question_answering.py | 6 +- tests/pipelines/test_fill_mask.py | 17 +- tests/pipelines/test_nli.py | 7 +- tests/pipelines/test_part_of_speech.py | 2 +- tests/pipelines/test_sentence_embedding.py | 4 +- tests/pipelines/test_sentence_similarity.py | 5 +- .../test_sentiment_classification.py | 5 +- .../test_table_question_answering.py | 2 +- tests/pipelines/test_text_classification.py | 100 + tests/pipelines/test_text_ranking.py | 4 +- .../test_finetune_sequence_classification.py | 53 +- tests/trainers/test_trainer_with_nlp.py | 3 +- 241 files changed, 10587 insertions(+), 11541 deletions(-) rename modelscope/models/nlp/T5/{modeling_t5.py => backbone.py} (73%) rename modelscope/models/nlp/T5/{configuration_t5.py => configuration.py} (99%) delete mode 100644 modelscope/models/nlp/T5/t5_for_text_generation.py create mode 100644 modelscope/models/nlp/T5/text2text_generation.py delete mode 100644 modelscope/models/nlp/backbones/bert.py delete mode 100644 modelscope/models/nlp/backbones/structbert.py create mode 100644 modelscope/models/nlp/bart/__init__.py rename modelscope/models/nlp/{bart_for_text_error_correction.py => bart/text_error_correction.py} (100%) create mode 100755 modelscope/models/nlp/bert/backbone.py rename modelscope/models/nlp/bert/{configuration_bert.py => configuration.py} (99%) rename modelscope/models/nlp/{bert_for_document_segmentation.py => bert/document_segmentation.py} (99%) create mode 100644 modelscope/models/nlp/bert/fill_mask.py delete mode 100755 modelscope/models/nlp/bert/modeling_bert.py create mode 100644 modelscope/models/nlp/bert/sentence_embedding.py create mode 100644 modelscope/models/nlp/bert/text_classification.py create mode 100644 modelscope/models/nlp/bert/text_ranking.py create mode 100644 modelscope/models/nlp/bert/token_classification.py create mode 100644 modelscope/models/nlp/csanmt/__init__.py rename modelscope/models/nlp/{csanmt_for_translation.py => csanmt/translation.py} (100%) rename modelscope/models/nlp/deberta_v2/{modeling_deberta_v2.py => backbone.py} (64%) rename modelscope/models/nlp/deberta_v2/{configuration_deberta_v2.py => configuration.py} (98%) create mode 100644 modelscope/models/nlp/deberta_v2/fill_mask.py rename modelscope/models/nlp/deberta_v2/{tokenization_deberta_v2.py => tokenization.py} (100%) rename modelscope/models/nlp/deberta_v2/{tokenization_deberta_v2_fast.py => tokenization_fast.py} (99%) rename modelscope/models/nlp/gpt3/{modeling_gpt3.py => backbone.py} (99%) rename modelscope/models/nlp/gpt3/{configuration_gpt3.py => configuration.py} (100%) rename modelscope/models/nlp/gpt3/{gpt3_for_text_generation.py => text_generation.py} (100%) rename modelscope/models/nlp/gpt3/{tokenizer_gpt3.py => tokenizer.py} (100%) rename modelscope/models/nlp/{backbones => gpt_neo}/__init__.py (83%) rename modelscope/models/nlp/{backbones/gpt_neo.py => gpt_neo/backbone.py} (74%) delete mode 100644 modelscope/models/nlp/masked_language.py rename 
modelscope/models/nlp/palm_v2/{modeling_palm.py => backbone.py} (99%) rename modelscope/models/nlp/palm_v2/{configuration_palm.py => configuration.py} (100%) rename modelscope/models/nlp/palm_v2/{palm_for_text_generation.py => text_generation.py} (100%) rename modelscope/models/nlp/plug/{modeling_plug.py => backbone.py} (99%) rename modelscope/models/nlp/plug/{configuration_plug.py => configuration.py} (100%) rename modelscope/models/nlp/ponet/{modeling_ponet.py => backbone.py} (55%) rename modelscope/models/nlp/ponet/{configuration_ponet.py => configuration.py} (96%) create mode 100644 modelscope/models/nlp/ponet/fill_mask.py rename modelscope/models/nlp/ponet/{tokenization_ponet.py => tokenization.py} (98%) delete mode 100644 modelscope/models/nlp/ponet_for_masked_language.py delete mode 100644 modelscope/models/nlp/sentence_embedding.py delete mode 100644 modelscope/models/nlp/sequence_classification.py rename modelscope/models/nlp/space/{model/configuration_space.py => configuration.py} (100%) rename modelscope/models/nlp/space/{space_for_dialog_intent_prediction.py => dialog_intent_prediction.py} (66%) rename modelscope/models/nlp/space/{space_for_dialog_modeling.py => dialog_modeling.py} (73%) rename modelscope/models/nlp/space/{model/modeling_space.py => dialog_state_tracking.py} (57%) delete mode 100644 modelscope/models/nlp/space/space_for_dialog_state_tracking.py rename modelscope/models/nlp/space_T_cn/{modeling_space_T_cn.py => backbone.py} (99%) rename modelscope/models/nlp/space_T_cn/{configuration_space_T_cn.py => configuration.py} (100%) rename modelscope/models/nlp/{ => space_T_cn}/table_question_answering.py (94%) create mode 100644 modelscope/models/nlp/space_T_en/__init__.py rename modelscope/models/nlp/{star_text_to_sql.py => space_T_en/text_to_sql.py} (59%) create mode 100755 modelscope/models/nlp/structbert/backbone.py rename modelscope/models/nlp/structbert/{configuration_sbert.py => configuration.py} (94%) rename modelscope/models/nlp/{sbert_for_faq_question_answering.py => structbert/faq_question_answering.py} (74%) create mode 100644 modelscope/models/nlp/structbert/fill_mask.py delete mode 100755 modelscope/models/nlp/structbert/modeling_sbert.py create mode 100644 modelscope/models/nlp/structbert/text_classification.py create mode 100644 modelscope/models/nlp/structbert/token_classification.py rename modelscope/models/nlp/structbert/{tokenization_sbert.py => tokenization.py} (100%) rename modelscope/models/nlp/structbert/{tokenization_sbert_fast.py => tokenization_fast.py} (99%) rename modelscope/models/nlp/{ => task_models}/nncrf_for_named_entity_recognition.py (83%) delete mode 100644 modelscope/models/nlp/text_ranking.py delete mode 100644 modelscope/models/nlp/token_classification.py create mode 100644 modelscope/models/nlp/veco/backbone.py rename modelscope/models/nlp/veco/{configuration_veco.py => configuration.py} (100%) create mode 100644 modelscope/models/nlp/veco/fill_mask.py delete mode 100644 modelscope/models/nlp/veco/modeling_veco.py create mode 100644 modelscope/models/nlp/veco/text_classification.py create mode 100644 modelscope/models/nlp/veco/token_classification.py rename modelscope/models/nlp/veco/{tokenization_veco.py => tokenization.py} (100%) rename modelscope/models/nlp/veco/{tokenization_veco_fast.py => tokenization_fast.py} (99%) create mode 100644 modelscope/outputs/__init__.py rename modelscope/{preprocessors/space_T_cn/fields => outputs/nlp}/__init__.py (100%) create mode 100644 modelscope/outputs/nlp/model_outputs.py rename 
modelscope/{ => outputs}/outputs.py (93%) delete mode 100644 modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py delete mode 100644 modelscope/pipelines/nlp/sequence_classification_pipeline.py create mode 100644 modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py create mode 100644 modelscope/preprocessors/nlp/document_segmentation_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/fill_mask_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/relation_extraction_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/sentence_classification_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/sentence_piece_preprocessor.py rename modelscope/preprocessors/{ => nlp}/space/__init__.py (100%) rename modelscope/preprocessors/{ => nlp}/space/args.py (97%) rename modelscope/preprocessors/{ => nlp}/space/batch.py (96%) rename modelscope/preprocessors/{ => nlp}/space/data_loader.py (87%) rename modelscope/preprocessors/{ => nlp}/space/dialog_intent_prediction_preprocessor.py (64%) rename modelscope/preprocessors/{ => nlp}/space/dialog_modeling_preprocessor.py (75%) rename modelscope/preprocessors/{ => nlp}/space/dialog_state_tracking_preprocessor.py (92%) rename modelscope/preprocessors/{ => nlp}/space/dst_processors.py (100%) create mode 100644 modelscope/preprocessors/nlp/space/fields/__init__.py rename modelscope/preprocessors/{ => nlp}/space/fields/gen_field.py (99%) rename modelscope/preprocessors/{ => nlp}/space/fields/intent_field.py (99%) rename modelscope/preprocessors/{ => nlp}/space/lazy_dataset.py (93%) rename modelscope/preprocessors/{ => nlp}/space/preprocess.py (92%) rename modelscope/preprocessors/{ => nlp}/space/sampler.py (96%) rename modelscope/preprocessors/{ => nlp}/space/tensorlistdataset.py (100%) rename modelscope/preprocessors/{ => nlp}/space/tokenizer.py (99%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/__init__.py (100%) create mode 100644 modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py rename modelscope/preprocessors/{ => nlp}/space_T_cn/fields/database.py (98%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/fields/schema_link.py (99%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/fields/struct.py (100%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/table_question_answering_preprocessor.py (96%) rename modelscope/preprocessors/{star => nlp/space_T_en}/__init__.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/conversational_text_to_sql_preprocessor.py (84%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/__init__.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/common_utils.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/parse.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/preprocess_dataset.py (95%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/process_dataset.py (94%) create mode 100644 modelscope/preprocessors/nlp/text2text_generation_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/text_generation_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/text_ranking_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/token_classification_preprocessor.py create mode 
100644 modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py delete mode 100644 modelscope/preprocessors/space/fields/__init__.py delete mode 100644 modelscope/preprocessors/space/fields/dst_processors.py create mode 100644 modelscope/utils/nlp/space_T_en/__init__.py rename modelscope/utils/nlp/{nlp_utils.py => space_T_en/utils.py} (52%) create mode 100644 tests/hub/test_download_dataset.py create mode 100644 tests/models/test_deberta_v2_backbone.py create mode 100644 tests/outputs/__init__.py create mode 100644 tests/outputs/test_model_outputs.py create mode 100644 tests/pipelines/nlp/test_faq.py create mode 100644 tests/pipelines/test_text_classification.py diff --git a/data/test/regression/fill_mask_sbert_zh.bin b/data/test/regression/fill_mask_sbert_zh.bin index 812f7ba2..62581a26 100644 --- a/data/test/regression/fill_mask_sbert_zh.bin +++ b/data/test/regression/fill_mask_sbert_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280 -size 119940 +oid sha256:4eae921001139d7e3c06331c9ef2213f8fc1c23512acd95751559866fb770e96 +size 121855 diff --git a/data/test/regression/fill_mask_veco_en.bin b/data/test/regression/fill_mask_veco_en.bin index be3fddc8..4d2dba7d 100644 --- a/data/test/regression/fill_mask_veco_en.bin +++ b/data/test/regression/fill_mask_veco_en.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705 -size 119619 +oid sha256:f97d34d7450d17d0a93647129ab10d16b1f6e70c34a73b6f7687b79519ee4f71 +size 121563 diff --git a/data/test/regression/fill_mask_veco_zh.bin b/data/test/regression/fill_mask_veco_zh.bin index c0d27e20..a6eb5621 100644 --- a/data/test/regression/fill_mask_veco_zh.bin +++ b/data/test/regression/fill_mask_veco_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a -size 119619 +oid sha256:a8355f27a3235209f206b5e75f4400353e5989e94cf4d71270b42ded8821d536 +size 121563 diff --git a/data/test/regression/sbert-base-tnews.bin b/data/test/regression/sbert-base-tnews.bin index 1546860f..d2c63ab0 100644 --- a/data/test/regression/sbert-base-tnews.bin +++ b/data/test/regression/sbert-base-tnews.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bce1341f4b55d536771dad6e2b280458579f46c3216474ceb8a926022ab53d0 -size 151572 +oid sha256:344ef971bdf310b76c6571d1f4994ab6abc5edc659654d71a4f75b14a30960c2 +size 152926 diff --git a/data/test/regression/sbert_nli.bin b/data/test/regression/sbert_nli.bin index 68efb778..52e31692 100644 --- a/data/test/regression/sbert_nli.bin +++ b/data/test/regression/sbert_nli.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6af5024a26337a440c7ea2935fce84af558dd982ee97a2f027bb922cc874292b -size 61741 +oid sha256:f0aeb07b6c9b40a0cfa7492e839431764e9bece93c906833a07c05e83520a399 +size 63161 diff --git a/data/test/regression/sbert_sen_sim.bin b/data/test/regression/sbert_sen_sim.bin index 362f762c..1c8efb81 100644 --- a/data/test/regression/sbert_sen_sim.bin +++ b/data/test/regression/sbert_sen_sim.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbce084781342ca7274c2e4d02ed5c5de43ba213a3b76328d5994404d6544c41 -size 61745 +oid sha256:7aa5c7a2565ccf0d2eea4baf8adbd0e020dbe36a7159b31156c53141cc9b2df2 +size 63165 diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin index 
6e441f7f..3ad45356 100644 --- a/data/test/regression/sbert_ws_en.bin +++ b/data/test/regression/sbert_ws_en.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c -size 61239 +oid sha256:cc6de82a8485fbfa008f6c2d5411cd07ba03e4a780bcb4e67efc6fba3c6ce92f +size 63597 diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index b1841351..a85d787f 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1 -size 61115 +oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030 +size 63349 diff --git a/data/test/regression/sbert_zero_shot.bin b/data/test/regression/sbert_zero_shot.bin index 23d40946..04171523 100644 --- a/data/test/regression/sbert_zero_shot.bin +++ b/data/test/regression/sbert_zero_shot.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85 -size 61589 +oid sha256:01f9b9bf6f8bbf9bb377d4cb6f399b2e5e065381f5b7332343e0db7b4fae72a5 +size 62519 diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py index f19d2bbb..c8b7900e 100644 --- a/modelscope/exporters/base.py +++ b/modelscope/exporters/base.py @@ -19,10 +19,13 @@ class Exporter(ABC): def from_model(cls, model: Model, **kwargs): """Build the Exporter instance. - @param model: A model instance. it will be used to output the generated file, + Args: + model: A Model instance. it will be used to generate the intermediate format file, and the configuration.json in its model_dir field will be used to create the exporter instance. - @param kwargs: Extra kwargs used to create the Exporter instance. - @return: The Exporter instance + kwargs: Extra kwargs used to create the Exporter instance. + + Returns: + The Exporter instance """ cfg = Config.from_file( os.path.join(model.model_dir, ModelFile.CONFIGURATION)) @@ -44,10 +47,13 @@ class Exporter(ABC): In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param opset: The version of the ONNX operator set to use. - @param outputs: The output dir. - @param kwargs: In this default implementation, - kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape). - @return: A dict contains the model name with the model file path. + Args: + opset: The version of the ONNX operator set to use. + outputs: The output dir. + kwargs: In this default implementation, + kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape). + + Returns: + A dict contains the model name with the model file path. """ pass diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py index 52dab4bc..7cee331b 100644 --- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py +++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py @@ -27,11 +27,14 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): **kwargs) -> Dict[str, Any]: """Generate dummy inputs for model exportation to onnx or other formats by tracing. - @param shape: A tuple of input shape which should have at most two dimensions. 
- shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor. - shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor. - @param pair: Generate sentence pairs or single sentences for dummy inputs. - @return: Dummy inputs. + Args: + shape: A tuple of input shape which should have at most two dimensions. + shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor. + shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor. + pair(bool, `optional`): Whether to generate sentence pairs or single sentences. + + Returns: + Dummy inputs. """ cfg = Config.from_file( diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 98a23fe5..94ef277a 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -13,8 +13,8 @@ from modelscope.models import TorchModel from modelscope.pipelines.base import collate_fn from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger -from modelscope.utils.regress_test_utils import compare_arguments_nested -from modelscope.utils.tensor_utils import torch_nested_numpify +from modelscope.utils.regress_test_utils import (compare_arguments_nested, + numpify_tensor_nested) from .base import Exporter logger = get_logger(__name__) @@ -28,49 +28,61 @@ class TorchModelExporter(Exporter): and to provide implementations for generate_dummy_inputs/inputs/outputs methods. """ - def export_onnx(self, outputs: str, opset=11, **kwargs): + def export_onnx(self, output_dir: str, opset=13, **kwargs): """Export the model as onnx format files. In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param opset: The version of the ONNX operator set to use. - @param outputs: The output dir. - @param kwargs: In this default implementation, - you can pass the arguments needed by _torch_export_onnx, other unrecognized args - will be carried to generate_dummy_inputs as extra arguments (such as input shape). - @return: A dict containing the model key - model file path pairs. + Args: + opset: The version of the ONNX operator set to use. + output_dir: The output dir. + kwargs: + model: A model instance which will replace the exporting of self.model. + In this default implementation, + you can pass the arguments needed by _torch_export_onnx, other unrecognized args + will be carried to generate_dummy_inputs as extra arguments (such as input shape). + + Returns: + A dict containing the model key - model file path pairs. """ - model = self.model + model = self.model if 'model' not in kwargs else kwargs.pop('model') if not isinstance(model, nn.Module) and hasattr(model, 'model'): model = model.model - onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE) + onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE) self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs) return {'model': onnx_file} - def export_torch_script(self, outputs: str, **kwargs): + def export_torch_script(self, output_dir: str, **kwargs): """Export the model as torch script files. In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param outputs: The output dir. - @param kwargs: In this default implementation, + Args: + output_dir: The output dir. 
+ kwargs: + model: A model instance which will replace the exporting of self.model. + In this default implementation, you can pass the arguments needed by _torch_export_torch_script, other unrecognized args will be carried to generate_dummy_inputs as extra arguments (like input shape). - @return: A dict contains the model name with the model file path. + + Returns: + A dict contains the model name with the model file path. """ - model = self.model + model = self.model if 'model' not in kwargs else kwargs.pop('model') if not isinstance(model, nn.Module) and hasattr(model, 'model'): model = model.model - ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE) + ts_file = os.path.join(output_dir, ModelFile.TS_MODEL_FILE) # generate ts by tracing self._torch_export_torch_script(model, ts_file, **kwargs) return {'model': ts_file} def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]: """Generate dummy inputs for model exportation to onnx or other formats by tracing. - @return: Dummy inputs. + + Returns: + Dummy inputs. """ return None @@ -93,7 +105,7 @@ class TorchModelExporter(Exporter): def _torch_export_onnx(self, model: nn.Module, output: str, - opset: int = 11, + opset: int = 13, device: str = 'cpu', validation: bool = True, rtol: float = None, @@ -101,18 +113,27 @@ class TorchModelExporter(Exporter): **kwargs): """Export the model to an onnx format file. - @param model: A torch.nn.Module instance to export. - @param output: The output file. - @param opset: The version of the ONNX operator set to use. - @param device: The device used to forward. - @param validation: Whether validate the export file. - @param rtol: The rtol used to regress the outputs. - @param atol: The atol used to regress the outputs. + Args: + model: A torch.nn.Module instance to export. + output: The output file. + opset: The version of the ONNX operator set to use. + device: The device used to forward. + validation: Whether validate the export file. + rtol: The rtol used to regress the outputs. + atol: The atol used to regress the outputs. + kwargs: + dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs(). + inputs: An inputs structure which will replace the calling of self.inputs. + outputs: An outputs structure which will replace the calling of self.outputs. 
""" - dummy_inputs = self.generate_dummy_inputs(**kwargs) - inputs = self.inputs - outputs = self.outputs + dummy_inputs = self.generate_dummy_inputs( + **kwargs) if 'dummy_inputs' not in kwargs else kwargs.pop( + 'dummy_inputs') + inputs = self.inputs if 'inputs' not in kwargs else kwargs.pop( + 'inputs') + outputs = self.outputs if 'outputs' not in kwargs else kwargs.pop( + 'outputs') if dummy_inputs is None or inputs is None or outputs is None: raise NotImplementedError( 'Model property dummy_inputs,inputs,outputs must be set.') @@ -125,7 +146,7 @@ class TorchModelExporter(Exporter): if isinstance(dummy_inputs, Mapping): dummy_inputs = dict(dummy_inputs) - onnx_outputs = list(self.outputs.keys()) + onnx_outputs = list(outputs.keys()) with replace_call(): onnx_export( @@ -160,11 +181,13 @@ class TorchModelExporter(Exporter): outputs_origin = model.forward( *_decide_input_format(model, dummy_inputs)) if isinstance(outputs_origin, Mapping): - outputs_origin = torch_nested_numpify( + outputs_origin = numpify_tensor_nested( list(outputs_origin.values())) + elif isinstance(outputs_origin, (tuple, list)): + outputs_origin = numpify_tensor_nested(outputs_origin) outputs = ort_session.run( onnx_outputs, - torch_nested_numpify(dummy_inputs), + numpify_tensor_nested(dummy_inputs), ) tols = {} @@ -184,19 +207,26 @@ class TorchModelExporter(Exporter): validation: bool = True, rtol: float = None, atol: float = None, + strict: bool = True, **kwargs): """Export the model to a torch script file. - @param model: A torch.nn.Module instance to export. - @param output: The output file. - @param device: The device used to forward. - @param validation: Whether validate the export file. - @param rtol: The rtol used to regress the outputs. - @param atol: The atol used to regress the outputs. + Args: + model: A torch.nn.Module instance to export. + output: The output file. + device: The device used to forward. + validation: Whether validate the export file. + rtol: The rtol used to regress the outputs. + atol: The atol used to regress the outputs. + strict: strict mode in torch script tracing. + kwargs: + dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs(). """ model.eval() - dummy_inputs = self.generate_dummy_inputs(**kwargs) + dummy_param = 'dummy_inputs' not in kwargs + dummy_inputs = self.generate_dummy_inputs( + **kwargs) if dummy_param else kwargs.pop('dummy_inputs') if dummy_inputs is None: raise NotImplementedError( 'Model property dummy_inputs must be set.') @@ -207,7 +237,7 @@ class TorchModelExporter(Exporter): model.eval() with replace_call(): traced_model = torch.jit.trace( - model, dummy_inputs, strict=False) + model, dummy_inputs, strict=strict) torch.jit.save(traced_model, output) if validation: @@ -216,9 +246,9 @@ class TorchModelExporter(Exporter): model.eval() ts_model.eval() outputs = ts_model.forward(*dummy_inputs) - outputs = torch_nested_numpify(outputs) + outputs = numpify_tensor_nested(outputs) outputs_origin = model.forward(*dummy_inputs) - outputs_origin = torch_nested_numpify(outputs_origin) + outputs_origin = numpify_tensor_nested(outputs_origin) tols = {} if rtol is not None: tols['rtol'] = rtol @@ -240,7 +270,6 @@ def replace_call(): problems. Here we recover the call method to the default implementation of torch.nn.Module, and change it back after the tracing was done. 
""" - TorchModel.call_origin, TorchModel.__call__ = TorchModel.__call__, TorchModel._call_impl yield TorchModel.__call__ = TorchModel.call_origin diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 913589d8..01b08699 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -69,7 +69,6 @@ class Models(object): space_modeling = 'space-modeling' space_T_en = 'space-T-en' space_T_cn = 'space-T-cn' - tcrf = 'transformer-crf' transformer_softmax = 'transformer-softmax' lcrf = 'lstm-crf' diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 3a9d810f..1b9db825 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -10,9 +10,6 @@ class Metric(ABC): complex metrics for a specific task with or without other Metric subclasses. """ - def __init__(self, trainer=None, *args, **kwargs): - self.trainer = trainer - @abstractmethod def add(self, outputs: Dict, inputs: Dict): """ Append logits and labels within an eval loop. diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index 05b72170..f8595fc1 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -34,17 +34,24 @@ class TokenClassificationMetric(Metric): self.labels.append( torch_nested_numpify(torch_nested_detach(ground_truths))) - def __init__(self, return_entity_level_metrics=False, *args, **kwargs): + def __init__(self, + return_entity_level_metrics=False, + label2id=None, + *args, + **kwargs): super().__init__(*args, **kwargs) self.return_entity_level_metrics = return_entity_level_metrics self.preds = [] self.labels = [] + self.label2id = label2id def evaluate(self): - self.id2label = { - id: label - for label, id in self.trainer.label2id.items() - } + label2id = self.label2id + if label2id is None: + assert hasattr(self, 'trainer') + label2id = self.trainer.label2id + + self.id2label = {id: label for label, id in label2id.items()} self.preds = np.concatenate(self.preds, axis=0) self.labels = np.concatenate(self.labels, axis=0) predictions = np.argmax(self.preds, axis=-1) diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index cdc71fcf..1246551e 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -5,11 +5,11 @@ from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download -from modelscope.models.builder import build_model -from modelscope.utils.checkpoint import save_pretrained +from modelscope.models.builder import MODELS, build_model +from modelscope.utils.checkpoint import save_checkpoint, save_pretrained from modelscope.utils.config import Config -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile -from modelscope.utils.device import device_placement, verify_device +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks +from modelscope.utils.device import verify_device from modelscope.utils.logger import get_logger logger = get_logger() @@ -66,7 +66,6 @@ class Model(ABC): revision: Optional[str] = DEFAULT_MODEL_REVISION, cfg_dict: Config = None, device: str = None, - *model_args, **kwargs): """ Instantiate a model from local directory or remote model repo. Note that when loading from remote, the model revision can be specified. 
@@ -90,11 +89,11 @@ class Model(ABC): cfg = Config.from_file( osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task + if 'task' in kwargs: + task_name = kwargs.pop('task') model_cfg = cfg.model - if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type - model_cfg.model_dir = local_model_dir for k, v in kwargs.items(): model_cfg[k] = v @@ -109,15 +108,19 @@ class Model(ABC): # dynamically add pipeline info to model for pipeline inference if hasattr(cfg, 'pipeline'): model.pipeline = cfg.pipeline + + if not hasattr(model, 'cfg'): + model.cfg = cfg return model def save_pretrained(self, target_folder: Union[str, os.PathLike], save_checkpoint_names: Union[str, List[str]] = None, - save_function: Callable = None, + save_function: Callable = save_checkpoint, config: Optional[dict] = None, **kwargs): - """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded + """save the pretrained model, its configuration and other related files to a directory, + so that it can be re-loaded Args: target_folder (Union[str, os.PathLike]): @@ -133,5 +136,10 @@ class Model(ABC): The config for the configuration.json, might not be identical with model.config """ + if config is None and hasattr(self, 'cfg'): + config = self.cfg + assert config is not None, 'Cannot save the model because the model config is empty.' + if isinstance(config, Config): + config = config.to_dict() save_pretrained(self, target_folder, save_checkpoint_names, save_function, config, **kwargs) diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index 7a8e28f4..a35358c1 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.utils.config import ConfigDict +from modelscope.utils.constant import Tasks from modelscope.utils.registry import TYPE_NAME, Registry, build_from_cfg MODELS = Registry('models') BACKBONES = Registry('backbones') +BACKBONES._modules = MODELS._modules HEADS = Registry('heads') @@ -23,30 +25,27 @@ def build_model(cfg: ConfigDict, cfg, MODELS, group_key=task_name, default_args=default_args) -def build_backbone(cfg: ConfigDict, - field: str = None, - default_args: dict = None): +def build_backbone(cfg: ConfigDict, default_args: dict = None): """ build backbone given backbone config dict Args: cfg (:obj:`ConfigDict`): config dict for backbone object. - field (str, optional): field, such as CV, NLP's backbone default_args (dict, optional): Default initialization arguments. """ return build_from_cfg( - cfg, BACKBONES, group_key=field, default_args=default_args) + cfg, BACKBONES, group_key=Tasks.backbone, default_args=default_args) def build_head(cfg: ConfigDict, - group_key: str = None, + task_name: str = None, default_args: dict = None): """ build head given config dict Args: cfg (:obj:`ConfigDict`): config dict for head object. + task_name (str, optional): task name, refer to + :obj:`Tasks` for more details default_args (dict, optional): Default initialization arguments. 
""" - if group_key is None: - group_key = cfg[TYPE_NAME] return build_from_cfg( - cfg, HEADS, group_key=group_key, default_args=default_args) + cfg, HEADS, group_key=task_name, default_args=default_args) diff --git a/modelscope/models/nlp/T5/__init__.py b/modelscope/models/nlp/T5/__init__.py index 7c1cea36..cb0921c6 100644 --- a/modelscope/models/nlp/T5/__init__.py +++ b/modelscope/models/nlp/T5/__init__.py @@ -1,13 +1,17 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .t5_for_text_generation import T5ForConditionalGeneration + from .backbone import T5Model + from .text2text_generation import T5ForConditionalGeneration else: _import_structure = { - 't5_for_text_generation': ['T5ForConditionalGeneration'], + 'backbone': ['T5Model'], + 'text2text_generation': ['T5ForConditionalGeneration'], } import sys diff --git a/modelscope/models/nlp/T5/modeling_t5.py b/modelscope/models/nlp/T5/backbone.py similarity index 73% rename from modelscope/models/nlp/T5/modeling_t5.py rename to modelscope/models/nlp/T5/backbone.py index da50741e..9a46d980 100644 --- a/modelscope/models/nlp/T5/modeling_t5.py +++ b/modelscope/models/nlp/T5/backbone.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,12 +22,8 @@ from typing import Optional, Tuple, Union import torch from torch import nn -from torch.nn import CrossEntropyLoss from torch.utils.checkpoint import checkpoint from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, Seq2SeqModelOutput) from transformers.modeling_utils import (PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer) @@ -36,30 +33,20 @@ from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, from transformers.utils.model_parallel_utils import (assert_device_map, get_device_map) +from modelscope.metainfo import Models +from modelscope.models.base import Model, Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import (BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqModelOutput) +from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger -from .configuration_t5 import T5Config +from .configuration import T5Config logger = get_logger(__name__) -_CONFIG_FOR_DOC = 'T5Config' -_TOKENIZER_FOR_DOC = 'T5Tokenizer' -_CHECKPOINT_FOR_DOC = 't5-small' -#################################################### -# This dict contains ids and associated url -# for the pretrained weights provided with the models -#################################################### -T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ - 't5-small', - 't5-base', - 't5-large', - 't5-3b', - 't5-11b', - # See all T5 models at https://huggingface.co/models?filter=t5 -] - - -#################################################### +################################################### # This is a conversion method from TF 1.0 to PyTorch # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 #################################################### @@ -173,65 +160,6 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): return model -#################################################### -# 
PyTorch Models are constructed by sub-classing -# - torch.nn.Module for the layers and -# - PreTrainedModel for the models (it-self a sub-class of nn.Module) -#################################################### -PARALLELIZE_DOCSTRING = r""" - This is an experimental feature and is a subject to change at a moment's notice. - - Uses a device map to distribute attention modules of the model across several devices. If no device map is given, - it will evenly distribute blocks across all devices. - - Args: - device_map (`Dict[int, list]`, optional, defaults to None): - A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always - automatically mapped to the first device (for esoteric reasons). That means that the first device should - have fewer attention modules mapped to it than other devices. For reference, the t5 models have the - following number of attention modules: - - - t5-small: 6 - - t5-base: 12 - - t5-large: 24 - - t5-3b: 24 - - t5-11b: 24 - - Example: - - ```python - # Here is an example of a device map on a machine with 4 GPUs - # using t5-3b, which has a total of 24 attention modules: - model = T5ForConditionalGeneration.from_pretrained("t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) - ``` -""" -DEPARALLELIZE_DOCSTRING = r""" - Moves the model to cpu from a model parallel state. - - Example: - - ```python - # On a 4 GPU machine with t5-3b: - model = T5ForConditionalGeneration.from_pretrained("t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) # Splits the model across several devices - model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() - ``` -""" - - class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): @@ -261,23 +189,6 @@ class T5LayerNorm(nn.Module): return self.weight * hidden_states -try: - from apex.normalization import FusedRMSNorm - - T5LayerNorm = FusedRMSNorm # noqa - - logger.info( - 'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm' - ) -except ImportError: - # using the normal T5LayerNorm - pass -except Exception: - logger.warning( - 'discovered apex but it failed to load, falling back to T5LayerNorm') - pass - - class T5DenseReluDense(nn.Module): def __init__(self, config: T5Config): @@ -791,7 +702,7 @@ class T5Block(nn.Module): return outputs -class T5PreTrainedModel(PreTrainedModel): +class T5PreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
@@ -803,6 +714,10 @@ class T5PreTrainedModel(PreTrainedModel): is_parallelizable = True supports_gradient_checkpointing = True + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + @property def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) @@ -819,8 +734,7 @@ class T5PreTrainedModel(PreTrainedModel): factor = self.config.initializer_factor # Used for testing weights initialization if isinstance(module, T5LayerNorm): module.weight.data.fill_(factor * 1.0) - elif isinstance(module, - (T5Model, T5ForConditionalGeneration, T5EncoderModel)): + elif isinstance(module, T5Model): # Mesh TensorFlow embeddings initialization See # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) @@ -902,6 +816,36 @@ class T5PreTrainedModel(PreTrainedModel): return shifted_input_ids + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the + label information. num_labels: An optional arg to tell the + model how many classes to initialize. + Method will call utils.parse_label_mapping + if num_labels not supplied. If num_labels is + not found, the model will use the default + setting (2 classes). + + Returns: + The loaded model, which is initialized by + transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir', None) + if model_dir is None: + config = T5Config(**kwargs) + model = cls(config) + else: + model_kwargs = {} + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model + class T5Stack(T5PreTrainedModel): @@ -926,8 +870,42 @@ class T5Stack(T5PreTrainedModel): self.device_map = None self.gradient_checkpointing = False - @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + r""" + This is an experimental feature and is a subject to change at a + moment's notice. + + Uses a device map to distribute attention modules of the model + across several devices. If no device map is given, it will evenly + distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note + that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric + reasons). That means that the first device should have fewer + attention modules mapped to it than other devices. 
For + reference, the t5 models have the following number of + attention modules: + + - t5-small: 6 + - t5-base: 12 + - t5-large: 24 + - t5-3b: 24 + - t5-11b: 24 + + Example: + + ```python # Here is an example of a device map on a machine with 4 + GPUs # using t5-3b, which has a total of 24 attention modules: model + = T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = { + 0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14, + 15, 16], 3: [17, 18, 19, 20, 21, 22, 23], + } model.parallelize(device_map) ``` all of the parallelize methods + in this file are the same + + """ # Check validity of device_map self.device_map = ( get_device_map(len(self.block), range(torch.cuda.device_count())) @@ -948,8 +926,22 @@ class T5Stack(T5PreTrainedModel): # Set final layer norm to last device self.final_layer_norm = self.final_layer_norm.to(self.last_device) - @add_start_docstrings(PARALLELIZE_DOCSTRING) def deparallelize(self): + r""" + Moves the model to cpu from a model parallel state. + + Example: + + ```python # On a 4 GPU machine with t5-3b: model = + T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = { + 0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14, + 15, 16], 3: [17, 18, 19, 20, 21, 22, 23], + } model.parallelize(device_map) # Splits the model across several + devices model.deparallelize() # Put the model back on cpu and + cleans memory by calling torch.cuda.empty_cache() ``` + + all of the deparallelize methods in this file are the same + """ self.model_parallel = False self.device_map = None self.first_device = 'cpu' @@ -1199,7 +1191,20 @@ class T5Stack(T5PreTrainedModel): ) -T5_START_DOCSTRING = r""" +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.T5) +class T5Model(T5PreTrainedModel): + """The bare T5 Model transformer outputting raw hidden-states without any + specific head on top. The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by @@ -1224,10 +1229,99 @@ T5_START_DOCSTRING = r""" with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" + """ + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) -T5_INPUTS_DOCSTRING = r""" - Args: + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.model_parallel = True + + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of + heads to prune in this layer} See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the @@ -1343,244 +1437,84 @@ T5_INPUTS_DOCSTRING = r""" return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" + Returns: -T5_ENCODER_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model - with relative position embeddings so you should be able to pad the - inputs on both the right and the left. + Example: - Indices can be obtained using [`T5Tokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for detail. + ```python >>> from transformers import T5Tokenizer, T5Model - To know more on how to prepare `input_ids` for pretraining take a - look a [T5 Training](./t5#training). - attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5Model.from_pretrained("t5-small") - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. + >>> input_ids = tokenizer( + ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" + >>> ).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, - num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask - values selected in `[0, 1]`: + >>> # forward pass + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you want - more control over how to convert `input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain - tuple. 
-""" + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] + if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 else None, + ) -# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask -__HEAD_MASK_WARNING_MSG = """ -The input argument `head_mask` was split into two arguments `head_mask` and -`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, -but this feature is deprecated and will be removed in future versions. If you do -not want to use any `decoder_head_mask` now, please set `decoder_head_mask = -torch.ones(num_layers, num_heads)`. -""" + hidden_states = encoder_outputs[0] + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to( + self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device) - -@add_start_docstrings( - 'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.', - T5_START_DOCSTRING, -) -class T5Model(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r'encoder\.embed_tokens\.weight', - r'decoder\.embed_tokens\.weight', - ] - _keys_to_ignore_on_load_unexpected = [ - r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.decoder = 
self.decoder.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of - heads to prune in this layer} See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - decoder_inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: - r""" - Returns: - - Example: - - ```python >>> from transformers import T5Tokenizer, T5Model - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5Model.from_pretrained("t5-small") - - >>> input_ids = tokenizer( - ... 
"Studies have been shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - - >>> # forward pass - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - >>> last_hidden_states = outputs.last_hidden_state - ```""" - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] - if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] - if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to( - self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to( - self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) if not return_dict: return decoder_outputs + encoder_outputs @@ -1595,409 +1529,3 @@ class T5Model(T5PreTrainedModel): encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) - - -@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", - T5_START_DOCSTRING) -class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r'encoder\.embed_tokens\.weight', - 
r'decoder\.embed_tokens\.weight', - r'lm_head\.weight', - ] - _keys_to_ignore_on_load_unexpected = [ - r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.model_dim = config.d_model - - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.lm_head = self.lm_head.to(self.decoder.first_device) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.decoder = self.decoder.to('cpu') - self.lm_head = self.lm_head.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. 
- Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All - labels set to `-100` are ignored (masked), the loss is only computed - for labels in `[0, ..., config.vocab_size]` - - Returns: - - Examples: - - ```python >>> from transformers import T5Tokenizer, - T5ForConditionalGeneration - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - - >>> # training - >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids - >>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids - >>> outputs = model(input_ids=input_ids, labels=labels) - >>> loss = outputs.loss - >>> logits = outputs.logits - - >>> # inference - >>> input_ids = tokenizer( - ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> outputs = model.generate(input_ids) - >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) - >>> # studies have shown that owning a dog is good for you. - ```""" - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - # Convert encoder inputs in embeddings if needed - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] - if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] - if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to( - self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to( - self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - 
sequence_output = decoder_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.encoder.first_device) - self.lm_head = self.lm_head.to(self.encoder.first_device) - sequence_output = sequence_output.to(self.lm_head.weight.device) - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab See - # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim**-0.5) - - lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct( - lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) - # TODO(thom): Add z_loss - # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 - - if not return_dict: - output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs - return ((loss, ) + output) if loss is not None else output - - return Seq2SeqLMOutput( - loss=loss, - logits=lm_logits, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs): - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'decoder_input_ids': input_ids, - 'past_key_values': past, - 'encoder_outputs': encoder_outputs, - 'attention_mask': attention_mask, - 'head_mask': head_mask, - 'decoder_head_mask': decoder_head_mask, - 'cross_attn_head_mask': cross_attn_head_mask, - 'use_cache': use_cache, - } - - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return self._shift_right(labels) - - def _reorder_cache(self, past, beam_idx): - # if decoder past is not included in output - # speedy decoding is disabled and no need to reorder - if past is None: - logger.warning( - 'You might want to consider setting `use_cache=True` to speed up decoding' - ) - return past - - reordered_decoder_past = () - for layer_past_states in past: - # get the correct batch idx from layer past batch dim - # batch dim of `past` is at 2nd position - reordered_layer_past_states = () - for layer_past_state in layer_past_states: - # need to set correct `past` for each of the four key / value states - reordered_layer_past_states = reordered_layer_past_states + ( - layer_past_state.index_select( - 0, beam_idx.to(layer_past_state.device)), ) - - assert reordered_layer_past_states[0].shape == layer_past_states[ - 0].shape - assert len(reordered_layer_past_states) == len(layer_past_states) - - reordered_decoder_past = reordered_decoder_past + ( - reordered_layer_past_states, ) - return reordered_decoder_past - - -@add_start_docstrings( - "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", - T5_START_DOCSTRING, -) -class T5EncoderModel(T5PreTrainedModel): - authorized_missing_keys = [ - r'encoder\.embed_tokens\.weight', - ] - - def __init__(self, config: 
T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of - heads to prune in this layer} See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]: - r""" - Returns: - - Example: - - ```python - >>> from transformers import T5Tokenizer, T5EncoderModel - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5EncoderModel.from_pretrained("t5-small") - >>> input_ids = tokenizer( - ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids) - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - return encoder_outputs diff --git a/modelscope/models/nlp/T5/configuration_t5.py b/modelscope/models/nlp/T5/configuration.py similarity index 99% rename from modelscope/models/nlp/T5/configuration_t5.py rename to modelscope/models/nlp/T5/configuration.py index 117a6bc1..1f9a965e 100644 --- a/modelscope/models/nlp/T5/configuration_t5.py +++ b/modelscope/models/nlp/T5/configuration.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2020, The T5 Authors and HuggingFace Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/T5/t5_for_text_generation.py b/modelscope/models/nlp/T5/t5_for_text_generation.py deleted file mode 100644 index 27f077d8..00000000 --- a/modelscope/models/nlp/T5/t5_for_text_generation.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Optional, Tuple - -import torch - -from modelscope.metainfo import Models -from modelscope.models.base import Tensor, TorchModel -from modelscope.models.builder import MODELS -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from .modeling_t5 import T5Config -from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration - - -@MODELS.register_module( - group_key=Tasks.text2text_generation, - module_name=Models.T5, -) -class T5ForConditionalGeneration(TorchModel): - - def __init__(self, model_dir=None, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__(model_dir, *args, **kwargs) - self.model = T5ForGeneration.from_pretrained(model_dir) - self.generate = self.model.generate - self.config = self.model.config - - def forward(self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs): - return self.model.forward( - self, input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, head_mask, decoder_head_mask, - cross_attn_head_mask, encoder_outputs, past_key_values, - inputs_embeds, decoder_inputs_embeds, labels, use_cache, - output_attentions, output_hidden_states, return_dict, **kwargs) diff --git a/modelscope/models/nlp/T5/text2text_generation.py b/modelscope/models/nlp/T5/text2text_generation.py new file mode 100644 index 00000000..c4dcdfdb --- /dev/null +++ b/modelscope/models/nlp/T5/text2text_generation.py @@ -0,0 +1,455 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import warnings +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.utils.model_parallel_utils import (assert_device_map, + get_device_map) + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import BaseModelOutput, Seq2SeqLMOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .backbone import T5PreTrainedModel, T5Stack +from .configuration import T5Config + +logger = get_logger(__name__) + +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@MODELS.register_module( + group_key=Tasks.text2text_generation, + module_name=Models.T5, +) +class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + r'lm_head\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.decoder.first_device) + self.model_parallel = True + + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.lm_head = self.lm_head.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + 
return self.decoder + + def forward(self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model + with relative position embeddings so you should be able to pad the + inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a + look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for + `decoder_input_ids` generation. If `past_key_values` is used, + optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining + take a look at [T5 Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in + `decoder_input_ids`. Causal mask will also be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in + the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, + `optional`: *attentions*) `last_hidden_state` of shape `(batch_size, + sequence_length, hidden_size)` is a sequence of hidden states at the + output of the last layer of the encoder. Used in the cross-attention + of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to + directly pass an embedded representation. If `past_key_values` is + used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more + control over how to convert `decoder_input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, + `decoder_inputs_embeds` takes the value of `inputs_embeds`. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain + tuple. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. 
All + labels set to `-100` are ignored (masked), the loss is only computed + for labels in `[0, ..., config.vocab_size]` + + Returns: + + Examples: + + ```python >>> from transformers import T5Tokenizer, + T5ForConditionalGeneration + + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5ForConditionalGeneration.from_pretrained("t5-small") + + >>> # training + >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids + >>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids + >>> outputs = model(input_ids=input_ids, labels=labels) + >>> loss = outputs.loss + >>> logits = outputs.logits + + >>> # inference + >>> input_ids = tokenizer( + ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" + >>> ).input_ids # Batch size 1 + >>> outputs = model.generate(input_ids) + >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + >>> # studies have shown that owning a dog is good for you. + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + # Convert encoder inputs in embeddings if needed + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] + if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to( + self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Set device for model 
parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct( + lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs + return ((loss, ) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'decoder_input_ids': input_ids, + 'past_key_values': past, + 'encoder_outputs': encoder_outputs, + 'attention_mask': attention_mask, + 'head_mask': head_mask, + 'decoder_head_mask': decoder_head_mask, + 'cross_attn_head_mask': cross_attn_head_mask, + 'use_cache': use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning( + 'You might want to consider setting `use_cache=True` to speed up decoding' + ) + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select( + 0, beam_idx.to(layer_past_state.device)), ) + + assert reordered_layer_past_states[0].shape == layer_past_states[ + 0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + ( + reordered_layer_past_states, ) + return reordered_decoder_past diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 57222698..dff42d1c 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -4,80 +4,99 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .backbones import SbertModel - from 
.bart_for_text_error_correction import BartForTextErrorCorrection - from .bert_for_document_segmentation import BertForDocumentSegmentation - from .csanmt_for_translation import CsanmtForTranslation + from .bart import BartForTextErrorCorrection + from .csanmt import CsanmtForTranslation from .heads import SequenceClassificationHead from .gpt3 import GPT3ForTextGeneration - from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, - BertForMaskedLM, DebertaV2ForMaskedLM) - from .ponet_for_masked_language import PoNetForMaskedLM - from .nncrf_for_named_entity_recognition import ( - TransformerCRFForNamedEntityRecognition, - LSTMCRFForNamedEntityRecognition) from .palm_v2 import PalmForTextGeneration - from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering - from .star_text_to_sql import StarForTextToSql - from .sequence_classification import (VecoForSequenceClassification, - SbertForSequenceClassification, - BertForSequenceClassification) - from .space import SpaceForDialogIntent - from .space import SpaceForDialogModeling - from .space import SpaceForDialogStateTracking - from .table_question_answering import TableQuestionAnswering - from .task_models import (FeatureExtractionModel, - InformationExtractionModel, - SequenceClassificationModel, - SingleBackboneTaskModelBase, - TokenClassificationModel, - TaskModelForTextGeneration) - from .token_classification import SbertForTokenClassification - from .sentence_embedding import SentenceEmbedding - from .text_ranking import TextRanking - from .T5 import T5ForConditionalGeneration + from .space_T_en import StarForTextToSql + from .space_T_cn import TableQuestionAnswering + from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST + from .ponet import PoNetForMaskedLM, PoNetModel, PoNetConfig + from .structbert import ( + SbertForFaqQuestionAnswering, + SbertForMaskedLM, + SbertForSequenceClassification, + SbertForTokenClassification, + SbertTokenizer, + SbertTokenizerFast, + ) + from .bert import ( + BertForMaskedLM, + BertForTextRanking, + BertForSentenceEmbedding, + BertForSequenceClassification, + BertForTokenClassification, + BertForDocumentSegmentation, + BertModel, + BertConfig, + ) + from .veco import VecoModel, VecoConfig, VecoForTokenClassification, \ + VecoForSequenceClassification, VecoForMaskedLM, VecoTokenizer, VecoTokenizerFast + from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model + from .task_models import ( + FeatureExtractionModel, + InformationExtractionModel, + LSTMCRFForNamedEntityRecognition, + SequenceClassificationModel, + SingleBackboneTaskModelBase, + TaskModelForTextGeneration, + TokenClassificationModel, + TransformerCRFForNamedEntityRecognition, + ) + from .T5 import T5ForConditionalGeneration + from .gpt_neo import GPTNeoModel else: _import_structure = { 'backbones': ['SbertModel'], - 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], - 'csanmt_for_translation': ['CsanmtForTranslation'], + 'bart': ['BartForTextErrorCorrection'], + 'csanmt': ['CsanmtForTranslation'], 'heads': ['SequenceClassificationHead'], 'gpt3': ['GPT3ForTextGeneration'], - 'masked_language': [ - 'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM', - 'DebertaV2ForMaskedLM' + 'structbert': [ + 'SbertForFaqQuestionAnswering', + 'SbertForMaskedLM', + 'SbertForSequenceClassification', + 'SbertForTokenClassification', + 'SbertTokenizer', + 'SbertTokenizerFast', ], - 
'nncrf_for_named_entity_recognition': [ - 'TransformerCRFForNamedEntityRecognition', - 'LSTMCRFForNamedEntityRecognition' - ], - 'ponet_for_masked_language': ['PoNetForMaskedLM'], - 'palm_v2': ['PalmForTextGeneration'], - 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], - 'star_text_to_sql': ['StarForTextToSql'], - 'sequence_classification': [ - 'VecoForSequenceClassification', 'SbertForSequenceClassification', - 'BertForSequenceClassification' + 'veco': [ + 'VecoModel', 'VecoConfig', 'VecoForTokenClassification', + 'VecoForSequenceClassification', 'VecoForMaskedLM', + 'VecoTokenizer', 'VecoTokenizerFast' ], - 'space': [ - 'SpaceForDialogIntent', 'SpaceForDialogModeling', - 'SpaceForDialogStateTracking' + 'bert': [ + 'BertForMaskedLM', + 'BertForTextRanking', + 'BertForSentenceEmbedding', + 'BertForSequenceClassification', + 'BertForTokenClassification', + 'BertForDocumentSegmentation', + 'BertModel', + 'BertConfig', ], + 'ponet': ['PoNetForMaskedLM', 'PoNetModel', 'PoNetConfig'], + 'palm_v2': ['PalmForTextGeneration'], + 'deberta_v2': ['DebertaV2ForMaskedLM', 'DebertaV2Model'], + 'space_T_en': ['StarForTextToSql'], + 'space_T_cn': ['TableQuestionAnswering'], + 'space': + ['SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDST'], 'task_models': [ 'FeatureExtractionModel', 'InformationExtractionModel', + 'LSTMCRFForNamedEntityRecognition', 'SequenceClassificationModel', 'SingleBackboneTaskModelBase', - 'TokenClassificationModel', 'TaskModelForTextGeneration', + 'TokenClassificationModel', + 'TransformerCRFForNamedEntityRecognition', ], - 'token_classification': ['SbertForTokenClassification'], - 'table_question_answering': ['TableQuestionAnswering'], 'sentence_embedding': ['SentenceEmbedding'], - 'text_ranking': ['TextRanking'], 'T5': ['T5ForConditionalGeneration'], + 'gpt_neo': ['GPTNeoModel'], } import sys diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py deleted file mode 100644 index aa513944..00000000 --- a/modelscope/models/nlp/backbones/bert.py +++ /dev/null @@ -1,7 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.models.nlp.bert import BertModel -from modelscope.utils.constant import Fields - -BACKBONES.register_module( - group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel) diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py deleted file mode 100644 index 74735520..00000000 --- a/modelscope/models/nlp/backbones/structbert.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import BACKBONES -from modelscope.models.nlp.structbert import SbertConfig -from modelscope.models.nlp.structbert import SbertModel as SbertModelTransform -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger - -logger = get_logger(__name__) - - -@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) -class SbertModel(TorchModel, SbertModelTransform): - - def __init__(self, model_dir=None, add_pooling_layer=True, **config): - """ - Args: - model_dir (str, optional): The model checkpoint directory. Defaults to None. - add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. 
- """ - config = SbertConfig(**config) - super().__init__(model_dir) - self.config = config - SbertModelTransform.__init__(self, config, add_pooling_layer) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return SbertModelTransform.forward( - self, input_ids, attention_mask, token_type_ids, position_ids, - head_mask, inputs_embeds, encoder_hidden_states, - encoder_attention_mask, past_key_values, use_cache, - output_attentions, output_hidden_states, return_dict, **kwargs) diff --git a/modelscope/models/nlp/bart/__init__.py b/modelscope/models/nlp/bart/__init__.py new file mode 100644 index 00000000..31912efc --- /dev/null +++ b/modelscope/models/nlp/bart/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .text_error_correction import BartForTextErrorCorrection diff --git a/modelscope/models/nlp/bart_for_text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py similarity index 100% rename from modelscope/models/nlp/bart_for_text_error_correction.py rename to modelscope/models/nlp/bart/text_error_correction.py diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py index cca79c2f..28a10f57 100644 --- a/modelscope/models/nlp/bert/__init__.py +++ b/modelscope/models/nlp/bert/__init__.py @@ -4,43 +4,33 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .modeling_bert import ( - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, + from .backbone import ( BertLayer, - BertLMHeadModel, BertModel, BertPreTrainedModel, - load_tf_weights_in_bert, ) - - from .configuration_bert import BertConfig, BertOnnxConfig - + from .configuration import BertConfig + from .fill_mask import BertForMaskedLM + from .text_ranking import BertForTextRanking + from .sentence_embedding import BertForSentenceEmbedding + from .text_classification import BertForSequenceClassification + from .token_classification import BertForTokenClassification + from .document_segmentation import BertForDocumentSegmentation else: _import_structure = { - 'configuration_bert': ['BertConfig', 'BertOnnxConfig'], + 'backbone': [ + 'BertModel', + 'BertPreTrainedModel', + ], + 'configuration': ['BertConfig'], + 'fill_mask': ['BertForMaskedLM'], + 'text_ranking': ['BertForTextRanking'], + 'sentence_embedding': ['BertForSentenceEmbedding'], + 'text_classification': ['BertForSequenceClassification'], + 'token_classification': ['BertForTokenClassification'], + 'document_segmentation': ['BertForDocumentSegmentation'], } - _import_structure['modeling_bert'] = [ - 'BertForMaskedLM', - 'BertForMultipleChoice', - 'BertForNextSentencePrediction', - 'BertForPreTraining', - 'BertForQuestionAnswering', - 'BertForSequenceClassification', - 'BertForTokenClassification', - 'BertLayer', - 'BertLMHeadModel', - 'BertModel', - 'BertPreTrainedModel', - 'load_tf_weights_in_bert', - ] - import sys sys.modules[__name__] = 
LazyImportModule( diff --git a/modelscope/models/nlp/bert/backbone.py b/modelscope/models/nlp/bert/backbone.py new file mode 100755 index 00000000..df0aebd2 --- /dev/null +++ b/modelscope/models/nlp/bert/backbone.py @@ -0,0 +1,952 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from transformers.activations import ACT2FN +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import (BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions) +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.logger import get_logger +from .configuration import BertConfig + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'BertConfig' + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model + # variable name and be able to load any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and + # exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + 
past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor + # where it is all zeros, which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing + # token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
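+        # Four cases are handled below:
+        #   1) cross-attention with cached key/values -> reuse the cached
+        #      projections of the encoder states;
+        #   2) cross-attention without a cache -> project encoder_hidden_states;
+        #   3) cached self-attention (incremental decoding) -> project the new
+        #      hidden_states and concatenate them with the cached states;
+        #   4) plain self-attention -> project the current hidden_states only.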
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all + # cross attention key/value_states. Further calls to cross_attention + # layer can then reuse all cross-attention key/value_states (first + # "if" case) if uni-directional self-attention (decoder) save + # Tuple(torch.Tensor, torch.Tensor) of all previous decoder + # key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected + # key/value_states (third "elif" case) if encoder bi-directional + # self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention( + config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states 
= self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = BertAttention( + config, position_embedding_type='absolute') + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' + f'with cross-attention layers by setting `config.add_cross_attention=True`' + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
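+
+    Weight initialization and checkpoint loading follow
+    `transformers.PreTrainedModel`, while `TorchModel` provides the ModelScope
+    side of the interface. `_instantiate` additionally resolves
+    `label2id`/`id2label`/`num_labels` from the model directory when they are
+    not passed in explicitly. A minimal, hypothetical subclass sketch
+    (illustration only, not part of this file):
+
+        >>> class BertForMyTask(BertPreTrainedModel):
+        ...     def __init__(self, config):
+        ...         super().__init__(config)
+        ...         self.bert = BertModel(config)
+        ...         self.post_init()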
+ """ + + config_class = BertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir', None) + if model_dir is None: + config = BertConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model + + +@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.bert) +class BertModel(BertPreTrainedModel): + """The Bert Model transformer outputting raw hidden-states without any + specific head on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the + parameters of the model. + Initializing with a config file does not load the weights associated + with the model, only the configuration. 
Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + The model can behave as an encoder (with only self-attention) as well as a + decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam + Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`. To be used in a + Seq2Seq model, the model needs to be initialized with both the `is_decoder` + argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` + is then expected as an input to the forward pass. + + + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): + config = BertConfig(**config) + model = cls(config, add_pooling_layer) + return model + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`.
+ + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + Others (**kwargs): + Additional parameters that may be passed in from an upstream + pipeline; they do not influence the results.
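+
+            Examples:
+                A minimal usage sketch; the checkpoint id below is borrowed from
+                the fill-mask example in this directory and is assumed to
+                resolve to this backbone, so it may need to be replaced with a
+                real backbone checkpoint.
+
+                >>> from modelscope.models import Model
+                >>> from modelscope.preprocessors import Preprocessor
+                >>> model = Model.from_pretrained('damo/nlp_bert_backbone_base_std')
+                >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_backbone_base_std')
+                >>> outputs = model(**preprocessor('This is a test'))
+                >>> sequence_output = model.extract_sequence_outputs(outputs)
+                >>> pooled_output = model.extract_pooled_outputs(outputs)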
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration.py similarity index 99% rename from modelscope/models/nlp/bert/configuration_bert.py rename to modelscope/models/nlp/bert/configuration.py index 2c9293ec..1e2cef95 100644 --- a/modelscope/models/nlp/bert/configuration_bert.py +++ b/modelscope/models/nlp/bert/configuration.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
# diff --git a/modelscope/models/nlp/bert_for_document_segmentation.py b/modelscope/models/nlp/bert/document_segmentation.py similarity index 99% rename from modelscope/models/nlp/bert_for_document_segmentation.py rename to modelscope/models/nlp/bert/document_segmentation.py index dfa57597..b46c77e4 100644 --- a/modelscope/models/nlp/bert_for_document_segmentation.py +++ b/modelscope/models/nlp/bert/document_segmentation.py @@ -2,6 +2,7 @@ from typing import Any, Dict +import torch from torch import nn from torch.nn import CrossEntropyLoss from transformers.modeling_outputs import TokenClassifierOutput diff --git a/modelscope/models/nlp/bert/fill_mask.py b/modelscope/models/nlp/bert/fill_mask.py new file mode 100644 index 00000000..4f81f62d --- /dev/null +++ b/modelscope/models/nlp/bert/fill_mask.py @@ -0,0 +1,299 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel +from .configuration import BertConfig + +logger = logging.get_logger(__name__) + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) +class BertForMaskedLM(BertPreTrainedModel): + r"""Bert Model with a `language modeling` head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of BERT, the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config (:class:`~modelscope.models.nlp.bert.BertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights.
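+
+    This class is registered to ``Tasks.fill_mask`` under the ``bert`` model
+    type via the ``@MODELS.register_module`` decorator above, which lets it be
+    resolved from a checkpoint's configuration. A minimal pipeline sketch
+    (illustration only; the task string and checkpoint id are assumptions,
+    reusing the id from the ``forward`` example below):
+
+        >>> from modelscope.pipelines import pipeline
+        >>> fill_mask = pipeline(task='fill-mask', model='damo/nlp_bert_backbone_base_std')
+        >>> print(fill_mask('This is a [MASK] test.'))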
+ """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: BertConfig, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. Indices + should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` + docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_bert_backbone_base_std') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_backbone_base_std') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError('The PAD token should be defined for generation') + + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py deleted file mode 100755 index 7c1dfcf5..00000000 --- a/modelscope/models/nlp/bert/modeling_bert.py +++ /dev/null @@ -1,1961 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. """ - -import math -import warnings -from dataclasses import dataclass -from typing import Optional, Tuple - -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - MultipleChoiceModelOutput, NextSentencePredictorOutput, - QuestionAnsweringModelOutput, SequenceClassifierOutput, - TokenClassifierOutput) -from transformers.modeling_utils import (PreTrainedModel, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.utils.logger import get_logger -from .configuration_bert import BertConfig - -logger = get_logger(__name__) - -_CONFIG_FOR_DOC = 'BertConfig' - - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model - # variable name and be able to load any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and - # exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - self.register_buffer( - 'position_ids', - torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros(self.position_ids.size(), dtype=torch.long), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor - # where it is all zeros, which usually occurs when its auto-generated, - # registered buffer helps users when tracing the model without passing - # token_type_ids, solves issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = 
self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - - def __init__(self, config, position_embedding_type=None): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, 'position_embedding_type', 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all - # cross attention key/value_states. Further calls to cross_attention - # layer can then reuse all cross-attention key/value_states (first - # "if" case) if uni-directional self-attention (decoder) save - # Tuple(torch.Tensor, torch.Tensor) of all previous decoder - # key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected - # key/value_states (third "elif" case) if encoder bi-directional - # self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
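The code above is standard scaled dot-product attention with an additive mask; a self-contained sketch with illustrative shapes (relative-position terms omitted):

```python
import math

import torch

batch, heads, seq, head_size = 2, 2, 4, 8
q, k, v = (torch.randn(batch, heads, seq, head_size) for _ in range(3))

mask = torch.zeros(batch, 1, 1, seq)
mask[..., -1] = -10000.0                      # pretend the last position is padding

scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_size)
probs = (scores + mask).softmax(dim=-1)       # each query's weights sum to 1
context = torch.matmul(probs, v)              # (batch, heads, seq, head_size)
```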
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class BertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - - def __init__(self, config, position_embedding_type=None): - super().__init__() - self.self = BertSelfAttention( - config, position_embedding_type=position_embedding_type) - self.output = BertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class BertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states 
= self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = BertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = BertAttention( - config, position_embedding_type='absolute') - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' - f'with cross-attention layers by setting `config.add_cross_attention=True`' - ) - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class BertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [BertLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - 
attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class BertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
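What the comment above describes, in isolation: pooling is a dense layer plus tanh applied to the first ([CLS]) position. A toy sketch:

```python
import torch
from torch import nn

hidden_size = 16                                  # toy size
hidden_states = torch.randn(2, 7, hidden_size)    # (batch, seq, hidden)

dense = nn.Linear(hidden_size, hidden_size)
pooled_output = torch.tanh(dense(hidden_states[:, 0]))   # (batch, hidden)
```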
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface - for downloading and loading pretrained models. 
- """ - - config_class = BertConfig - base_model_prefix = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, BertEncoder): - module.gradient_checkpointing = value - - -@dataclass -class BertForPreTrainingOutput(ModelOutput): - """ - Output type of [`BertForPreTraining`]. - - Args: - loss (*optional*, returned when `labels` is provided, - `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the - next sequence prediction (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, - sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each - vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, - 2)`): - Prediction scores of the next sequence prediction (classification) - head (scores of True/False continuation before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + - one for the output of each layer) of shape `(batch_size, - sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -BERT_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass - documentation for the generic methods the library implements for all its - model (such as downloading or saving, resizing the input embeddings, pruning - heads etc.) - - This model is also a PyTorch - [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) - subclass. Use it as a regular PyTorch Module and refer to the PyTorch - documentation for all matter related to general usage and behavior. - - Parameters: - config ([`BertConfig`]): Model configuration class with all the - parameters of the model. 
- Initializing with a config file does not load the weights associated - with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model - weights. -""" - -BERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using [`BertTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the - inputs. Indices are selected in `[0, 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position - embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, - num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask - values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, - *optional*): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you want - more control over how to convert `input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a - plain tuple. -""" - - -@add_start_docstrings( - 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', - BERT_START_DOCSTRING, -) -class BertModel(BertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a - decoder, in which case a layer of cross-attention is added between the - self-attention layers, following the architecture described in [Attention is - all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam - Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz - Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - `is_decoder` argument of the configuration set to `True`. To be used in a - Seq2Seq model, the model needs to initialized with both `is_decoder` - argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` - is then expected as an input to the forward pass. 
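A hedged sketch of that decoder setup with a deliberately tiny configuration (sizes are illustrative, not a shipped checkpoint):

```python
>>> import torch
>>> config = BertConfig(hidden_size=64, num_hidden_layers=2,
...                     num_attention_heads=2, intermediate_size=128,
...                     is_decoder=True, add_cross_attention=True)
>>> decoder = BertModel(config)
>>> encoder_states = torch.randn(1, 6, 64)
>>> out = decoder(input_ids=torch.randint(0, config.vocab_size, (1, 4)),
...               encoder_hidden_states=encoder_states)
>>> out.last_hidden_state.shape
torch.Size([1, 4, 64])
```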
- """ - - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - - self.pooler = BertPooler(config) if add_pooling_layer else None - - # Initialize weights and apply final processing - self.post_init() - - @classmethod - def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): - config = BertConfig(**config) - model = cls(config, add_pooling_layer) - return model - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the - encoder. Used in the cross-attention if the model is configured as a - decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of - the encoder input. This mask is used in the cross-attention if the - model is configured as a decoder. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length - `config.n_layers` with each tuple having 4 tensors of shape - `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention - blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only - the last `decoder_input_ids` (those that don't have their past key - value states given to this model) of shape `(batch_size, 1)` instead - of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned - and can be used to speed up decoding (see `past_key_values`). 
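To make `use_cache`/`past_key_values` concrete, a toy incremental-decoding sketch (random weights, illustrative sizes):

```python
>>> import torch
>>> config = BertConfig(hidden_size=64, num_hidden_layers=2,
...                     num_attention_heads=2, intermediate_size=128,
...                     is_decoder=True)
>>> model = BertModel(config)
>>> step1 = model(input_ids=torch.randint(0, config.vocab_size, (1, 3)),
...               use_cache=True)
>>> len(step1.past_key_values), len(step1.past_key_values[0])
(2, 2)
>>> # feed only the newest token, reusing the cached key/value states
>>> step2 = model(input_ids=torch.randint(0, config.vocab_size, (1, 1)),
...               past_key_values=step1.past_key_values, use_cache=True)
>>> step2.last_hidden_state.shape
torch.Size([1, 1, 64])
```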
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
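Roughly what `get_extended_attention_mask` produces for the common 2D case (the real helper lives in `transformers.PreTrainedModel` and may use the dtype minimum rather than the -10000.0 shown here):

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])                  # 0 marks padding
extended = attention_mask[:, None, None, :].to(torch.float32)  # (batch, 1, 1, seq)
extended = (1.0 - extended) * -10000.0                         # large negative bias at pads
```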
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - -@add_start_docstrings( - """ - Bert Model with two heads on top as done during the pretraining: a `masked - language modeling` head and a `next sentence prediction (classification)` - head. 
- """, - BERT_START_DOCSTRING, -) -class BertForPreTraining(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the masked language modeling loss. Indices - should be in `[-100, 0, ..., config.vocab_size]` (see - `input_ids` docstring) Tokens with indices set to `-100` are - ignored (masked), the loss is only computed for the tokens with - labels in `[0, ..., config.vocab_size]` - next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, - *optional*): - Labels for computing the next sequence prediction - (classification) loss. Input should be a sequence pair (see - `input_ids` docstring) Indices should be in `[0, 1]`: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (`Dict[str, any]`, optional, defaults to *{}*): - Used to hide legacy arguments that have been deprecated. 
- - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, BertForPreTraining - >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits - ``` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return BertForPreTrainingOutput( - loss=total_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, - BERT_START_DOCSTRING) -class BertLMHeadModel(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.bert = BertModel(config, add_pooling_layer=False) - self.cls = BertOnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the - encoder. Used in the cross-attention if the model is configured - as a decoder. 
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices - of the encoder input. This mask is used in the cross-attention - if the model is configured as a decoder. Mask values selected in - `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the left-to-right language modeling loss - (next word prediction). Indices should be in `[-100, 0, ..., - config.vocab_size]` (see `input_ids` docstring) Tokens with - indices set to `-100` are ignored (masked), the loss is only - computed for the tokens with labels n `[0, ..., - config.vocab_size]` - past_key_values (`tuple(tuple(torch.FloatTensor))` of length - `config.n_layers` with each tuple having 4 tensors of shape - `(batch_size, num_heads, sequence_length - 1, - embed_size_per_head)`): - Contains precomputed key and value hidden states of the - attention blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input - only the last `decoder_input_ids` (those that don't have their - past key value states given to this model) of shape - `(batch_size, 1)` instead of all `decoder_input_ids` of shape - `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are - returned and can be used to speed up decoding (see - `past_key_values`). - - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, BertLMHeadModel, - BertConfig >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') - >>> config = BertConfig.from_pretrained("bert-base-cased") - >>> config.is_decoder = True - >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits - ``` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def 
prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING) -class BertForMaskedLM(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.bert = BertModel(config, add_pooling_layer=False) - self.cls = BertOnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the masked language modeling loss. 
Indices - should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` - docstring) Tokens with indices set to `-100` are ignored (masked), - the loss is only computed for the tokens with labels in `[0, ..., - config.vocab_size]` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - if self.config.pad_token_id is None: - raise ValueError('The PAD token should be defined for generation') - - padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) - attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) - dummy_token = torch.full((effective_batch_size, 1), - self.config.pad_token_id, - dtype=torch.long, - device=input_ids.device) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, -) -class BertForNextSentencePrediction(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the next sequence prediction (classification) - loss. Input should be a sequence pair (see `input_ids` docstring). - Indices should be in `[0, 1]`: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. 
- - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, - BertForNextSentencePrediction >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') - - >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." - >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." - >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') - - >>> outputs = model(**encoding, labels=torch.LongTensor([1])) - >>> logits = outputs.logits - >>> assert logits[0, 0] < logits[0, 1] # next sentence was random - ``` - """ - - if 'next_sentence_label' in kwargs: - warnings.warn( - 'The `next_sentence_label` argument is deprecated, use `labels` instead.', - FutureWarning, - ) - labels = kwargs.pop('next_sentence_label') - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - seq_relationship_scores = self.cls(pooled_output) - - next_sentence_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct( - seq_relationship_scores.view(-1, 2), labels.view(-1)) - - if not return_dict: - output = (seq_relationship_scores, ) + outputs[2:] - return ((next_sentence_loss, ) - + output) if next_sentence_loss is not None else output - - return NextSentencePredictorOutput( - loss=next_sentence_loss, - logits=seq_relationship_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model transformer with a sequence classification/regression head on top - (a linear layer on top of the pooled output) e.g. for GLUE tasks. - """, - BERT_START_DOCSTRING, -) -class BertForSequenceClassification(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.bert = BertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. - Indices should be in `[0, ..., config.num_labels - 1]`. If - `config.num_labels == 1` a regression loss is computed (Mean-Square - loss), If `config.num_labels > 1` a classification loss is computed - (Cross-Entropy). 
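A toy end-to-end call for this classification head (random weights, illustrative sizes):

```python
>>> import torch
>>> config = BertConfig(hidden_size=64, num_hidden_layers=2,
...                     num_attention_heads=2, intermediate_size=128,
...                     num_labels=3)
>>> model = BertForSequenceClassification(config)
>>> out = model(input_ids=torch.randint(0, config.vocab_size, (2, 8)),
...             labels=torch.tensor([0, 2]))
>>> out.logits.shape
torch.Size([2, 3])
```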
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a multiple choice classification head on top (a linear layer - on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. - """, - BERT_START_DOCSTRING, -) -class BertForMultipleChoice(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, 1) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format( - 'batch_size, num_choices, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. - Indices should be in `[0, ..., num_choices-1]` where `num_choices` - is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - """, - BERT_START_DOCSTRING, -) -class BertForTokenClassification(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config, add_pooling_layer=False) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the token classification loss. Indices should - be in `[0, ..., config.num_labels - 1]`. 
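A toy call for the token-classification head; positions can be excluded from the loss either via `attention_mask` or by setting their labels to -100 (the `CrossEntropyLoss` default ignore index):

```python
>>> import torch
>>> config = BertConfig(hidden_size=64, num_hidden_layers=2,
...                     num_attention_heads=2, intermediate_size=128,
...                     num_labels=5)
>>> model = BertForTokenClassification(config)
>>> labels = torch.tensor([[1, 2, 0, -100, -100, -100]])   # last three ignored
>>> out = model(input_ids=torch.randint(0, config.vocab_size, (1, 6)),
...             labels=labels)
>>> out.logits.shape
torch.Size([1, 6, 5])
```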
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a span classification head on top for extractive - question-answering tasks like SQuAD (a linear layers on top of the - hidden-states output to compute `span start logits` and `span end logits`). - """, - BERT_START_DOCSTRING, -) -class BertForQuestionAnswering(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, - *optional*): - Labels for position (index) of the start of the labelled span for - computing the token classification loss. Positions are clamped to - the length of the sequence (`sequence_length`). Position outside of - the sequence are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for - computing the token classification loss. Positions are clamped to - the length of the sequence (`sequence_length`). Position outside of - the sequence are not taken into account for computing the loss. 
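And a toy call for the question-answering head (random weights; start/end positions are arbitrary):

```python
>>> import torch
>>> config = BertConfig(hidden_size=64, num_hidden_layers=2,
...                     num_attention_heads=2, intermediate_size=128,
...                     num_labels=2)
>>> model = BertForQuestionAnswering(config)
>>> out = model(input_ids=torch.randint(0, config.vocab_size, (1, 10)),
...             start_positions=torch.tensor([2]),
...             end_positions=torch.tensor([5]))
>>> out.start_logits.shape, out.end_logits.shape
(torch.Size([1, 10]), torch.Size([1, 10]))
```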
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/bert/sentence_embedding.py b/modelscope/models/nlp/bert/sentence_embedding.py new file mode 100644 index 00000000..f4c2620e --- /dev/null +++ b/modelscope/models/nlp/bert/sentence_embedding.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.builder import MODELS +from modelscope.outputs import BackboneModelOutput +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel + + +@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert) +class BertForSentenceEmbedding(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.config = config + setattr(self, self.base_model_prefix, + BertModel(config, add_pooling_layer=False)) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ) -> BackboneModelOutput: + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
+
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
+
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+                config.max_position_embeddings - 1]``.
+
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+                tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+                more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+        Returns:
+            Returns `modelscope.outputs.BackboneModelOutput`
+
+        Examples:
+            >>> from modelscope.models import Model
+            >>> from modelscope.preprocessors import Preprocessor
+            >>> model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
+            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
+            >>> print(model(**preprocessor('This is a test')))
+        """
+        return self.base_model.forward(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        Args:
+            kwargs: Input args.
+                model_dir: The model dir used to load the checkpoint and the label information.
+
+        Returns:
+            The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
+        """
+        model_dir = kwargs.get('model_dir')
+        model = super(
+            Model,
+            cls).from_pretrained(pretrained_model_name_or_path=model_dir)
+        model.model_dir = model_dir
+        return model
diff --git a/modelscope/models/nlp/bert/text_classification.py b/modelscope/models/nlp/bert/text_classification.py
new file mode 100644
index 00000000..b1d18d0f
--- /dev/null
+++ b/modelscope/models/nlp/bert/text_classification.py
@@ -0,0 +1,208 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionTextClassificationModelOutput
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import Tasks
+from .backbone import BertModel, BertPreTrainedModel
+
+logger = logging.get_logger(__name__)
+
+
+@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.nli, module_name=Models.bert)
+@MODELS.register_module(
+    Tasks.sentiment_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
+@MODELS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.bert)
+class BertForSequenceClassification(BertPreTrainedModel):
+    r"""Bert Model transformer with a sequence classification/regression head on top
+    (a linear layer on top of the pooled output) e.g. for GLUE tasks.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the text classification model of Bert. The preprocessor of this model
+        is `modelscope.preprocessors.SequenceClassificationPreprocessor`.
+
+    Trainer:
+        This model is a normal PyTorch model, and can be trained with various trainers, such as EpochBasedTrainer,
+        NlpEpochBasedTrainer, or trainers from other frameworks.
+        The preferred trainer in ModelScope is NlpEpochBasedTrainer.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
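+
+    Example:
+        A minimal pipeline sketch; the model id below is the one used in the ``forward`` examples of this
+        class, and any checkpoint registered for one of the tasks above can be substituted:
+
+        >>> from modelscope.pipelines import pipeline
+        >>> pipeline_ins = pipeline('sentence-similarity', model='damo/nlp_structbert_sentence-similarity_chinese-base')
+        >>> print(pipeline_ins(('This is a test', 'This is also a test')))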
+ """ + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + setattr(self, self.base_model_prefix, BertModel(config)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
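+                For the `multi_label_classification` problem type (chosen when :obj:`config.num_labels > 1` and the
+                labels are floating point), :obj:`labels` should instead be a multi-hot float tensor of shape
+                :obj:`(batch_size, config.num_labels)`, because the loss is then computed with
+                :obj:`torch.nn.BCEWithLogitsLoss`.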
+ + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.base_model.forward( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/bert/text_ranking.py b/modelscope/models/nlp/bert/text_ranking.py new file mode 100644 index 00000000..79a63045 --- /dev/null +++ b/modelscope/models/nlp/bert/text_ranking.py @@ -0,0 +1,89 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
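+#
+# Note on the training objective of BertForTextRanking below: during training the
+# per-passage logits are reshaped to (train_batch_size, group_size) and a cross-entropy
+# loss is taken against a zero-valued target, i.e. the first passage of every listwise
+# group is assumed to be the positive one; the grouping itself is expected to come from
+# the text-ranking preprocessor. A rough sketch of that loss, with hypothetical shapes:
+#
+#     scores = logits.view(train_batch_size, -1)                 # (queries, passages per query)
+#     target = torch.zeros(train_batch_size, dtype=torch.long)   # positive passage sits at index 0
+#     loss = torch.nn.CrossEntropyLoss()(scores, target)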
+ +import torch +import torch.utils.checkpoint + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel +from .text_classification import BertForSequenceClassification + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) +class BertForTextRanking(BertForSequenceClassification): + + def __init__(self, config, **kwargs): + super().__init__(config) + self.train_batch_size = kwargs.get('train_batch_size', 4) + setattr(self, self.base_model_prefix, + BertModel(self.config, add_pooling_layer=True)) + self.register_buffer( + 'target_label', + torch.zeros(self.train_batch_size, dtype=torch.long)) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs) -> AttentionTextClassificationModelOutput: + outputs = self.base_model.forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if self.base_model.training: + scores = logits.view(self.train_batch_size, -1) + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct(scores, self.target_label) + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + ) + return AttentionTextClassificationModelOutput(logits=logits, ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (1 classes). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + num_labels = kwargs.get('num_labels', 1) + model_args = {} if num_labels is None else {'num_labels': num_labels} + + model_dir = kwargs.get('model_dir') + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_args) + model.model_dir = model_dir + return model diff --git a/modelscope/models/nlp/bert/token_classification.py b/modelscope/models/nlp/bert/token_classification.py new file mode 100644 index 00000000..5dc6b0ce --- /dev/null +++ b/modelscope/models/nlp/bert/token_classification.py @@ -0,0 +1,225 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn import CrossEntropyLoss
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.outputs import TokenClassifierOutput
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import Tasks
+from .backbone import BertModel, BertPreTrainedModel
+
+logger = logging.get_logger(__name__)
+
+
+@MODELS.register_module(Tasks.token_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.part_of_speech, module_name=Models.bert)
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert)
+class BertForTokenClassification(BertPreTrainedModel):
+    r"""Bert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) and word segmentation tasks.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the token classification model of Bert. The preprocessor of this model
+        is `modelscope.preprocessors.TokenClassificationPreprocessor`.
+
+    Trainer:
+        This model is a normal PyTorch model, and can be trained with various trainers, such as EpochBasedTrainer,
+        NlpEpochBasedTrainer, or trainers from other frameworks.
+        The preferred trainer in ModelScope is NlpEpochBasedTrainer.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+    """
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        setattr(self, self.base_model_prefix,
+                BertModel(config, add_pooling_layer=False))
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        offset_mapping=None,
+        label_mask=None,
+    ):
+        r"""
+        Args:
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size,
+            sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+ + Indices can be obtained using + :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and + :meth:`transformers.PreTrainedTokenizer.__call__` for details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in ``[0, 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or + :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert :obj:`input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention + layers. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See + ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` + instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, + `optional`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If + :obj:`config.num_labels == 1` a regression loss is computed + (Mean-Square loss), If :obj:`config.num_labels > 1` a classification + loss is computed (Cross-Entropy). + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
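+
+                Neither :obj:`label_mask` nor :obj:`offset_mapping` takes part in the loss computation; when
+                :obj:`labels` is provided, the loss is only computed over positions where :obj:`attention_mask`
+                equals 1, and :obj:`offset_mapping` is passed through unchanged to the returned output.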
+ + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=offset_mapping, + ) diff --git a/modelscope/models/nlp/csanmt/__init__.py b/modelscope/models/nlp/csanmt/__init__.py new file mode 100644 index 00000000..85531617 --- /dev/null +++ b/modelscope/models/nlp/csanmt/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from .translation import CsanmtForTranslation diff --git a/modelscope/models/nlp/csanmt_for_translation.py b/modelscope/models/nlp/csanmt/translation.py similarity index 100% rename from modelscope/models/nlp/csanmt_for_translation.py rename to modelscope/models/nlp/csanmt/translation.py diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py index 830210ed..08b184e5 100644 --- a/modelscope/models/nlp/deberta_v2/__init__.py +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -22,38 +22,28 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_deberta_v2 import DebertaV2Config - from .tokenization_deberta_v2 import DebertaV2Tokenizer - from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast - - from .modeling_deberta_v2 import ( - DebertaV2ForMaskedLM, - DebertaV2ForMultipleChoice, - DebertaV2ForQuestionAnswering, - DebertaV2ForSequenceClassification, - DebertaV2ForTokenClassification, + from .configuration import DebertaV2Config + from .tokenization import DebertaV2Tokenizer + from .tokenization_fast import DebertaV2TokenizerFast + from .backbone import ( DebertaV2Model, DebertaV2PreTrainedModel, ) + from .fill_mask import DebertaV2ForMaskedLM else: _import_structure = { - 'configuration_deberta_v2': - ['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'], - 'tokenization_deberta_v2': ['DebertaV2Tokenizer'] + 'configuration': ['DebertaV2Config'], + 'tokenization': ['DebertaV2Tokenizer'], + 'tokenization_fast': ['DebertaV2TokenizerFast'], + 'backbone': [ + 'DebertaV2Model', + 'DebertaV2PreTrainedModel', + ], + 'fill_mask': [ + 'DebertaV2ForMaskedLM', + ] } - _import_structure['tokenization_deberta_v2_fast'] = [ - 'DebertaV2TokenizerFast' - ] - _import_structure['modeling_deberta_v2'] = [ - 'DebertaV2ForMaskedLM', - 'DebertaV2ForMultipleChoice', - 'DebertaV2ForQuestionAnswering', - 'DebertaV2ForSequenceClassification', - 'DebertaV2ForTokenClassification', - 'DebertaV2Model', - 'DebertaV2PreTrainedModel', - ] import sys sys.modules[__name__] = LazyImportModule( diff --git a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py b/modelscope/models/nlp/deberta_v2/backbone.py similarity index 64% rename from modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/backbone.py index 1c6b9071..cca38133 100644 --- a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py +++ b/modelscope/models/nlp/deberta_v2/backbone.py @@ -20,28 +20,22 @@ from typing import Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from torch.nn import LayerNorm from transformers.activations import ACT2FN -from transformers.file_utils import (add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward) -from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, - TokenClassifierOutput) +from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import PreTrainedModel from transformers.pytorch_utils import softmax_backward_data +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput from modelscope.utils import 
logger as logging -from .configuration_deberta_v2 import DebertaV2Config +from modelscope.utils.constant import Tasks +from .configuration import DebertaV2Config logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = 'DebertaV2Config' -_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer' -_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite' - # Copied from transformers.models.deberta.modeling_deberta.ContextPooler class ContextPooler(nn.Module): @@ -1006,7 +1000,7 @@ class DebertaV2Embeddings(nn.Module): # Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 -class DebertaV2PreTrainedModel(PreTrainedModel): +class DebertaV2PreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. @@ -1018,6 +1012,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel): _keys_to_ignore_on_load_unexpected = ['position_embeddings'] supports_gradient_checkpointing = True + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): @@ -1037,8 +1035,24 @@ class DebertaV2PreTrainedModel(PreTrainedModel): if isinstance(module, DebertaV2Encoder): module.gradient_checkpointing = value + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = DebertaV2Config(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model + + +@MODELS.register_module(Tasks.backbone, module_name=Models.deberta_v2) +# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 +class DebertaV2Model(DebertaV2PreTrainedModel): + """The bare DeBERTa_v2 Model transformer outputting raw hidden-states without any specific head on top. -DEBERTA_START_DOCSTRING = r""" The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two @@ -1048,65 +1062,13 @@ DEBERTA_START_DOCSTRING = r""" Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - Parameters: - config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model. + config (`DebertaV2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -DEBERTA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, - 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert *input_ids* indices into associated vectors than the - model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.', - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 -class DebertaV2Model(DebertaV2PreTrainedModel): + configuration. + """ - def __init__(self, config): + def __init__(self, config, **kwargs): super().__init__(config) self.embeddings = DebertaV2Embeddings(config) @@ -1130,14 +1092,6 @@ class DebertaV2Model(DebertaV2PreTrainedModel): raise NotImplementedError( 'The prune function is not implemented in DeBERTa model.') - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutput, - config_class=_CONFIG_FOR_DOC, - ) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1148,7 +1102,53 @@ class DebertaV2Model(DebertaV2PreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> Union[Tuple, AttentionBackboneModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`): + Indices of input sequence tokens in the vocabulary. + + attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. 
+ + position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a dataclass instead of a plain tuple. + + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite') + >>> print(model(**preprocessor('这是个测试'))) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else @@ -1216,574 +1216,9 @@ class DebertaV2Model(DebertaV2PreTrainedModel): return (sequence_output, ) + encoder_outputs[ (1 if output_hidden_states else 2):] - return BaseModelOutput( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, attentions=encoder_outputs.attentions, ) - - -@add_start_docstrings( - """DeBERTa Model with a `language modeling` head on top.""", - DEBERTA_START_DOCSTRING) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 -class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - self.deberta = DebertaV2Model(config) - self.cls = DebertaV2OnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: 
Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., - config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the - loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[1:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta -class DebertaV2PredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta -class DebertaV2LMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = DebertaV2PredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta -class DebertaV2OnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = DebertaV2LMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -@add_start_docstrings( - """ - DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2 -class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - num_labels = getattr(config, 'num_labels', 2) - self.num_labels = num_labels - - self.deberta = DebertaV2Model(config) - self.pooler = ContextPooler(config) - output_dim = self.pooler.output_dim - - self.classifier = nn.Linear(output_dim, num_labels) - drop_out = getattr(config, 'cls_dropout', None) - drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.deberta.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.deberta.set_input_embeddings(new_embeddings) - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - encoder_layer = outputs[0] - pooled_output = self.pooler(encoder_layer) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - # regression task - loss_fn = nn.MSELoss() - logits = logits.view(-1).to(labels.dtype) - loss = loss_fn(logits, labels.view(-1)) - elif labels.dim() == 1 or labels.size(-1) == 1: - label_index = (labels >= 0).nonzero() - labels = labels.long() - if label_index.size(0) > 0: - labeled_logits = torch.gather( - logits, 0, - label_index.expand( - label_index.size(0), logits.size(1))) - labels = torch.gather(labels, 0, label_index.view(-1)) - loss_fct = CrossEntropyLoss() - loss = loss_fct( - labeled_logits.view(-1, self.num_labels).float(), - labels.view(-1)) - else: - loss = torch.tensor(0).to(logits) - else: - log_softmax = nn.LogSoftmax(-1) - loss = -((log_softmax(logits) * labels).sum(-1)).mean() - elif self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions) - - -@add_start_docstrings( - """ - DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 -class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.deberta = DebertaV2Model(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions) - - -@add_start_docstrings( - """ - DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2 -class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.deberta = DebertaV2Model(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - start_positions: Optional[torch.Tensor] = None, - end_positions: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. - """, - DEBERTA_START_DOCSTRING, -) -class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - num_labels = getattr(config, 'num_labels', 2) - self.num_labels = num_labels - - self.deberta = DebertaV2Model(config) - self.pooler = ContextPooler(config) - output_dim = self.pooler.output_dim - - self.classifier = nn.Linear(output_dim, 1) - drop_out = getattr(config, 'cls_dropout', None) - drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) - - self.init_weights() - - def get_input_embeddings(self): - return self.deberta.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.deberta.set_input_embeddings(new_embeddings) - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., - num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - flat_input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - flat_position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - flat_token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - flat_attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - flat_inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self.deberta( - flat_input_ids, - position_ids=flat_position_ids, - token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, - inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - encoder_layer = outputs[0] - pooled_output = self.pooler(encoder_layer) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py b/modelscope/models/nlp/deberta_v2/configuration.py similarity index 98% rename from modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/configuration.py index 65e8f0b7..7921ca2f 100644 --- a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py +++ b/modelscope/models/nlp/deberta_v2/configuration.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config""" -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union from transformers import PretrainedConfig diff --git a/modelscope/models/nlp/deberta_v2/fill_mask.py b/modelscope/models/nlp/deberta_v2/fill_mask.py new file mode 100644 index 00000000..ed127d4c --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/fill_mask.py @@ -0,0 +1,230 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.activations import ACT2FN
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionFillMaskModelOutput
+from modelscope.utils.constant import Tasks
+from .backbone import DebertaV2Model, DebertaV2PreTrainedModel
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
+class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
+    r"""DeBERTa-v2 Model with a `language modeling` head on top.
+
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+    and behavior.
+
+    Preprocessor:
+        This is the fill_mask model of DeBERTa-v2; the preprocessor of this model
+        is `modelscope.preprocessors.NLPPreprocessor`.
+
+    Parameters:
+        config (`DebertaV2Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration.
+    """
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config)
+
+        self.deberta = DebertaV2Model(config)
+        self.cls = DebertaV2OnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, AttentionFillMaskModelOutput]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+            token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in
+                `[0, 1]`:
+
+                - 0 corresponds to a *sentence A* token,
+                - 1 corresponds to a *sentence B* token.
+
+            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of positions of each input sequence tokens in the position embeddings.
+                Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert *input_ids* indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+                tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+                more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a dataclass instead of a plain tuple.
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+                config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+                ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+            Returns `modelscope.outputs.AttentionFillMaskModelOutput`
+
+        Examples:
+            >>> from modelscope.models import Model
+            >>> from modelscope.preprocessors import Preprocessor
+            >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
+            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
+            >>> # Call the model, return some tensors
+            >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。')))
+            >>> # Call the pipeline
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor)
+            >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。'))
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[1:]
+            return ((masked_lm_loss, )
+                    + output) if masked_lm_loss is not None else output
+
+        return AttentionFillMaskModelOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            input_ids=input_ids,
+            attentions=outputs.attentions,
+            hidden_states=outputs.hidden_states)
+
+
+# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta
+class DebertaV2PredictionHeadTransform(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta
+class DebertaV2LMPredictionHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = DebertaV2PredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta
+class DebertaV2OnlyMLMHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = DebertaV2LMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py b/modelscope/models/nlp/deberta_v2/tokenization.py
similarity index 100%
rename from modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
rename to modelscope/models/nlp/deberta_v2/tokenization.py
diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py b/modelscope/models/nlp/deberta_v2/tokenization_fast.py
similarity index 99%
rename from modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
rename to modelscope/models/nlp/deberta_v2/tokenization_fast.py
index a1fcecf4..913ea5bd 100644
--- a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
+++ b/modelscope/models/nlp/deberta_v2/tokenization_fast.py
@@ -24,7 +24,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 from modelscope.utils import logger as logging

 if is_sentencepiece_available():
-    from .tokenization_deberta_v2 import DebertaV2Tokenizer
+    from .tokenization import DebertaV2Tokenizer
 else:
     DebertaV2Tokenizer = None

diff --git a/modelscope/models/nlp/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py
index 9cae8cc8..051cc8f2 100644
--- a/modelscope/models/nlp/gpt3/__init__.py
+++ b/modelscope/models/nlp/gpt3/__init__.py
@@ -4,16 +4,16 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule

 if TYPE_CHECKING:
-    from .configuration_gpt3 import GPT3Config
-    from .modeling_gpt3 import GPT3Model
-    from .gpt3_for_text_generation import GPT3ForTextGeneration
-    from .tokenizer_gpt3 import JiebaBPETokenizer
+    from .configuration import GPT3Config
+    from .backbone import GPT3Model
+    from .text_generation import GPT3ForTextGeneration
+    from .tokenizer import JiebaBPETokenizer
 else:
     _import_structure = {
-        'configuration_gpt3': ['GPT3Config'],
-        'modeling_gpt3': ['GPT3Model'],
-        'gpt3_for_text_generation': ['GPT3ForTextGeneration'],
-        'tokenizer_gpt3': ['JiebaBPETokenizer'],
+        'configuration': ['GPT3Config'],
+        'backbone': ['GPT3Model'],
+        'text_generation': ['GPT3ForTextGeneration'],
+        'tokenizer': ['JiebaBPETokenizer'],
     }

     import sys
diff --git
a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/backbone.py similarity index 99% rename from modelscope/models/nlp/gpt3/modeling_gpt3.py rename to modelscope/models/nlp/gpt3/backbone.py index 2c23f5db..587c7a9d 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/backbone.py @@ -24,7 +24,7 @@ from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel from modelscope.utils.constant import ModelFile -from .configuration_gpt3 import GPT3Config +from .configuration import GPT3Config class GPT3SelfAttention(nn.Module): diff --git a/modelscope/models/nlp/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration.py similarity index 100% rename from modelscope/models/nlp/gpt3/configuration_gpt3.py rename to modelscope/models/nlp/gpt3/configuration.py diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/text_generation.py similarity index 100% rename from modelscope/models/nlp/gpt3/gpt3_for_text_generation.py rename to modelscope/models/nlp/gpt3/text_generation.py diff --git a/modelscope/models/nlp/gpt3/tokenizer_gpt3.py b/modelscope/models/nlp/gpt3/tokenizer.py similarity index 100% rename from modelscope/models/nlp/gpt3/tokenizer_gpt3.py rename to modelscope/models/nlp/gpt3/tokenizer.py diff --git a/modelscope/models/nlp/backbones/__init__.py b/modelscope/models/nlp/gpt_neo/__init__.py similarity index 83% rename from modelscope/models/nlp/backbones/__init__.py rename to modelscope/models/nlp/gpt_neo/__init__.py index 749cf995..ef5fdee5 100644 --- a/modelscope/models/nlp/backbones/__init__.py +++ b/modelscope/models/nlp/gpt_neo/__init__.py @@ -4,14 +4,12 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .structbert import SbertModel + from .backbone import GPTNeoModel else: _import_structure = { - 'structbert': ['SbertModel'], + 'backbone': ['GPTNeoModel'], } - import sys - sys.modules[__name__] = LazyImportModule( __name__, globals()['__file__'], diff --git a/modelscope/models/nlp/backbones/gpt_neo.py b/modelscope/models/nlp/gpt_neo/backbone.py similarity index 74% rename from modelscope/models/nlp/backbones/gpt_neo.py rename to modelscope/models/nlp/gpt_neo/backbone.py index a2d0c374..a809bcde 100644 --- a/modelscope/models/nlp/backbones/gpt_neo.py +++ b/modelscope/models/nlp/gpt_neo/backbone.py @@ -4,10 +4,11 @@ from transformers import GPTNeoModel as GPTNeoModelTransform from modelscope.metainfo import Models from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Tasks -@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.gpt_neo) +@BACKBONES.register_module( + group_key=Tasks.backbone, module_name=Models.gpt_neo) class GPTNeoModel(GPTNeoModelTransform): def __init__(self, **kwargs): diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py index 3f19ca67..443f93df 100644 --- a/modelscope/models/nlp/heads/token_classification_head.py +++ b/modelscope/models/nlp/heads/token_classification_head.py @@ -37,9 +37,9 @@ class TokenClassificationHead(TorchHead): sequence_output = inputs sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - return {OutputKeys.LOGITS: logits} + return logits def compute_loss(self, outputs: Dict[str, torch.Tensor], labels) -> Dict[str, torch.Tensor]: 
logits = outputs[OutputKeys.LOGITS] - return {OutputKeys.LOSS: F.cross_entropy(logits, labels)} + return F.cross_entropy(logits, labels) diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py deleted file mode 100644 index b7a890c1..00000000 --- a/modelscope/models/nlp/masked_language.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import \ - BertForMaskedLM as BertForMaskedLMTransformer -from modelscope.models.nlp.deberta_v2 import \ - DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer -from modelscope.models.nlp.structbert import SbertForMaskedLM -from modelscope.models.nlp.veco import \ - VecoForMaskedLM as VecoForMaskedLMTransformer -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM'] - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) -class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): - """Structbert for MLM model. - - Inherited from structbert.SbertForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - SbertForMaskedLM.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = SbertForMaskedLM.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(SbertForMaskedLM, StructBertForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) -class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): - """Bert for MLM model. - - Inherited from transformers.BertForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - BertForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = BertForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(BertForMaskedLMTransformer, - BertForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) -class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): - """Veco for MLM model. - - Inherited from veco.VecoForMaskedLM and TorchModel, so this class can be registered into Model sets. 
- """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - VecoForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = VecoForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(VecoForMaskedLMTransformer, - VecoForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2) -class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer): - """Deberta v2 for MLM model. - - Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - DebertaV2ForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = DebertaV2ForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(DebertaV2ForMaskedLMTransformer, - DebertaV2ForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py index 3a9960ec..45ab6621 100644 --- a/modelscope/models/nlp/palm_v2/__init__.py +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -17,19 +17,19 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_palm import PalmConfig - from .modeling_palm import ( + from .configuration import PalmConfig + from .backbone import ( AbsSummarizer, PalmForConditionalGeneration, Translator, ) - from .palm_for_text_generation import PalmForTextGeneration + from .text_generation import PalmForTextGeneration else: _import_structure = { - 'configuration_palm': ['PalmConfig'], - 'modeling_palm': + 'configuration': ['PalmConfig'], + 'backbone': ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], - 'palm_for_text_generation': ['PalmForTextGeneration'], + 'text_generation': ['PalmForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/backbone.py similarity index 99% rename from modelscope/models/nlp/palm_v2/modeling_palm.py rename to modelscope/models/nlp/palm_v2/backbone.py index f395ebd4..3e0ff805 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/backbone.py @@ -35,7 +35,7 @@ from transformers.activations import ACT2FN from transformers.modeling_utils import PreTrainedModel from modelscope.utils import logger as logging -from .configuration_palm import PalmConfig +from .configuration import PalmConfig from .dureader_eval import compute_bleu_rouge, normalize 
CONFIG_NAME = 'config.json' diff --git a/modelscope/models/nlp/palm_v2/configuration_palm.py b/modelscope/models/nlp/palm_v2/configuration.py similarity index 100% rename from modelscope/models/nlp/palm_v2/configuration_palm.py rename to modelscope/models/nlp/palm_v2/configuration.py diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py similarity index 100% rename from modelscope/models/nlp/palm_v2/palm_for_text_generation.py rename to modelscope/models/nlp/palm_v2/text_generation.py diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py index dbc20751..589a636a 100644 --- a/modelscope/models/nlp/plug/__init__.py +++ b/modelscope/models/nlp/plug/__init__.py @@ -4,13 +4,13 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_plug import PlugNLGConfig - from .modeling_plug import PlugModel + from .configuration import PlugNLGConfig + from .backbone import PlugModel from .distributed_plug import DistributedPlug else: _import_structure = { - 'configuration_plug': ['PlugNLGConfig'], - 'modeling_plug': ['PlugModel'], + 'configuration': ['PlugNLGConfig'], + 'backbone': ['PlugModel'], 'distributed_plug': ['DistributedPlug'], } diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/backbone.py similarity index 99% rename from modelscope/models/nlp/plug/modeling_plug.py rename to modelscope/models/nlp/plug/backbone.py index df00006b..7f3f12de 100644 --- a/modelscope/models/nlp/plug/modeling_plug.py +++ b/modelscope/models/nlp/plug/backbone.py @@ -28,7 +28,7 @@ from torch import nn from modelscope.utils.nlp.distributed import (normal_init_method, scaled_init_method) -from .configuration_plug import PlugNLGConfig, PlugNLUConfig +from .configuration import PlugNLGConfig, PlugNLUConfig logger = logging.getLogger(__name__) diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration.py similarity index 100% rename from modelscope/models/nlp/plug/configuration_plug.py rename to modelscope/models/nlp/plug/configuration.py diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py index 06009ba1..c72e92ba 100644 --- a/modelscope/models/nlp/plug/distributed_plug.py +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Dict @@ -14,7 +15,7 @@ from modelscope.utils.nlp.distributed import initialize_distributed from modelscope.utils.nlp.load_checkpoint import pre_load from modelscope.utils.torch_utils import set_random_seed_mpu from . 
import PlugModel -from .configuration_plug import PlugNLGConfig +from .configuration import PlugNLGConfig logger = get_logger(__name__) diff --git a/modelscope/models/nlp/ponet/__init__.py b/modelscope/models/nlp/ponet/__init__.py index 6d26b194..df996167 100644 --- a/modelscope/models/nlp/ponet/__init__.py +++ b/modelscope/models/nlp/ponet/__init__.py @@ -18,16 +18,16 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_ponet import PoNetConfig - from .modeling_ponet import (PoNetForMaskedLM, PoNetModel, - PoNetPreTrainedModel) - from .tokenization_ponet import PoNetTokenizer + from .configuration import PoNetConfig + from .backbone import (PoNetModel, PoNetPreTrainedModel) + from .tokenization import PoNetTokenizer + from .fill_mask import PoNetForMaskedLM else: _import_structure = { - 'configuration_ponet': ['PoNetConfig'], - 'modeling_ponet': - ['PoNetForMaskedLM', 'PoNetModel', 'PoNetPreTrainedModel'], - 'tokenization_ponet': ['PoNetTokenizer'], + 'configuration': ['PoNetConfig'], + 'backbone': ['PoNetModel', 'PoNetPreTrainedModel'], + 'fill_mask': ['PoNetForMaskedLM'], + 'tokenization': ['PoNetTokenizer'], } import sys diff --git a/modelscope/models/nlp/ponet/modeling_ponet.py b/modelscope/models/nlp/ponet/backbone.py similarity index 55% rename from modelscope/models/nlp/ponet/modeling_ponet.py rename to modelscope/models/nlp/ponet/backbone.py index f37954db..fcc62fa2 100644 --- a/modelscope/models/nlp/ponet/modeling_ponet.py +++ b/modelscope/models/nlp/ponet/backbone.py @@ -16,43 +16,32 @@ """PyTorch PoNet model. """ import math -from dataclasses import dataclass from distutils.version import LooseVersion -from typing import Optional, Tuple import torch import torch.utils.checkpoint from packaging import version from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - SequenceClassifierOutput, TokenClassifierOutput) +from transformers.modeling_outputs import \ + BaseModelOutputWithPastAndCrossAttentions from transformers.modeling_utils import (PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer) -from transformers.models.bert.modeling_bert import \ - load_tf_weights_in_bert as load_tf_weights_in_ponet +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger -from .configuration_ponet import PoNetConfig +from .configuration import PoNetConfig logger = get_logger(__name__) is_pytorch_12plus = LooseVersion(torch.__version__) >= LooseVersion('1.12.0') -_CHECKPOINT_FOR_DOC = 'ponet-base-uncased' -_CONFIG_FOR_DOC = 'PoNetConfig' -_TOKENIZER_FOR_DOC = 'PoNetTokenizer' - CLS_ID = 101 EOS_ID = 102 @@ -609,82 +598,20 @@ class PoNetPooler(nn.Module): return pooled_output -class PoNetPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = 
nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class PoNetLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = PoNetPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class PoNetOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = PoNetLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class PoNetPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = PoNetLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 3) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class PoNetPreTrainedModel(PreTrainedModel): +class PoNetPreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = PoNetConfig - load_tf_weights = load_tf_weights_in_ponet base_model_prefix = 'ponet' _keys_to_ignore_on_load_missing = [r'position_ids'] + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + def _init_weights(self, module): """Initialize the weights""" if isinstance(module, nn.Linear): @@ -703,51 +630,22 @@ class PoNetPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) - -@dataclass -class PoNetForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~transformers.PoNetForPreTraining`. - - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - mlm_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Masked language modeling loss. - sop_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - sop loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states - (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed - or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed - or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - mlm_loss: Optional[torch.FloatTensor] = None - sop_loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = PoNetConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model -PONET_START_DOCSTRING = r""" +@MODELS.register_module(Tasks.backbone, module_name=Models.ponet) +class PoNetModel(PoNetPreTrainedModel): + """The bare PoNet Model transformer outputting raw hidden-states without any specific head on top. This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, @@ -763,65 +661,6 @@ PONET_START_DOCSTRING = r""" Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. -""" - -PONET_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, - 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? 
<../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - config.max_position_embeddings - 1]``. - - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.', - PONET_START_DOCSTRING, -) -class PoNetModel(PoNetPreTrainedModel): - """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is @@ -834,8 +673,8 @@ class PoNetModel(PoNetPreTrainedModel): input to the forward pass. """ - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) + def __init__(self, config, add_pooling_layer=True, **kwargs): + super().__init__(config, **kwargs) self.config = config self.embeddings = PoNetEmbeddings(config) @@ -859,14 +698,6 @@ class PoNetModel(PoNetPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) def forward( self, input_ids=None, @@ -885,6 +716,49 @@ class PoNetModel(PoNetPreTrainedModel): return_dict=None, ): r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -906,6 +780,16 @@ class PoNetModel(PoNetPreTrainedModel): use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). + + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> print(model(**preprocessor('这是个测试'))) """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1006,7 +890,7 @@ class PoNetModel(PoNetPreTrainedModel): if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, @@ -1014,578 +898,3 @@ class PoNetModel(PoNetPreTrainedModel): attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) - - -@add_start_docstrings( - """ - PoNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next - sentence prediction (classification)` head. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForPreTraining(PoNetPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.ponet = PoNetModel(config) - self.cls = PoNetPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=PoNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): - Used to hide legacy arguments that have been deprecated. 
- - Returns: - - Example:: - - >>> from transformers import PoNetTokenizer, PoNetForPreTraining - >>> import torch - - >>> tokenizer = PoNetTokenizer.from_pretrained('ponet-base-uncased') - >>> model = PoNetForPreTraining.from_pretrained('ponet-base-uncased') - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - masked_lm_loss = None - next_sentence_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 3), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss, masked_lm_loss, next_sentence_loss) - + output) if total_loss is not None else output - - return PoNetForPreTrainingOutput( - loss=total_loss, - mlm_loss=masked_lm_loss, - sop_loss=next_sentence_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """PoNet Model with a `language modeling` head on top for CLM fine-tuning. 
""", - PONET_START_DOCSTRING) -class PoNetLMHeadModel(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `PoNetLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.cls = PoNetOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj: - `(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are - ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of shape : - obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- - Returns: - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """PoNet Model with a `language modeling` head on top. 
""", - PONET_START_DOCSTRING) -class PoNetForMaskedLM(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.cls = PoNetOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - segment_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - PoNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled - output) e.g. for GLUE tasks. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForSequenceClassification(PoNetPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.ponet = PoNetModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - PoNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForTokenClassification(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - - 1]``. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/ponet/configuration_ponet.py b/modelscope/models/nlp/ponet/configuration.py similarity index 96% rename from modelscope/models/nlp/ponet/configuration_ponet.py rename to modelscope/models/nlp/ponet/configuration.py index 70294fc2..7dfaba48 100644 --- a/modelscope/models/nlp/ponet/configuration_ponet.py +++ b/modelscope/models/nlp/ponet/configuration.py @@ -34,8 +34,7 @@ class PoNetConfig(PretrainedConfig): Args: vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or - :class:`~transformers.TFBertModel`. + :obj:`inputs_ids` passed. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (:obj:`int`, `optional`, defaults to 12): @@ -55,8 +54,7 @@ class PoNetConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or - :class:`~transformers.TFBertModel`. + The vocabulary size of the :obj:`token_type_ids` passed. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): diff --git a/modelscope/models/nlp/ponet/fill_mask.py b/modelscope/models/nlp/ponet/fill_mask.py new file mode 100644 index 00000000..fb09efc0 --- /dev/null +++ b/modelscope/models/nlp/ponet/fill_mask.py @@ -0,0 +1,252 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .backbone import PoNetModel, PoNetPreTrainedModel + +logger = get_logger(__name__) + + +class PoNetPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class PoNetLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = PoNetPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+class PoNetOnlyMLMHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = PoNetLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet)
+class PoNetForMaskedLM(PoNetPreTrainedModel):
+    r"""PoNet Model with a `language modeling` head on top.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the fill_mask model of PoNet, the preprocessor of this model
+        is `modelscope.preprocessors.FillMaskPoNetPreprocessor`.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.ponet.PoNetConfig`):
+            Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+    """
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+
+        self.ponet = PoNetModel(config, add_pooling_layer=False)
+        self.cls = PoNetOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        segment_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+                details.
+
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
+
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
+
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0,
+                config.max_position_embeddings - 1]``.
+
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+                tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+                more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) diff --git a/modelscope/models/nlp/ponet/tokenization_ponet.py b/modelscope/models/nlp/ponet/tokenization.py similarity index 98% rename from modelscope/models/nlp/ponet/tokenization_ponet.py rename to modelscope/models/nlp/ponet/tokenization.py index 21544886..2da91545 100644 --- a/modelscope/models/nlp/ponet/tokenization_ponet.py +++ b/modelscope/models/nlp/ponet/tokenization.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from transformers.file_utils import PaddingStrategy from transformers.models.bert.tokenization_bert import BertTokenizer +from transformers.tokenization_utils import BatchEncoding, EncodedInput from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger diff --git a/modelscope/models/nlp/ponet_for_masked_language.py b/modelscope/models/nlp/ponet_for_masked_language.py deleted file mode 100644 index 11f4bc11..00000000 --- a/modelscope/models/nlp/ponet_for_masked_language.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
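A quick illustration of the ``-100`` labelling convention used by ``PoNetForMaskedLM.forward`` above; this is a standalone sketch with toy shapes and hypothetical values, not code from this patch:

    import torch
    from torch.nn import CrossEntropyLoss

    # Toy dimensions: batch_size=1, sequence_length=4, vocab_size=6 (hypothetical).
    prediction_scores = torch.randn(1, 4, 6)
    labels = torch.tensor([[-100, 3, -100, 5]])  # only positions 1 and 3 carry labels

    loss_fct = CrossEntropyLoss()  # ignore_index defaults to -100
    masked_lm_loss = loss_fct(prediction_scores.view(-1, 6), labels.view(-1))
    print(masked_lm_loss)  # averaged over the two labelled tokens only

Positions marked ``-100`` contribute nothing to the loss, which matches the flattening of ``prediction_scores`` and ``labels`` performed in the forward method above.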
- -from typing import Any, Dict - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.ponet import \ - PoNetForMaskedLM as PoNetForMaskedLMTransformer -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['PoNetForMaskedLM'] - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet) -class PoNetForMaskedLM(TorchModel, PoNetForMaskedLMTransformer): - """PoNet for MLM model.'. - - Inherited from ponet.PoNetForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - PoNetForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = PoNetForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(PoNetForMaskedLMTransformer, - PoNetForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py deleted file mode 100644 index 340c133f..00000000 --- a/modelscope/models/nlp/sentence_embedding.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
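The ``_instantiate`` method of the removed wrapper above relies on the two-argument form ``super(PoNetForMaskedLMTransformer, PoNetForMaskedLM).from_pretrained(...)``. A minimal sketch of why this works, using hypothetical classes unrelated to the real hierarchy: ``super(X, cls)`` starts the attribute lookup just after ``X`` in ``cls``'s MRO, so the call skips the wrapper-side overrides and resolves on the underlying base-class implementation while still binding ``cls``:

    class Base:
        @classmethod
        def from_pretrained(cls):
            return f'Base.from_pretrained bound to {cls.__name__}'

    class Wrapper(Base):
        @classmethod
        def from_pretrained(cls):
            return 'Wrapper override'

    class Leaf(Wrapper):
        @classmethod
        def load(cls):
            # Lookup starts after Wrapper in Leaf's MRO, so Base.from_pretrained
            # is found, while cls is still Leaf.
            return super(Wrapper, cls).from_pretrained()

    print(Leaf.load())  # -> 'Base.from_pretrained bound to Leaf'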
- -from typing import Any, Dict - -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.utils.constant import Tasks - -__all__ = ['SentenceEmbedding'] - - -@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert) -class SentenceEmbedding(TorchModel, SbertPreTrainedModel): - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=False) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - return self.base_model(**input) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - embs = inputs['last_hidden_state'][:, 0].cpu().numpy() - num_sent = embs.shape[0] - if num_sent >= 2: - scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ], - (1, 0))).tolist()[0] - else: - scores = [] - result = {'text_embedding': embs, 'scores': scores} - - return result - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_args = {} - - return super(SbertPreTrainedModel, SentenceEmbedding).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py deleted file mode 100644 index 156c615c..00000000 --- a/modelscope/models/nlp/sequence_classification.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from abc import abstractmethod - -from torch import nn - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertPreTrainedModel -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.models.nlp.veco import \ - VecoForSequenceClassification as VecoForSequenceClassificationTransform -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) - -__all__ = [ - 'SbertForSequenceClassification', 'VecoForSequenceClassification', - 'BertForSequenceClassification' -] - - -class SequenceClassificationBase(TorchModel): - """A sequence classification base class for all the fitted sequence classification models. 
- """ - base_model_prefix: str = 'bert' - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.num_labels = config.num_labels - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - @abstractmethod - def build_base_model(self): - """Build the backbone model. - - Returns: the backbone instance. - """ - pass - - @property - def base_model(self): - return getattr(self, self.base_model_prefix) - - def forward(self, **kwargs): - labels = None - if OutputKeys.LABEL in kwargs: - labels = kwargs.pop(OutputKeys.LABEL) - elif OutputKeys.LABELS in kwargs: - labels = kwargs.pop(OutputKeys.LABELS) - - outputs = self.base_model.forward(**kwargs) - - # backbone model should return pooled_output as its second output - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def postprocess(self, input, **kwargs): - logits = input[OutputKeys.LOGITS] - probs = torch_nested_numpify(torch_nested_detach(logits.softmax(-1))) - pred = torch_nested_numpify(torch_nested_detach(logits.argmax(-1))) - logits = torch_nested_numpify(torch_nested_detach(logits)) - res = { - OutputKeys.PREDICTIONS: pred, - OutputKeys.PROBABILITIES: probs, - OutputKeys.LOGITS: logits - } - return res - - -@MODELS.register_module( - Tasks.sentence_similarity, module_name=Models.structbert) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.structbert) -@MODELS.register_module(Tasks.nli, module_name=Models.structbert) -@MODELS.register_module( - Tasks.zero_shot_classification, module_name=Models.structbert) -class SbertForSequenceClassification(SequenceClassificationBase, - SbertPreTrainedModel): - """Sbert sequence classification model. - - Inherited from SequenceClassificationBase. - """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - SbertForSequenceClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - labels=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=labels) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - cls.id2label = {id: label for label, id in label2id.items()} - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(SbertPreTrainedModel, - SbertForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.veco) -@MODELS.register_module(Tasks.nli, module_name=Models.veco) -class VecoForSequenceClassification(TorchModel, - VecoForSequenceClassificationTransform): - """Veco sequence classification model. - - Inherited from VecoForSequenceClassification and TorchModel, so this class can be registered into the model set. - This model cannot be inherited from SequenceClassificationBase, because Veco/XlmRoberta's classification structure - is different. - """ - - def __init__(self, config, model_dir): - super().__init__(model_dir) - VecoForSequenceClassificationTransform.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - **kwargs): - return VecoForSequenceClassificationTransform.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - labels=labels) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). - @return: The loaded model, which is initialized by veco.VecoForSequenceClassification.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(VecoForSequenceClassificationTransform, - VecoForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.bert) -@MODELS.register_module(Tasks.nli, module_name=Models.bert) -@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) -class BertForSequenceClassification(SequenceClassificationBase, - BertPreTrainedModel): - """Bert sequence classification model. - - Inherited from SequenceClassificationBase. 
- """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - BertForSequenceClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .bert import BertModel - return BertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(BertPreTrainedModel, - BertForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/space/__init__.py b/modelscope/models/nlp/space/__init__.py index 45f856c1..32713c34 100644 --- a/modelscope/models/nlp/space/__init__.py +++ b/modelscope/models/nlp/space/__init__.py @@ -1,20 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .model import SpaceGenerator - from .model import SpaceModelBase, SpaceTokenizer, SpaceConfig - from .space_for_dialog_intent_prediction import SpaceForDialogIntent - from .space_for_dialog_modeling import SpaceForDialogModeling - from .space_for_dialog_state_tracking import SpaceForDialogStateTracking + from .model import SpaceModelBase, SpaceTokenizer + from .dialog_intent_prediction import SpaceForDialogIntent + from .dialog_modeling import SpaceForDialogModeling + from .dialog_state_tracking import SpaceForDST + from .configuration import SpaceConfig else: _import_structure = { - 'model': - ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer', 'SpaceConfig'], - 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], - 'space_for_dialog_modeling': ['SpaceForDialogModeling'], - 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + 'model': ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer'], + 'dialog_intent_prediction': ['SpaceForDialogIntent'], + 'dialog_modeling': ['SpaceForDialogModeling'], + 'dialog_state_tracking': ['SpaceForDST'], + 'configuration': ['SpaceConfig'] } import sys diff --git a/modelscope/models/nlp/space/model/configuration_space.py b/modelscope/models/nlp/space/configuration.py similarity index 100% rename from modelscope/models/nlp/space/model/configuration_space.py rename to modelscope/models/nlp/space/configuration.py diff --git a/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py b/modelscope/models/nlp/space/dialog_intent_prediction.py similarity index 66% rename from modelscope/models/nlp/space/space_for_dialog_intent_prediction.py rename to modelscope/models/nlp/space/dialog_intent_prediction.py index b93a6d83..79ff01cd 100644 --- a/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py +++ b/modelscope/models/nlp/space/dialog_intent_prediction.py @@ -8,7 +8,7 @@ from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase -from modelscope.preprocessors.space import IntentBPETextField +from modelscope.preprocessors.nlp import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -24,6 +24,10 @@ class SpaceForDialogIntent(TorchModel): Args: model_dir (str): the model path. + text_field (`BPETextField`, *optional*, defaults to `IntentBPETextField`): + The text field. + config (`Config`, *optional*, defaults to config in model hub): + The config. 
""" super().__init__(model_dir, *args, **kwargs) @@ -72,10 +76,21 @@ class SpaceForDialogIntent(TorchModel): Example: { 'pred': array([2.62349960e-03 4.12110658e-03 4.12748595e-05 3.77560973e-05 - 1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04 - 6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01 - 2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32) + 1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04 + 6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01 + 2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32), } + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDialogIntent + >>> from modelscope.preprocessors import DialogIntentPredictionPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-intent-prediction') + >>> preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path) + >>> model = SpaceForDialogIntent( + model_dir=cache_path, + text_field=preprocessor.text_field, + config=preprocessor.config) + >>> print(model(preprocessor("What do I need to do for the card activation?"))) """ import numpy as np pred = self.trainer.forward(input) diff --git a/modelscope/models/nlp/space/space_for_dialog_modeling.py b/modelscope/models/nlp/space/dialog_modeling.py similarity index 73% rename from modelscope/models/nlp/space/space_for_dialog_modeling.py rename to modelscope/models/nlp/space/dialog_modeling.py index efa9b851..16e9dc53 100644 --- a/modelscope/models/nlp/space/space_for_dialog_modeling.py +++ b/modelscope/models/nlp/space/dialog_modeling.py @@ -8,7 +8,7 @@ from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase -from modelscope.preprocessors.space import MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -23,7 +23,12 @@ class SpaceForDialogModeling(TorchModel): """initialize the test generation model from the `model_dir` path. Args: - model_dir (str): the model path. + model_dir (`str`): + The model path. + text_field (`BPETextField`, *optional*, defaults to `MultiWOZBPETextField`): + The text field. + config (`Config`, *optional*, defaults to config in model hub): + The config. 
""" super().__init__(model_dir, *args, **kwargs) @@ -82,6 +87,19 @@ class SpaceForDialogModeling(TorchModel): 'aspn': array([47,8345,32,29,1983]), 'db': array([19, 24, 20]), } + Examples: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDialogModeling + >>> from modelscope.preprocessors import DialogModelingPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-modeling') + >>> preprocessor = DialogModelingPreprocessor(model_dir=cache_path) + >>> model = SpaceForDialogModeling(model_dir=cache_path, + text_field=preprocessor.text_field, + config=preprocessor.config) + >>> print(model(preprocessor({ + 'user_input': 'i would like a taxi from saint john \'s college to pizza hut fen ditton .', + 'history': {} + }))) """ first_turn = input['first_turn'] diff --git a/modelscope/models/nlp/space/model/modeling_space.py b/modelscope/models/nlp/space/dialog_state_tracking.py similarity index 57% rename from modelscope/models/nlp/space/model/modeling_space.py rename to modelscope/models/nlp/space/dialog_state_tracking.py index f093cbc5..9a713a59 100644 --- a/modelscope/models/nlp/space/model/modeling_space.py +++ b/modelscope/models/nlp/space/dialog_state_tracking.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,14 +16,22 @@ # limitations under the License. """PyTorch Space model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" +from typing import Dict + import torch from torch import nn from torch.nn import CrossEntropyLoss from transformers.file_utils import add_start_docstrings +from transformers.modeling_utils import PreTrainedModel -from modelscope.models.nlp.structbert.modeling_sbert import ( - SbertForMaskedLM, SbertModel, SbertPreTrainedModel) -from .configuration_space import SpaceConfig +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import (SbertForMaskedLM, SbertModel, + SbertPreTrainedModel) +from modelscope.utils.constant import Tasks +from .configuration import SpaceConfig SPACE_START_DOCSTRING = r""" @@ -57,6 +65,63 @@ class SpaceModel(SbertModel): config_class = SpaceConfig +class SpacePreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SpaceConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = SpaceConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model + + @add_start_docstrings( """ Space Model transformer with Dialog state tracking heads on top (a inform projection @@ -65,7 +130,9 @@ class SpaceModel(SbertModel): """, SPACE_START_DOCSTRING, ) -class SpaceForDST(SbertPreTrainedModel): +@MODELS.register_module( + Tasks.task_oriented_conversation, module_name=Models.space_dst) +class SpaceForDST(SpacePreTrainedModel): def __init__(self, config): super(SpaceForDST, self).__init__(config) @@ -113,18 +180,105 @@ class SpaceForDST(SbertPreTrainedModel): self.init_weights() - def forward(self, - input_ids, - input_mask=None, - segment_ids=None, - position_ids=None, - head_mask=None, - start_pos=None, - end_pos=None, - inform_slot_id=None, - refer_id=None, - class_label_id=None, - diag_state=None): + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Tensor]: results + Example: + { + 'inputs': dict(input_ids, input_masks,start_pos), # tracking states + 'outputs': dict(slots_logits), + 'unique_ids': str(test-example.json-0), # default value + 'input_ids_unmasked': array([101, 7632, 1010,0,0,0]) + 'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), + 'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), + 'prefix': str('final'), #default value + 'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]) + } + + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDST + >>> from modelscope.preprocessors import DialogStateTrackingPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-state-tracking') + >>> model 
= SpaceForDST.from_pretrained(cache_path) + >>> preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path) + >>> print(model(preprocessor({ + 'utter': { + 'User-1': "Hi, I'm looking for a train that is going" + "to cambridge and arriving there by 20:45, is there anything like that?" + }, + 'history_states': [{}] + }))) + """ + import numpy as np + import torch + + # self.model.eval() ???? + batch = input['batch'] + + features = input['features'] + diag_state = input['diag_state'] + turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]] + reset_diag_state = np.where(np.array(turn_itrs) == '0')[0] + for slot in self.config.dst_slot_list: + for i in reset_diag_state: + diag_state[slot][i] = 0 + + with torch.no_grad(): + inputs = { + 'input_ids': batch[0], + 'input_mask': batch[1], + 'segment_ids': batch[2], + 'start_pos': batch[3], + 'end_pos': batch[4], + 'inform_slot_id': batch[5], + 'refer_id': batch[6], + 'diag_state': diag_state, + 'class_label_id': batch[8] + } + unique_ids = [features[i.item()].guid for i in batch[9]] + values = [features[i.item()].values for i in batch[9]] + input_ids_unmasked = [ + features[i.item()].input_ids_unmasked for i in batch[9] + ] + inform = [features[i.item()].inform for i in batch[9]] + outputs = self._forward(**inputs) + + # Update dialog state for next turn. + for slot in self.config.dst_slot_list: + updates = outputs[2][slot].max(1)[1] + for i, u in enumerate(updates): + if u != 0: + diag_state[slot][i] = u + + return { + 'inputs': inputs, + 'outputs': outputs, + 'unique_ids': unique_ids, + 'input_ids_unmasked': input_ids_unmasked, + 'values': values, + 'inform': inform, + 'prefix': 'final', + 'ds': input['ds'] + } + + def _forward(self, + input_ids, + input_mask=None, + segment_ids=None, + position_ids=None, + head_mask=None, + start_pos=None, + end_pos=None, + inform_slot_id=None, + refer_id=None, + class_label_id=None, + diag_state=None): outputs = self.bert( input_ids, attention_mask=input_mask, @@ -132,8 +286,8 @@ class SpaceForDST(SbertPreTrainedModel): position_ids=position_ids, head_mask=head_mask) - sequence_output = outputs[0] - pooled_output = outputs[1] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output sequence_output = self.dropout(sequence_output) pooled_output = self.dropout(pooled_output) @@ -233,36 +387,6 @@ class SpaceForDST(SbertPreTrainedModel): per_slot_start_logits, per_slot_end_logits, per_slot_refer_logits, - ) + outputs[2:] + ) + (outputs.embedding_output, ) return outputs - - -@add_start_docstrings( - 'The Space Model Model with a `language modeling` head on tops', - SPACE_START_DOCSTRING, -) -class SpaceForMaskedLM(SbertForMaskedLM): - """ - This class overrides [`SbertForMaskedLM`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = SpaceConfig - - -@add_start_docstrings( - """ - Space Model with only one head on top as done during the pretraining: a `masked language modeling` head. 
- """, - SPACE_START_DOCSTRING, -) -class SpaceForPreTraining(SbertPreTrainedModel): - - def __init__(self, model_name_or_path: str): - super(SpaceForPreTraining, self).__init__() - self.bert_model = SpaceForMaskedLM.from_pretrained(model_name_or_path) - - def forward(self, input_ids: torch.tensor, mlm_labels: torch.tensor): - outputs = self.bert_model(input_ids, masked_lm_labels=mlm_labels) - return outputs[0] diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py index bb1d18e4..cfff335d 100644 --- a/modelscope/models/nlp/space/model/__init__.py +++ b/modelscope/models/nlp/space/model/__init__.py @@ -1,10 +1,8 @@ -from .configuration_space import SpaceConfig +# Copyright (c) Alibaba, Inc. and its affiliates. from .gen_unified_transformer import GenUnifiedTransformer from .generator import SpaceGenerator from .intent_unified_transformer import IntentUnifiedTransformer from .model_base import SpaceModelBase -from .modeling_space import (SpaceForDST, SpaceForMaskedLM, - SpaceForPreTraining, SpaceModel) from .tokenization_space import (BasicTokenizer, SpaceTokenizer, WordpieceTokenizer) from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py index 0e7833e6..2e05b545 100644 --- a/modelscope/models/nlp/space/model/generator.py +++ b/modelscope/models/nlp/space/model/generator.py @@ -71,14 +71,11 @@ class SpaceGenerator(object): return def __call__(self, step_fn, state): - """ - Running generation. - - @param : step_fn : decoding one step - @type : function + """Running generation. - @param : state : initial state - @type : dict + Args: + step_fn (`function`) : decoding one step + state(`dict`) : initial state """ raise NotImplementedError @@ -104,11 +101,9 @@ class BeamSearch(SpaceGenerator): """ Running beam search. - @param : step_fn : decoding one step - @type : function - - @param : state : initial state - @type : dict + Args: + step_fn(`function`) : decoding one step + state(`dict`) : initial state """ if prev_input is not None: diff --git a/modelscope/models/nlp/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py index d3d0baa4..b7812182 100644 --- a/modelscope/models/nlp/space/model/model_base.py +++ b/modelscope/models/nlp/space/model/model_base.py @@ -64,8 +64,8 @@ class SpaceModelBase(nn.Module): """ Forward process, include real forward, collect metrices and optimize(optional) - @params : inputs : input data - @type : dict of numpy.ndarray/int/float/... + Args: + inputs(`dict` of numpy.ndarray/int/float/...) : input data """ if is_training: self.train() @@ -85,11 +85,10 @@ class SpaceModelBase(nn.Module): eos_id=None, max_gen_len=None, prev_input=None): - """ - Inference process. + """Inference process. - @params : inputs : input data - @type : dict of numpy.ndarray/int/float/... + Args: + inputs(`dict` of numpy.ndarray/int/float/...) : input data """ self.eval() results = self._infer( diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py index 84712b7b..e3b358d4 100644 --- a/modelscope/models/nlp/space/model/tokenization_space.py +++ b/modelscope/models/nlp/space/model/tokenization_space.py @@ -1,5 +1,5 @@ -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py index b0775541..19069971 100644 --- a/modelscope/models/nlp/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -119,15 +119,12 @@ class UnifiedTransformer(SpaceModelBase): input_mask, append_head=False, auto_regressive=False): - """ - Create attention mask. + """Create attention mask. from sequence to matrix:[batch_size, max_seq_len, 1] -> [batch_size, max_seq_len, max_seq_len] - @param : input_mask - @type : Variable(shape: [batch_size, max_seq_len]) - - @param : auto_regressive - @type : bool + Args: + input_mask (Variable(shape: [batch_size, max_seq_len])) + auto_regressive(bool) """ seq_len = input_mask.shape[1] @@ -150,15 +147,12 @@ class UnifiedTransformer(SpaceModelBase): return mask def _join_mask(self, mask1, mask2): - """ - Merge source attention mask and target attention mask. + """Merge source attention mask and target attention mask. There are four parts:left upper (lu) / right upper (ru) / left below (lb) / right below (rb) - @param : mask1 : source attention mask - @type : Variable(shape: [batch_size, max_src_len, max_src_len]) - - @param : mask1 : target attention mask - @type : Variable(shape: [batch_size, max_tgt_len, max_tgt_len]) + Args: + mask1(Variable(shape: [batch_size, max_src_len, max_src_len])) : source attention mask + mask2(Variable(shape: [batch_size, max_tgt_len, max_tgt_len])) : target attention mask """ batch_size = mask1.shape[0] seq_len1 = mask1.shape[1] diff --git a/modelscope/models/nlp/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py index 37f968d9..3044963a 100644 --- a/modelscope/models/nlp/space/modules/transformer_block.py +++ b/modelscope/models/nlp/space/modules/transformer_block.py @@ -30,18 +30,13 @@ class TransformerBlock(nn.Module): return def forward(self, inp, mask=None, cache=None): - """ - Forward process on one transformer layer. - - @param : x - @type : Variable(shape: [batch_size, seq_len, hidden_size]) - - @param : memory - @type : Variable(shape: [batch_size, seq_len, hidden_size]) - - @param : mask + """Forward process on one transformer layer. 
- @param : cache + Args: + x(Variable(shape: [batch_size, seq_len, hidden_size])) + memory(Variable(shape: [batch_size, seq_len, hidden_size])) + mask + cache """ attn_out = self.attn(inp, mask, cache) attn_out = self.dropout_layer(attn_out) diff --git a/modelscope/models/nlp/space/space_for_dialog_state_tracking.py b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py deleted file mode 100644 index 4b9cf5c3..00000000 --- a/modelscope/models/nlp/space/space_for_dialog_state_tracking.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import Dict - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SpaceForDialogStateTracking'] - - -@MODELS.register_module( - Tasks.task_oriented_conversation, module_name=Models.space_dst) -class SpaceForDialogStateTracking(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the test generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - """ - - super().__init__(model_dir, *args, **kwargs) - - from modelscope.models.nlp.space.model import SpaceForDST, SpaceConfig - self.model_dir = model_dir - - self.config = SpaceConfig.from_pretrained(self.model_dir) - self.model = SpaceForDST.from_pretrained(self.model_dir) - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - """return the result by the model - - Args: - input (Dict[str, Tensor]): the preprocessed data - - Returns: - Dict[str, Tensor]: results - Example: - { - 'inputs': dict(input_ids, input_masks,start_pos), # tracking states - 'outputs': dict(slots_logits), - 'unique_ids': str(test-example.json-0), # default value - 'input_ids_unmasked': array([101, 7632, 1010,0,0,0]) - 'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), - 'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), - 'prefix': str('final'), #default value - 'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]) - } - """ - import numpy as np - import torch - - self.model.eval() - batch = input['batch'] - - features = input['features'] - diag_state = input['diag_state'] - turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]] - reset_diag_state = np.where(np.array(turn_itrs) == '0')[0] - for slot in self.config.dst_slot_list: - for i in reset_diag_state: - diag_state[slot][i] = 0 - - with torch.no_grad(): - inputs = { - 'input_ids': batch[0], - 'input_mask': batch[1], - 'segment_ids': batch[2], - 'start_pos': batch[3], - 'end_pos': batch[4], - 'inform_slot_id': batch[5], - 'refer_id': batch[6], - 'diag_state': diag_state, - 'class_label_id': batch[8] - } - unique_ids = [features[i.item()].guid for i in batch[9]] - values = [features[i.item()].values for i in batch[9]] - input_ids_unmasked = [ - features[i.item()].input_ids_unmasked for i in batch[9] - ] - inform = [features[i.item()].inform for i in batch[9]] - outputs = self.model(**inputs) - - # Update dialog state for next turn. 
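The Google-style docstrings added to ``unified_transformer.py`` above describe turning a ``[batch_size, max_seq_len]`` padding mask into a ``[batch_size, max_seq_len, max_seq_len]`` attention matrix. A standalone sketch of that shape transformation, illustrative only and not the library's actual ``_create_mask``:

    import torch

    def expand_mask(input_mask, auto_regressive=False):
        # [batch_size, max_seq_len] -> [batch_size, max_seq_len, max_seq_len]
        mask = input_mask.unsqueeze(2) * input_mask.unsqueeze(1)
        if auto_regressive:
            seq_len = input_mask.shape[1]
            causal = torch.tril(torch.ones(seq_len, seq_len, dtype=mask.dtype))
            mask = mask * causal.unsqueeze(0)  # additionally hide future positions
        return mask

    print(expand_mask(torch.tensor([[1, 1, 1, 0]]), auto_regressive=True).shape)
    # torch.Size([1, 4, 4])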
- for slot in self.config.dst_slot_list: - updates = outputs[2][slot].max(1)[1] - for i, u in enumerate(updates): - if u != 0: - diag_state[slot][i] = u - - return { - 'inputs': inputs, - 'outputs': outputs, - 'unique_ids': unique_ids, - 'input_ids_unmasked': input_ids_unmasked, - 'values': values, - 'inform': inform, - 'prefix': 'final', - 'ds': input['ds'] - } diff --git a/modelscope/models/nlp/space_T_cn/__init__.py b/modelscope/models/nlp/space_T_cn/__init__.py index e69de29b..b9deb700 100644 --- a/modelscope/models/nlp/space_T_cn/__init__.py +++ b/modelscope/models/nlp/space_T_cn/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .table_question_answering import TableQuestionAnswering +else: + _import_structure = { + 'table_question_answering': ['TableQuestionAnswering'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py b/modelscope/models/nlp/space_T_cn/backbone.py similarity index 99% rename from modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py rename to modelscope/models/nlp/space_T_cn/backbone.py index 72c94724..5afde06e 100644 --- a/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py +++ b/modelscope/models/nlp/space_T_cn/backbone.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,8 +27,7 @@ import numpy as np import torch from torch import nn -from modelscope.models.nlp.space_T_cn.configuration_space_T_cn import \ - SpaceTCnConfig +from modelscope.models.nlp.space_T_cn.configuration import SpaceTCnConfig from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger diff --git a/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py b/modelscope/models/nlp/space_T_cn/configuration.py similarity index 100% rename from modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py rename to modelscope/models/nlp/space_T_cn/configuration.py index 553d8592..e698b310 100644 --- a/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py +++ b/modelscope/models/nlp/space_T_cn/configuration.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/space_T_cn/table_question_answering.py similarity index 94% rename from modelscope/models/nlp/table_question_answering.py rename to modelscope/models/nlp/space_T_cn/table_question_answering.py index 8e05dd0f..a3f504b7 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/space_T_cn/table_question_answering.py @@ -11,11 +11,11 @@ from transformers import BertTokenizer from modelscope.metainfo import Models from modelscope.models.base import Model, Tensor from modelscope.models.builder import MODELS -from modelscope.preprocessors.space_T_cn.fields.struct import Constant +from modelscope.preprocessors.nlp.space_T_cn.fields.struct import Constant from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.device import verify_device -from .space_T_cn.configuration_space_T_cn import SpaceTCnConfig -from .space_T_cn.modeling_space_T_cn import Seq2SQL, SpaceTCnModel +from .backbone import Seq2SQL, SpaceTCnModel +from .configuration import SpaceTCnConfig __all__ = ['TableQuestionAnswering'] @@ -732,9 +732,41 @@ class TableQuestionAnswering(Model): Args: input (Dict[str, Tensor]): the preprocessed data + Returns: Dict[str, Tensor]: results Example: + { + 'result': + { + 'question_tok': ['有', '哪', '些', '风', '险', '类', '型', '?'], + 'question': '有哪些风险类型?', + 'table_id': 'fund', + 'sql': { + 'cond_conn_op': 0, + 'sel': [5], + 'agg': [0], + 'conds': [[10, 2, 'Nulll']] + }, + 'action': 10, + 'model_out': [ + [6, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0] + ] + }, + 'history_sql': None + } + + Example: + >>> from modelscope.models.nlp import TableQuestionAnswering + >>> from modelscope.preprocessors import TableQuestionAnsweringPreprocessor + >>> model = TableQuestionAnswering.from_pretrained('damo/nlp_convai_text2sql_pretrain_cn') + >>> preprocessor = TableQuestionAnsweringPreprocessor(model_dir=model.model_dir) + >>> print(model(preprocessor({'question': '有哪些风险类型?'}))) """ result = self.predict(input['datas'])[0] diff --git a/modelscope/models/nlp/space_T_en/__init__.py b/modelscope/models/nlp/space_T_en/__init__.py new file mode 100644 index 00000000..46c8b38c --- /dev/null +++ b/modelscope/models/nlp/space_T_en/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .text_to_sql import StarForTextToSql +else: + _import_structure = { + 'text_to_sql': ['StarForTextToSql'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/star_text_to_sql.py b/modelscope/models/nlp/space_T_en/text_to_sql.py similarity index 59% rename from modelscope/models/nlp/star_text_to_sql.py rename to modelscope/models/nlp/space_T_en/text_to_sql.py index 089f1c89..ca2d2596 100644 --- a/modelscope/models/nlp/star_text_to_sql.py +++ b/modelscope/models/nlp/space_T_en/text_to_sql.py @@ -4,14 +4,13 @@ import os from typing import Dict, Optional import torch -import torch.nn as nn from text2sql_lgesql.asdl.asdl import ASDLGrammar from text2sql_lgesql.asdl.transition_system import TransitionSystem from text2sql_lgesql.model.model_constructor import Text2SQL -from text2sql_lgesql.utils.constants import GRAMMAR_FILEPATH from modelscope.metainfo import Models -from modelscope.models.base import Model, Tensor +from modelscope.models import TorchModel +from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -21,7 +20,7 @@ __all__ = ['StarForTextToSql'] @MODELS.register_module( Tasks.table_question_answering, module_name=Models.space_T_en) -class StarForTextToSql(Model): +class StarForTextToSql(TorchModel): def __init__(self, model_dir: str, *args, **kwargs): """initialize the star model from the `model_dir` path. @@ -59,6 +58,33 @@ class StarForTextToSql(Model): Returns: Dict[str, Tensor]: results Example: + + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import StarForTextToSql + >>> from modelscope.preprocessors import ConversationalTextToSqlPreprocessor + >>> test_case = { + 'database_id': 'employee_hire_evaluation', + 'local_db_path': None, + 'utterance': [ + "I'd like to see Shop names.", 'Which of these are hiring?', + 'Which shop is hiring the highest number of employees?' + ' | do you want the name of the shop ? 
| Yes' + ] + } + >>> cache_path = snapshot_download('damo/nlp_star_conversational-text-to-sql') + >>> preprocessor = ConversationalTextToSqlPreprocessor( + model_dir=cache_path, + database_id=test_case['database_id'], + db_content=True) + >>> model = StarForTextToSql(cache_path, config=preprocessor.config) + >>> print(model(preprocessor({ + 'utterance': "I'd like to see Shop names.", + 'history': [], + 'last_sql': '', + 'database_id': 'employee_hire_evaluation', + 'local_db_path': None + }))) """ self.model.eval() hyps = self.model.parse(input['batch'], self.beam_size) # diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py index d42db83c..60d369e0 100644 --- a/modelscope/models/nlp/structbert/__init__.py +++ b/modelscope/models/nlp/structbert/__init__.py @@ -18,20 +18,26 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_sbert import SbertConfig - from .modeling_sbert import (SbertForMaskedLM, SbertModel, - SbertPreTrainedModel) - from .tokenization_sbert import (BasicTokenizer, SbertTokenizer, - WordpieceTokenizer) - from .tokenization_sbert_fast import SbertTokenizerFast + from .backbone import (SbertModel, SbertPreTrainedModel) + from .configuration import SbertConfig + from .faq_question_answering import SbertForFaqQuestionAnswering + from .fill_mask import SbertForMaskedLM + from .text_classification import SbertForSequenceClassification + from .token_classification import SbertForTokenClassification + from .tokenization import (BasicTokenizer, SbertTokenizer, + WordpieceTokenizer) + from .tokenization_fast import SbertTokenizerFast else: _import_structure = { - 'configuration_sbert': ['SbertConfig'], - 'modeling_sbert': - ['SbertForMaskedLM', 'SbertModel', 'SbertPreTrainedModel'], - 'tokenization_sbert': + 'backbone': ['SbertModel', 'SbertPreTrainedModel'], + 'configuration': ['SbertConfig'], + 'fill_mask': ['SbertForMaskedLM'], + 'faq_question_answering': ['SbertForFaqQuestionAnswering'], + 'text_classification': ['SbertForSequenceClassification'], + 'token_classification': ['SbertForTokenClassification'], + 'tokenization': ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], - 'tokenization_sbert_fast': ['SbertTokenizerFast'], + 'tokenization_fast': ['SbertTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py new file mode 100755 index 00000000..039db3ce --- /dev/null +++ b/modelscope/models/nlp/structbert/backbone.py @@ -0,0 +1,932 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch StructBERT model. 
mainly copied from :module:`~transformers.modeling_bert`""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from packaging import version +from transformers.activations import ACT2FN +from transformers.modeling_outputs import \ + BaseModelOutputWithPastAndCrossAttentions +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.logger import get_logger +from .configuration import SbertConfig + +logger = get_logger(__name__) + + +class SbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + return_inputs_embeds=False): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users + # when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type 
== 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + if not return_inputs_embeds: + return embeddings + else: + return embeddings, inputs_embeds + + +class SbertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class SbertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = SbertSelfAttention(config) + self.output = SbertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class SbertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class SbertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = SbertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = SbertAttention(config) + self.intermediate = SbertIntermediate(config) + self.output = SbertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' + f'layers by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class SbertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [SbertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class SbertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SbertPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SbertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, SbertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = SbertConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model + + +@dataclass +class AttentionBackboneModelOutputWithEmbedding(AttentionBackboneModelOutput): + embedding_output: torch.FloatTensor = None + logits: Optional[Union[tuple, torch.FloatTensor]] = None + kwargs: dict = None + + +@MODELS.register_module(Tasks.backbone, module_name=Models.structbert) +class SbertModel(SbertPreTrainedModel): + """The StructBERT Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. 
+ + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as a decoder, the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to be initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config: SbertConfig, add_pooling_layer=True, **kwargs): + super().__init__(config) + self.config = config + + self.embeddings = SbertEmbeddings(config) + self.encoder = SbertEncoder(config) + + self.pooler = SbertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]`.
+ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple + having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_backbone_base_std', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_backbone_base_std') + >>> print(model(**preprocessor('这是个测试'))) + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output, orignal_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + return_inputs_embeds=True, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, + pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + + return AttentionBackboneModelOutputWithEmbedding( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + embedding_output=orignal_embeds) diff --git a/modelscope/models/nlp/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration.py similarity index 94% rename from modelscope/models/nlp/structbert/configuration_sbert.py rename to modelscope/models/nlp/structbert/configuration.py index a727a978..8f095f9d 100644 --- a/modelscope/models/nlp/structbert/configuration_sbert.py +++ b/modelscope/models/nlp/structbert/configuration.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SBERT model configuration, mainly copied from :class:`~transformers.BertConfig` """ +""" StructBERT model configuration, mainly copied from :class:`~transformers.BertConfig` """ from transformers import PretrainedConfig from modelscope.utils import logger as logging @@ -26,7 +26,7 @@ class SbertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~modelscope.models.nlp.structbert.SbertModel`. - It is used to instantiate a SBERT model according to the specified arguments. 
+ It is used to instantiate a StructBERT model according to the specified arguments. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. @@ -74,15 +74,15 @@ class SbertConfig(PretrainedConfig): relevant if ``config.is_decoder=True``. classifier_dropout (:obj:`float`, `optional`): The dropout ratio for the classification head. - adv_grad_factor (:obj:`float`, `optional`): This factor will be multipled by the KL loss grad and then + adv_grad_factor (:obj:`float`, `optional`): This factor will be multiplied by the KL loss grad and then the result will be added to the original embedding. More details please check:https://arxiv.org/abs/1908.04577 - The range of this value always be 1e-3~1e-7 + The range of this value should be between 1e-3 and 1e-7 adv_bound (:obj:`float`, `optional`): adv_bound is used to cut the top and the bottom bound of the produced embedding. - If not proveded, 2 * sigma will be used as the adv_bound factor + If not provided, 2 * sigma will be used as the adv_bound factor sigma (:obj:`float`, `optional`): The std factor used to produce a 0 mean normal distribution. - If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor + If adv_bound is not provided, 2 * sigma will be used as the adv_bound factor """ model_type = 'structbert' diff --git a/modelscope/models/nlp/sbert_for_faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py similarity index 74% rename from modelscope/models/nlp/sbert_for_faq_question_answering.py rename to modelscope/models/nlp/structbert/faq_question_answering.py index 23ccdcc5..c8dbf302 100644 --- a/modelscope/models/nlp/sbert_for_faq_question_answering.py +++ b/modelscope/models/nlp/structbert/faq_question_answering.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates.
+ import math import os from collections import namedtuple @@ -15,103 +17,6 @@ from modelscope.models.nlp.task_models.task_model import BaseTaskModel from modelscope.utils.config import Config, ConfigFields from modelscope.utils.constant import ModelFile, Tasks -__all__ = ['SbertForFaqQuestionAnswering'] - - -class SbertForFaqQuestionAnsweringBase(BaseTaskModel): - """base class for faq models - """ - - def __init__(self, model_dir, *args, **kwargs): - super(SbertForFaqQuestionAnsweringBase, - self).__init__(model_dir, *args, **kwargs) - - backbone_cfg = SbertConfig.from_pretrained(model_dir) - self.bert = SbertModel(backbone_cfg) - - model_config = Config.from_file( - os.path.join(model_dir, - ModelFile.CONFIGURATION)).get(ConfigFields.model, {}) - - metric = model_config.get('metric', 'cosine') - pooling_method = model_config.get('pooling', 'avg') - - Arg = namedtuple('args', [ - 'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling' - ]) - args = Arg( - metrics=metric, - proj_hidden_size=self.bert.config.hidden_size, - hidden_size=self.bert.config.hidden_size, - dropout=0.0, - pooling=pooling_method) - - self.metrics_layer = MetricsLayer(args) - self.pooling = PoolingLayer(args) - - def _get_onehot_labels(self, labels, support_size, num_cls): - labels_ = labels.view(support_size, 1) - target_oh = torch.zeros(support_size, num_cls).to(labels) - target_oh.scatter_(dim=1, index=labels_, value=1) - return target_oh.view(support_size, num_cls).float() - - def forward_sentence_embedding(self, inputs: Dict[str, Tensor]): - input_ids = inputs['input_ids'] - input_mask = inputs['attention_mask'] - if not isinstance(input_ids, Tensor): - input_ids = torch.IntTensor(input_ids) - if not isinstance(input_mask, Tensor): - input_mask = torch.IntTensor(input_mask) - rst = self.bert(input_ids, input_mask) - last_hidden_states = rst.last_hidden_state - if len(input_mask.shape) == 2: - input_mask = input_mask.unsqueeze(-1) - pooled_representation = self.pooling(last_hidden_states, input_mask) - return pooled_representation - - -@MODELS.register_module( - Tasks.faq_question_answering, module_name=Models.structbert) -class SbertForFaqQuestionAnswering(SbertForFaqQuestionAnsweringBase): - _backbone_prefix = '' - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - assert not self.training - query = input['query'] - support = input['support'] - if isinstance(query, list): - query = torch.stack(query) - if isinstance(support, list): - support = torch.stack(support) - n_query = query.shape[0] - n_support = support.shape[0] - query_mask = torch.ne(query, 0).view([n_query, -1]) - support_mask = torch.ne(support, 0).view([n_support, -1]) - - support_labels = input['support_labels'] - num_cls = torch.max(support_labels) + 1 - onehot_labels = self._get_onehot_labels(support_labels, n_support, - num_cls) - - input_ids = torch.cat([query, support]) - input_mask = torch.cat([query_mask, support_mask], dim=0) - pooled_representation = self.forward_sentence_embedding({ - 'input_ids': - input_ids, - 'attention_mask': - input_mask - }) - z_query = pooled_representation[:n_query] - z_support = pooled_representation[n_query:] - cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 - protos = torch.matmul(onehot_labels.transpose(0, 1), - z_support) / cls_n_support.unsqueeze(-1) - scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) - if self.metrics_layer.name == 'relation': - scores = torch.sigmoid(scores) - return {'scores': scores} - - activations = { 'relu': F.relu, 'tanh': 
torch.tanh, @@ -247,3 +152,142 @@ class PoolingLayer(nn.Module): def forward(self, x, mask): return self.pooling(x, mask) + + +@MODELS.register_module( + Tasks.faq_question_answering, module_name=Models.structbert) +class SbertForFaqQuestionAnswering(BaseTaskModel): + _backbone_prefix = '' + + @classmethod + def _instantiate(cls, **kwargs): + model = cls(kwargs.get('model_dir')) + model.load_checkpoint(kwargs.get('model_dir')) + return model + + def __init__(self, model_dir, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + + backbone_cfg = SbertConfig.from_pretrained(model_dir) + self.bert = SbertModel(backbone_cfg) + + model_config = Config.from_file( + os.path.join(model_dir, + ModelFile.CONFIGURATION)).get(ConfigFields.model, {}) + + metric = model_config.get('metric', 'cosine') + pooling_method = model_config.get('pooling', 'avg') + + Arg = namedtuple('args', [ + 'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling' + ]) + args = Arg( + metrics=metric, + proj_hidden_size=self.bert.config.hidden_size, + hidden_size=self.bert.config.hidden_size, + dropout=0.0, + pooling=pooling_method) + + self.metrics_layer = MetricsLayer(args) + self.pooling = PoolingLayer(args) + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Args: + input (Dict[str, Tensor]): the preprocessed data. It contains the following keys: + query(:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The query to be predicted. + support(:obj:`torch.LongTensor` of shape :obj:`(support_size, sequence_length)`): + The support set. + support_labels(:obj:`torch.LongTensor` of shape :obj:`(support_size, )`): + The labels of the support set. + + Returns: + Dict[str, Tensor]: the result dict, which contains the following key: + scores(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_cls)`): + Predicted scores of all classes for each query.
+ Examples: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + >>> from modelscope.models.nlp import SbertForFaqQuestionAnswering + >>> cache_path = snapshot_download('damo/nlp_structbert_faq-question-answering_chinese-base') + >>> preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained(cache_path) + >>> model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) + >>> param = { + >>> 'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'], + >>> 'support_set': [{ + >>> 'text': '卖品代金券怎么用', + >>> 'label': '6527856' + >>> }, { + >>> 'text': '怎么使用优惠券', + >>> 'label': '6527856' + >>> }, { + >>> 'text': '这个可以一起领吗', + >>> 'label': '1000012000' + >>> }, { + >>> 'text': '付款时送的优惠券哪里领', + >>> 'label': '1000012000' + >>> }, { + >>> 'text': '购物等级怎么长', + >>> 'label': '13421097' + >>> }, { + >>> 'text': '购物等级二心', + >>> 'label': '13421097' + >>> }] + >>> } + >>> result = model(preprocessor(param)) + """ + assert not self.training + query = input['query'] + support = input['support'] + if isinstance(query, list): + query = torch.stack(query) + if isinstance(support, list): + support = torch.stack(support) + n_query = query.shape[0] + n_support = support.shape[0] + query_mask = torch.ne(query, 0).view([n_query, -1]) + support_mask = torch.ne(support, 0).view([n_support, -1]) + + support_labels = input['support_labels'] + num_cls = torch.max(support_labels) + 1 + onehot_labels = self._get_onehot_labels(support_labels, n_support, + num_cls) + + input_ids = torch.cat([query, support]) + input_mask = torch.cat([query_mask, support_mask], dim=0) + pooled_representation = self.forward_sentence_embedding({ + 'input_ids': + input_ids, + 'attention_mask': + input_mask + }) + z_query = pooled_representation[:n_query] + z_support = pooled_representation[n_query:] + cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 + protos = torch.matmul(onehot_labels.transpose(0, 1), + z_support) / cls_n_support.unsqueeze(-1) + scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) + if self.metrics_layer.name == 'relation': + scores = torch.sigmoid(scores) + return {'scores': scores} + + def _get_onehot_labels(self, labels, support_size, num_cls): + labels_ = labels.view(support_size, 1) + target_oh = torch.zeros(support_size, num_cls).to(labels) + target_oh.scatter_(dim=1, index=labels_, value=1) + return target_oh.view(support_size, num_cls).float() + + def forward_sentence_embedding(self, inputs: Dict[str, Tensor]): + input_ids = inputs['input_ids'] + input_mask = inputs['attention_mask'] + if not isinstance(input_ids, Tensor): + input_ids = torch.IntTensor(input_ids) + if not isinstance(input_mask, Tensor): + input_mask = torch.IntTensor(input_mask) + rst = self.bert(input_ids, input_mask) + last_hidden_states = rst.last_hidden_state + if len(input_mask.shape) == 2: + input_mask = input_mask.unsqueeze(-1) + pooled_representation = self.pooling(last_hidden_states, input_mask) + return pooled_representation diff --git a/modelscope/models/nlp/structbert/fill_mask.py b/modelscope/models/nlp/structbert/fill_mask.py new file mode 100644 index 00000000..e611aa88 --- /dev/null +++ b/modelscope/models/nlp/structbert/fill_mask.py @@ -0,0 +1,284 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +class SbertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class SbertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = SbertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class SbertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class SbertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) +class SbertForMaskedLM(SbertPreTrainedModel): + r"""StructBERT Model with a `language modeling` head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. 
+ + Preprocessor: + This is the fill_mask model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = SbertModel(config) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. 
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor, NLPPreprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_fill-mask_chinese-large') + >>> preprocessor = NLPPreprocessor('damo/nlp_structbert_fill-mask_chinese-large') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + attention_mask_zero = attention_mask.new_zeros( + (attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, attention_mask_zero], + dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': 
attention_mask} diff --git a/modelscope/models/nlp/structbert/modeling_sbert.py b/modelscope/models/nlp/structbert/modeling_sbert.py deleted file mode 100755 index e789037a..00000000 --- a/modelscope/models/nlp/structbert/modeling_sbert.py +++ /dev/null @@ -1,1963 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch SBERT model. mainly copied from :module:`~transformers.modeling_bert`""" - -import math -import warnings -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import numpy as np -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - MultipleChoiceModelOutput, NextSentencePredictorOutput, - QuestionAnsweringModelOutput, SequenceClassifierOutput, - TokenClassifierOutput) -from transformers.modeling_utils import (PreTrainedModel, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger -from .adv_utils import compute_adv_loss, compute_adv_loss_pair -from .configuration_sbert import SbertConfig - -logger = get_logger(__name__) - -_CHECKPOINT_FOR_DOC = 'nlp_structbert_backbone_base_std' -_CONFIG_FOR_DOC = 'SbertConfig' -_TOKENIZER_FOR_DOC = 'SbertTokenizer' - - -class SbertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - 
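-        # position_embedding_type is one of 'absolute' (the default), 'relative_key' or 'relative_key_query';
-        # the relative variants are handled in SbertSelfAttention.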
self.register_buffer( - 'position_ids', - torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros( - self.position_ids.size(), - dtype=torch.long, - device=self.position_ids.device), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, - return_inputs_embeds=False): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users - # when tracing the model without passing token_type_ids, solves - # issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - if not return_inputs_embeds: - return embeddings - else: - return embeddings, inputs_embeds - - -class SbertSelfAttention(nn.Module): - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - 
output_attentions=False, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. 
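-        # query/key/value have been reshaped to (batch_size, num_heads, seq_length, head_size),
-        # so the attention scores computed below have shape (batch_size, num_heads, seq_length, seq_length).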
- attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class SbertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertAttention(nn.Module): - - def __init__(self, config): - super().__init__() - self.self = SbertSelfAttention(config) - self.output = SbertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class SbertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class SbertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - 
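-        # residual connection around the feed-forward block, followed by layer normalization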
hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = SbertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = SbertAttention(config) - self.intermediate = SbertIntermediate(config) - self.output = SbertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' - f'layers by setting `config.add_cross_attention=True`') - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class SbertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [SbertLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - 
past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class SbertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
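-        # For BERT-style inputs this first token is [CLS]; a dense layer plus tanh activation
-        # produces the pooled output.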
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class SbertPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class SbertLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = SbertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, config.vocab_size) - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class SbertOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = SbertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class SbertOnlyNSPHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class SbertPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = SbertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class SbertPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SbertConfig - base_model_prefix = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, SbertEncoder): - module.gradient_checkpointing = value - - -@dataclass -class SbertForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~transformers.BertForPreTraining`. 
- - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` - is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` - is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -SBERT_START_DOCSTRING = r""" - - This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch `torch.nn.Module `__ - subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - - Parameters: - config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with - all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model - weights. -""" - -SBERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, - 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? <../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - config.max_position_embeddings - 1]``. - - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. -""" - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - BaseModelOutputWithPoolingAndCrossAttentions): - embedding_output: torch.FloatTensor = None - logits: Optional[Union[tuple, torch.FloatTensor]] = None - kwargs: dict = None - - -@add_start_docstrings( - 'The Sbert Model transformer outputting raw hidden-states without any specific head on top.', - SBERT_START_DOCSTRING, -) -class SbertModel(SbertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration - set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an - input to the forward pass. - """ - - def __init__(self, config: SbertConfig, add_pooling_layer=True): - super().__init__(config) - self.config = config - - self.embeddings = SbertEmbeddings(config) - self.encoder = SbertEncoder(config) - - self.pooler = SbertPooler(config) if add_pooling_layer else None - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, - `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple - having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output, orignal_embeds = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - return_inputs_embeds=True, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) - - return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) - - -@add_start_docstrings( - """ - Sbert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next - sentence prediction (classification)` head. 
- """, - SBERT_START_DOCSTRING, -) -class SbertForPreTraining(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - - self.bert = SbertModel(config) - self.cls = SbertPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=SbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): - Used to hide legacy arguments that have been deprecated. - - Returns: - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, - seq_relationship_score) + outputs[2:-1] - return ((total_loss, ) - + output) if total_loss is not None else output - - return SbertForPreTrainingOutput( - loss=total_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """Sbert Model with a `language modeling` head on top for CLM fine-tuning. 
""", - SBERT_START_DOCSTRING) -class SbertLMHeadModel(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config: SbertConfig): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `SbertLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.bert = SbertModel(config, add_pooling_layer=False) - self.cls = SbertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, - `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are - ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of - shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- - Returns: - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - if not return_dict: - output = (prediction_scores, ) + outputs[2:-1] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """Sbert Model with a `language modeling` head on top. 
""", - SBERT_START_DOCSTRING) -class SbertForMaskedLM(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config: SbertConfig): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.bert = SbertModel(config) - self.cls = SbertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:-1] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' - attention_mask_zero = attention_mask.new_zeros( - (attention_mask.shape[0], 1)) - attention_mask = torch.cat([attention_mask, attention_mask_zero], - dim=-1) - dummy_token = torch.full((effective_batch_size, 1), - self.config.pad_token_id, - dtype=torch.long, - device=input_ids.device) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -@add_start_docstrings( - """Sbert Model 
with a `next sentence prediction (classification)` head on top. """, - SBERT_START_DOCSTRING, -) -class SbertForNextSentencePrediction(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - - self.bert = SbertModel(config) - self.cls = SbertOnlyNSPHead(config) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - - Returns: - - """ - - if 'next_sentence_label' in kwargs: - warnings.warn( - 'The `next_sentence_label` argument is deprecated and will be removed ' - 'in a future version, use `labels` instead.', - FutureWarning, - ) - labels = kwargs.pop('next_sentence_label') - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - seq_relationship_scores = self.cls(pooled_output) - - next_sentence_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct( - seq_relationship_scores.view(-1, 2), labels.view(-1)) - - if not return_dict: - output = (seq_relationship_scores, ) + outputs[2:-1] - return ((next_sentence_loss, ) - + output) if next_sentence_loss is not None else output - - return NextSentencePredictorOutput( - loss=next_sentence_loss, - logits=seq_relationship_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled - output) e.g. for GLUE tasks. 
- """, - SBERT_START_DOCSTRING, -) -class SbertForSequenceClassification(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - outputs['logits'] = logits - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - return self.compute_loss(outputs, labels, **outputs.kwargs) - - def compute_loss(self, outputs, labels, **kwargs): - logits = outputs.logits - embedding_output = outputs.embedding_output - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. 
- """, - SBERT_START_DOCSTRING, -) -class SbertForMultipleChoice(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, 1) - - self.init_weights() - - def _forward_call(self, num_choices, **kwargs): - outputs = self.bert(**kwargs) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - outputs['logits'] = logits.view(-1, num_choices) - kwargs['num_choices'] = num_choices - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format( - 'batch_size, num_choices, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., - num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See - :obj:`input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - num_choices=num_choices) - - reshaped_logits = outputs.logits - kwargs = outputs.kwargs - embedding_output = outputs.embedding_output - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=reshaped_logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - - return MultipleChoiceModelOutput( - loss=loss, - 
logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. - """, - SBERT_START_DOCSTRING, -) -class SbertForTokenClassification(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config, add_pooling_layer=False) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - outputs['logits'] = logits - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - - 1]``. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - - logits = outputs.logits - embedding_output = outputs.embedding_output - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - with_attention_mask=attention_mask is not None, - **outputs.kwargs) - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - SBERT_START_DOCSTRING, -) -class SbertForQuestionAnswering(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - outputs['logits'] = (start_logits, end_logits) - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. 
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - return self.compute_loss(outputs, start_positions, end_positions, - **outputs.kwargs) - - def compute_loss(self, - outputs, - start_positions=None, - end_positions=None, - **kwargs): - start_logits, end_logits = outputs.logits - embedding_output = outputs.embedding_output - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - if self.config.adv_grad_factor is not None and self.training: - total_loss = compute_adv_loss_pair( - embedding=embedding_output, - model=self._forward_call, - start_logits=start_logits, - end_logits=end_logits, - ori_loss=total_loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py new file mode 100644 index 00000000..044cf8d0 --- /dev/null +++ b/modelscope/models/nlp/structbert/text_classification.py @@ -0,0 +1,235 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .adv_utils import compute_adv_loss +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +@MODELS.register_module( + Tasks.text_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.nli, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentence_similarity, module_name=Models.structbert) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForSequenceClassification(SbertPreTrainedModel): + r"""StructBERT Model transformer with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, + pruning heads, etc.) + + This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to + general usage and behavior. + + Preprocessor: + This is the text classification model of StructBERT. The preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by various trainers, such as EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in ModelScope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights.
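This class enables its adversarial training branch only when config.adv_grad_factor is set; otherwise __init__ logs a warning and compute_loss never calls compute_adv_loss. A minimal sketch, not part of the patch itself, of switching that branch on through the config; the import path and the numeric values are assumptions for illustration:
>>> from modelscope.models.nlp.structbert import SbertConfig, SbertForSequenceClassification
>>> config = SbertConfig(
...     num_labels=2,
...     adv_grad_factor=5e-4,  # not None, so compute_adv_loss runs during training
...     adv_bound=2e-2,        # forwarded unchanged to compute_adv_loss
...     sigma=1e-4)            # forwarded unchanged to compute_adv_loss
>>> model = SbertForSequenceClassification(config)  # randomly initialized from the config only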
+ """ + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + + SbertForSequenceClassification.base_model_prefix = getattr( + config, 'base_model_prefix', + SbertForSequenceClassification.base_model_prefix) + setattr(self, self.base_model_prefix, SbertModel(config)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.base_model(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor(('这是个测试', '这也是个测试')))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins(('这是个测试', '这也是个测试'))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, labels, **outputs.kwargs) + + def compute_loss(self, outputs, labels, **kwargs): + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/structbert/token_classification.py b/modelscope/models/nlp/structbert/token_classification.py new file mode 100644 index 00000000..a040ff3e --- /dev/null +++ b/modelscope/models/nlp/structbert/token_classification.py @@ -0,0 +1,229 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .adv_utils import compute_adv_loss +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +@MODELS.register_module( + Tasks.token_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert) +class SbertForTokenClassification(SbertPreTrainedModel): + r"""StructBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) + e.g. for Named-Entity-Recognition (NER) tasks. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, + pruning heads, etc.) + + This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to + general usage and behavior. + + Preprocessor: + This is the token-classification model of StructBERT. The preprocessor of this model + is `modelscope.preprocessors.TokenClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by various trainers, such as EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in ModelScope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights.
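As a complement to the Examples block further down, which calls this model directly through a preprocessor, the same checkpoint can also be wrapped in a pipeline, mirroring the text-classification example earlier in this patch. A sketch; the hyphenated task name and the sample sentence are assumptions for illustration:
>>> from modelscope.models import Model
>>> from modelscope.preprocessors import Preprocessor
>>> from modelscope.pipelines import pipeline
>>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base')
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base')
>>> pipeline_ins = pipeline('word-segmentation', model=model, preprocessor=preprocessor)
>>> print(pipeline_ins('今天天气不错，适合出去游玩'))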
+ """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + setattr(self, self.base_model_prefix, + SbertModel(config, add_pooling_layer=False)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + with_attention_mask=attention_mask is not None, + **outputs.kwargs) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=offset_mapping, + ) diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py b/modelscope/models/nlp/structbert/tokenization.py similarity index 100% rename from modelscope/models/nlp/structbert/tokenization_sbert.py rename to modelscope/models/nlp/structbert/tokenization.py diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/structbert/tokenization_sbert_fast.py rename to modelscope/models/nlp/structbert/tokenization_fast.py index a0a81121..6f7b7ba7 100644 --- a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py +++ 
b/modelscope/models/nlp/structbert/tokenization_fast.py @@ -24,7 +24,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger -from .tokenization_sbert import SbertTokenizer +from .tokenization import SbertTokenizer logger = get_logger(__name__) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 38359044..e733efe2 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -7,6 +7,9 @@ if TYPE_CHECKING: from .information_extraction import InformationExtractionModel from .feature_extraction import FeatureExtractionModel from .fill_mask import FillMaskModel + from .nncrf_for_named_entity_recognition import ( + TransformerCRFForNamedEntityRecognition, + LSTMCRFForNamedEntityRecognition) from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel @@ -17,6 +20,10 @@ else: 'information_extraction': ['InformationExtractionModel'], 'feature_extraction': ['FeatureExtractionModel'], 'fill_mask': ['FillMaskModel'], + 'nncrf_for_named_entity_recognition': [ + 'TransformerCRFForNamedEntityRecognition', + 'LSTMCRFForNamedEntityRecognition' + ], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py index 069c37aa..9360ec08 100644 --- a/modelscope/models/nlp/task_models/feature_extraction.py +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import numpy as np @@ -31,13 +32,8 @@ class FeatureExtractionModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: - # backbone do not need labels, only head need for loss compute - labels = input.pop(OutputKeys.LABELS, None) - + input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - if labels is not None: - input[OutputKeys.LABELS] = labels - + sequence_output = outputs.last_hidden_state return {OutputKeys.TEXT_EMBEDDING: sequence_output} diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py index f7ef1cc2..0f7d3345 100644 --- a/modelscope/models/nlp/task_models/fill_mask.py +++ b/modelscope/models/nlp/task_models/fill_mask.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict import numpy as np @@ -36,7 +37,7 @@ class FillMaskModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output) if labels is not None: diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index a206c2fc..ce0e21a3 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -33,7 +33,7 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output, input['text'], input['offsets']) return {OutputKeys.SPO_LIST: outputs} diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py similarity index 83% rename from modelscope/models/nlp/nncrf_for_named_entity_recognition.py rename to modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py index 8b0c59b2..017e35e5 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py @@ -12,6 +12,7 @@ from transformers import AutoConfig, AutoModel from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierWithPredictionsOutput from modelscope.utils.constant import ModelFile, Tasks __all__ = [ @@ -39,28 +40,116 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def eval(self): return self.model.eval() - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + ) -> Dict[str, Any]: + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. 
+ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ input_tensor = { - 'input_ids': input['input_ids'], - 'attention_mask': input['attention_mask'], - 'label_mask': input['label_mask'], + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, } output = { - 'text': input['text'], - 'offset_mapping': input['offset_mapping'], + 'offset_mapping': offset_mapping, **input_tensor, **self.model(input_tensor) } return output - def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + def postprocess(self, input: Dict[str, Any], **kwargs): predicts = self.model.decode(input) - output = { - 'text': input['text'], - 'offset_mapping': input['offset_mapping'], - 'predicts': predicts['predicts'].squeeze(0).cpu().numpy(), - } - return output + offset_len = len(input['offset_mapping']) + predictions = torch.narrow( + predicts, 1, 0, + offset_len) # index_select only move loc, not resize + return TokenClassifierWithPredictionsOutput( + loss=None, + logits=None, + hidden_states=None, + attentions=None, + offset_mapping=input['offset_mapping'], + predictions=predictions, + ) @MODELS.register_module( @@ -133,8 +222,7 @@ class TransformerCRF(nn.Module): inputs['label_mask'].shape[1], device=seq_lens.device)[None, :] < seq_lens[:, None] predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) - outputs = {'predicts': predicts} - return outputs + return predicts class LSTMCRF(nn.Module): @@ -183,8 +271,7 @@ class LSTMCRF(nn.Module): inputs['label_mask'].shape[1], device=seq_lens.device)[None, :] < seq_lens[:, None] predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) - outputs = {'predicts': predicts} - return outputs + return predicts class CRF(nn.Module): diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 1f5e46c3..6c0c09a2 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -1,8 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
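The postprocess refactor above trims the padded CRF decode result with torch.narrow, which returns a view over the same storage rather than a resized copy, which is what the in-line comment is getting at. A small illustration with made-up values:
>>> import torch
>>> predicts = torch.tensor([[2, 5, 5, 3, 0, 0]])  # decoded label ids, padded, batch of 1
>>> trimmed = torch.narrow(predicts, 1, 0, 4)      # keep the first 4 positions of dim 1
>>> trimmed
tensor([[2, 5, 5, 3]])
>>> trimmed.data_ptr() == predicts.data_ptr()      # shares storage: a view, not a copy
True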
-import os from typing import Any, Dict -import json import numpy as np from modelscope.metainfo import TaskModels @@ -16,11 +14,6 @@ from modelscope.utils.hub import parse_label_mapping __all__ = ['SequenceClassificationModel'] -@MODELS.register_module( - Tasks.sentence_similarity, module_name=TaskModels.text_classification) -@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=TaskModels.text_classification) @MODELS.register_module( Tasks.text_classification, module_name=TaskModels.text_classification) class SequenceClassificationModel(SingleBackboneTaskModelBase): @@ -54,25 +47,10 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + pooled_output = outputs.pooler_output outputs = self.head.forward(pooled_output) if labels is not None: input[OutputKeys.LABELS] = labels loss = self.compute_loss(outputs, labels) outputs.update(loss) return outputs - - def extract_logits(self, outputs): - return outputs[OutputKeys.LOGITS].cpu().detach() - - def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - probs = logits.softmax(-1).numpy() - pred = logits.argmax(-1).numpy() - logits = logits.numpy() - res = { - OutputKeys.PREDICTIONS: pred, - OutputKeys.PROBABILITIES: probs, - OutputKeys.LOGITS: logits - } - return res diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index 0b43044f..8c83517a 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -404,7 +404,7 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def build_backbone(self, cfg): if 'prefix' in cfg: self._backbone_prefix = cfg['prefix'] - backbone = build_backbone(cfg, field=Fields.nlp) + backbone = build_backbone(cfg) setattr(self, cfg['prefix'], backbone) def build_head(self, cfg): @@ -414,7 +414,7 @@ class SingleBackboneTaskModelBase(BaseTaskModel): ) if 'prefix' in cfg: self._head_prefix = cfg['prefix'] - head = build_head(cfg, group_key=self.group_key) + head = build_head(cfg, task_name=self.group_key) setattr(self, self._head_prefix, head) return head diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index a39f58bf..2739bf11 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -8,7 +8,7 @@ from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys +from modelscope.outputs import OutputKeys, TokenClassifierOutput from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, @@ -53,27 +53,20 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - outputs = self.head.forward(sequence_output) + sequence_output = outputs[0] + logits = self.head.forward(sequence_output) + loss = None if labels in input: loss = self.compute_loss(outputs, labels) - 
outputs.update(loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=input['offset_mapping'], + ) return outputs def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - return sequence_output, pooled_output - - def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - pred = torch.argmax(logits[0], dim=-1) - pred = torch_nested_numpify(torch_nested_detach(pred)) - logits = torch_nested_numpify(torch_nested_detach(logits)) - res = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} - return res diff --git a/modelscope/models/nlp/text_ranking.py b/modelscope/models/nlp/text_ranking.py deleted file mode 100644 index 5bc0635a..00000000 --- a/modelscope/models/nlp/text_ranking.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict - -import numpy as np -import torch - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp import SbertForSequenceClassification -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['TextRanking'] - - -@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) -class TextRanking(SbertForSequenceClassification, SbertPreTrainedModel): - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir, *args, **kwargs): - if hasattr(config, 'base_model_prefix'): - TextRanking.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - self.train_batch_size = kwargs.get('train_batch_size', 4) - self.register_buffer( - 'target_label', - torch.zeros(self.train_batch_size, dtype=torch.long)) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=True) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - outputs = self.base_model.forward(**input) - - # backbone model should return pooled_output as its second output - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if self.base_model.training: - scores = logits.view(self.train_batch_size, -1) - loss_fct = torch.nn.CrossEntropyLoss() - loss = loss_fct(scores, self.target_label) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def sigmoid(self, logits): - return np.exp(logits) / (1 + np.exp(logits)) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - logits = inputs['logits'].squeeze(-1).detach().cpu().numpy() - logits = self.sigmoid(logits).tolist() - result = {OutputKeys.SCORES: logits} - return result - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. 
- Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (1 classes). - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - num_labels = kwargs.get('num_labels', 1) - model_args = {} if num_labels is None else {'num_labels': num_labels} - - return super(SbertPreTrainedModel, TextRanking).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py deleted file mode 100644 index e58967a5..00000000 --- a/modelscope/models/nlp/token_classification.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from abc import abstractmethod -from typing import Dict - -import numpy as np -import torch -from torch import nn - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertPreTrainedModel -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) - -__all__ = ['SbertForTokenClassification'] - - -class TokenClassification(TorchModel): - """A token classification base class for all the fitted token classification models. - """ - - base_model_prefix: str = 'bert' - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.num_labels = config.num_labels - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - @abstractmethod - def build_base_model(self): - """Build the backbone model. - - Returns: the backbone instance. - """ - pass - - @property - def base_model(self): - return getattr(self, self.base_model_prefix) - - def compute_loss(self, logits, labels, **kwargs): - """Compute loss. - - For example, if backbone is pretrained model, there will be a 'attention_mask' parameter to skip - useless tokens. - - Args: - logits: The logits from the classifier - labels: The labels - **kwargs: Other input params. - - Returns: The loss. 
- - """ - pass - - def forward(self, **kwargs): - labels = None - if OutputKeys.LABEL in kwargs: - labels = kwargs.pop(OutputKeys.LABEL) - elif OutputKeys.LABELS in kwargs: - labels = kwargs.pop(OutputKeys.LABELS) - - outputs = self.base_model(**kwargs) - # base model should return the sequence_output as its first output - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - if labels is not None: - loss = self.compute_loss(logits, labels, **kwargs) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def postprocess(self, input: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - logits = input[OutputKeys.LOGITS] - pred = torch.argmax(logits[0], dim=-1) - pred = torch_nested_numpify(torch_nested_detach(pred)) - logits = torch_nested_numpify(torch_nested_detach(logits)) - rst = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} - return rst - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) -@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert) -@MODELS.register_module( - Tasks.token_classification, module_name=Models.structbert) -class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): - """Sbert token classification model. - - Inherited from TokenClassification. - """ - - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - SbertForTokenClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=False) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - labels=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=labels) - - def compute_loss(self, logits, labels, attention_mask=None, **kwargs): - """Compute the loss with an attention mask. - - @param logits: The logits output from the classifier. - @param labels: The labels. - @param attention_mask: The attention_mask. - @param kwargs: Unused input args. - @return: The loss - """ - loss_fct = nn.CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - return loss_fct(active_logits, active_labels) - else: - return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(SbertPreTrainedModel, - SbertForTokenClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) -@MODELS.register_module(Tasks.token_classification, module_name=Models.bert) -class BertForTokenClassification(TokenClassification, BertPreTrainedModel): - """Bert token classification model. - - Inherited from TokenClassificationBase. - """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - BertForTokenClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .bert import BertModel - return BertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(BertPreTrainedModel, - BertForTokenClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py index 0fe786fd..0774e9b4 100644 --- a/modelscope/models/nlp/veco/__init__.py +++ b/modelscope/models/nlp/veco/__init__.py @@ -18,18 +18,22 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_veco import VecoConfig - from .modeling_veco import (VecoForMaskedLM, VecoForSequenceClassification, - VecoModel) - from .tokenization_veco import VecoTokenizer - from .tokenization_veco_fast import VecoTokenizerFast + from .configuration import VecoConfig + from .backbone import VecoModel + from .text_classification import VecoForSequenceClassification + from .token_classification import VecoForTokenClassification + from .fill_mask import VecoForMaskedLM + from .tokenization import VecoTokenizer + from .tokenization_fast import VecoTokenizerFast else: _import_structure = { - 'configuration_veco': ['VecoConfig'], - 'modeling_veco': - ['VecoForMaskedLM', 'VecoForSequenceClassification', 'VecoModel'], - 'tokenization_veco': ['VecoTokenizer'], - 'tokenization_veco_fast': ['VecoTokenizerFast'], + 'configuration': ['VecoConfig'], + 'backbone': ['VecoModel'], + 'text_classification': ['VecoForSequenceClassification'], + 'fill_mask': ['VecoForMaskedLM'], + 'token_classification': ['VecoForTokenClassification'], + 'tokenization': ['VecoTokenizer'], + 'tokenization_fast': ['VecoTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/veco/backbone.py b/modelscope/models/nlp/veco/backbone.py new file mode 100644 index 00000000..98d8c30a --- /dev/null +++ b/modelscope/models/nlp/veco/backbone.py @@ -0,0 +1,96 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Veco model. 
mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +from transformers import RobertaModel + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .configuration import VecoConfig + +logger = logging.get_logger(__name__) + +VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] + + +@MODELS.register_module(Tasks.backbone, module_name=Models.veco) +class VecoModel(TorchModel, RobertaModel): + """The bare Veco Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> print(model(**preprocessor('这是个测试'))) + + """ + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionBackboneModelOutput( + last_hidden_state=outputs.last_hidden_state, + pooler_output=outputs.pooler_output, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = VecoConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model diff --git a/modelscope/models/nlp/veco/configuration_veco.py b/modelscope/models/nlp/veco/configuration.py similarity index 100% rename from modelscope/models/nlp/veco/configuration_veco.py rename to modelscope/models/nlp/veco/configuration.py diff --git a/modelscope/models/nlp/veco/fill_mask.py b/modelscope/models/nlp/veco/fill_mask.py new file mode 100644 index 00000000..de2cdb4a --- /dev/null +++ b/modelscope/models/nlp/veco/fill_mask.py @@ -0,0 +1,99 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. 
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import RobertaForMaskedLM
+
+from modelscope.metainfo import Models
+from modelscope.models import Model, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionFillMaskModelOutput
+from modelscope.utils.constant import Tasks
+from .configuration import VecoConfig
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco)
+class VecoForMaskedLM(TorchModel, RobertaForMaskedLM):
+    """Veco Model transformer with a masked language model head on top (a linear layer on top of the
+    pooled output).
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the fill_mask model of Veco, the preprocessor of this model
+        is `modelscope.preprocessors.NLPPreprocessor`.
+
+    Parameters:
+        config ([`VecoConfig`]): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
+            weights.
+
+    This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the
+    appropriate documentation alongside usage examples.
+ """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可不动我。')) + """ + + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionFillMaskModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=kwargs['input_ids'], + ) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = VecoConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model diff --git a/modelscope/models/nlp/veco/modeling_veco.py b/modelscope/models/nlp/veco/modeling_veco.py deleted file mode 100644 index b519c236..00000000 --- a/modelscope/models/nlp/veco/modeling_veco.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Veco model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" - -from transformers import (RobertaForMaskedLM, RobertaForMultipleChoice, - RobertaForQuestionAnswering, - RobertaForSequenceClassification, - RobertaForTokenClassification, RobertaModel) -from transformers.file_utils import add_start_docstrings - -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.utils import logger as logging -from modelscope.utils.constant import Fields -from .configuration_veco import VecoConfig - -logger = logging.get_logger(__name__) - -VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] - -VECO_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) - subclass. 
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - - Parameters: - config ([`VecoConfig`]): Model configuration class with all the parameters of the - model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model - weights. -""" - - -@add_start_docstrings( - 'The bare Veco Model transformer outputting raw hidden-states without any specific head on top.', - VECO_START_DOCSTRING, -) -class VecoModel(RobertaModel): - """ - This class overrides [`RobertaModel`]. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForSequenceClassification(RobertaForSequenceClassification): - """ - This class overrides [`RobertaForSequenceClassification`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model transformer with a masked language model head on top (a linear layer on top of the - pooled output). - """, - VECO_START_DOCSTRING, -) -class VecoForMaskedLM(RobertaForMaskedLM): - """ - This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a multiple choice classification head on top (a linear layer on top of the pooled output and - a softmax) e.g. for RocStories/SWAG tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForMultipleChoice(RobertaForMultipleChoice): - """ - This class overrides [`RobertaForMultipleChoice`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. - for Named-Entity-Recognition (NER) tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForTokenClassification(RobertaForTokenClassification): - """ - This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a span classification head on top for extractive question-answering tasks like SQuAD (a - linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - VECO_START_DOCSTRING, -) -class VecoForQuestionAnswering(RobertaForQuestionAnswering): - """ - This class overrides [`RobertaForQuestionAnswering`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig diff --git a/modelscope/models/nlp/veco/text_classification.py b/modelscope/models/nlp/veco/text_classification.py new file mode 100644 index 00000000..e4e74d8f --- /dev/null +++ b/modelscope/models/nlp/veco/text_classification.py @@ -0,0 +1,150 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. 
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForSequenceClassification + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.nli, module_name=Models.veco) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.veco) +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) +@MODELS.register_module(Tasks.text_classification, module_name=Models.veco) +class VecoForSequenceClassification(TorchModel, + RobertaForSequenceClassification): + """Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the text classification model of Veco, the preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model should be trained by dataset which has mixed languages, + and evaluated by datasets of languages one by one. + For example, if the training dataset is xnli (which has sub datasets of multiple languages), then you + should mix the sub-datasets with the languages you want to train to one training dataset, and evaluate + the model one sub-dataset by one sub-dataset of different languages. + This procedure can be done by custom code. If you are using trainer of ModelScope, + the `VecoTrainer` is suggested to use to train this model. This trainer overrides the basic evaluation + loop, and will call the evaluation dataset one by one. Besides, this trainer will use the `VecoTaskDataset` + to mix the input datasets to one, you can check the API Doc for the details. + + To check the complete example please + view the unittest `test_veco_xnli` in `tests.trainers.test_finetune_sequence_classification.py` + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForSequenceClassification`]. 
Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large', + >>> task='text-classification', num_labels=2) + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large', + >>> label2id={'0': 0, '1': 1}) + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('这是个测试'))) + >>> # Call the pipeline, the result may be incorrect + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', pipeline_name='text-classification', + >>> model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('这是个测试')) + """ + + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTextClassificationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = VecoConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model diff --git a/modelscope/models/nlp/veco/token_classification.py b/modelscope/models/nlp/veco/token_classification.py new file mode 100644 index 00000000..f6252209 --- /dev/null +++ b/modelscope/models/nlp/veco/token_classification.py @@ -0,0 +1,107 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
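The Trainer note in the docstring above describes the intended loop only in prose: train once on the mixed-language dataset, then evaluate each language split separately. A schematic sketch of that evaluation stage, assuming plain PyTorch dataloaders (`eval_loaders` keyed by language) and a classification model returning the `.logits` field shown in `forward`; the names here are illustrative, not part of the ModelScope API:

import torch


def evaluate_per_language(model, eval_loaders):
    """eval_loaders: dict mapping a language code to a DataLoader over that sub-dataset."""
    results = {}
    model.eval()
    with torch.no_grad():
        for lang, loader in eval_loaders.items():
            correct, total = 0, 0
            for batch in loader:
                labels = batch.pop('labels')
                logits = model(**batch).logits  # AttentionTextClassificationModelOutput.logits
                correct += (logits.argmax(-1) == labels).sum().item()
                total += labels.numel()
            results[lang] = correct / max(total, 1)
    return results  # e.g. {'en': ..., 'zh': ...}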
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForTokenClassification + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTokenClassificationModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.token_classification, module_name=Models.veco) +class VecoForTokenClassification(TorchModel, RobertaForTokenClassification): + """Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTokenClassificationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). 
+ + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = VecoConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model diff --git a/modelscope/models/nlp/veco/tokenization_veco.py b/modelscope/models/nlp/veco/tokenization.py similarity index 100% rename from modelscope/models/nlp/veco/tokenization_veco.py rename to modelscope/models/nlp/veco/tokenization.py diff --git a/modelscope/models/nlp/veco/tokenization_veco_fast.py b/modelscope/models/nlp/veco/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/veco/tokenization_veco_fast.py rename to modelscope/models/nlp/veco/tokenization_fast.py index 3edae0e7..b41a5c3b 100644 --- a/modelscope/models/nlp/veco/tokenization_veco_fast.py +++ b/modelscope/models/nlp/veco/tokenization_fast.py @@ -27,7 +27,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils import logger as logging if is_sentencepiece_available(): - from .tokenization_veco import VecoTokenizer + from .tokenization import VecoTokenizer else: VecoTokenizer = None diff --git a/modelscope/msdatasets/task_datasets/torch_base_dataset.py b/modelscope/msdatasets/task_datasets/torch_base_dataset.py index 014e4faa..4d82b741 100644 --- a/modelscope/msdatasets/task_datasets/torch_base_dataset.py +++ b/modelscope/msdatasets/task_datasets/torch_base_dataset.py @@ -19,6 +19,7 @@ class TorchTaskDataset(TaskDataset, Dataset): preprocessor=None, **kwargs): TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs) + self.trainer = None def __getitem__(self, index) -> Any: return self.prepare_sample(self._inner_dataset[index]) diff --git a/modelscope/outputs/__init__.py b/modelscope/outputs/__init__.py new file mode 100644 index 00000000..47e66714 --- /dev/null +++ b/modelscope/outputs/__init__.py @@ -0,0 +1,2 @@ +from .nlp.model_outputs import * # noqa +from .outputs import TASK_OUTPUTS, ModelOutputBase, OutputKeys diff --git a/modelscope/preprocessors/space_T_cn/fields/__init__.py b/modelscope/outputs/nlp/__init__.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/fields/__init__.py rename to modelscope/outputs/nlp/__init__.py diff --git a/modelscope/outputs/nlp/model_outputs.py b/modelscope/outputs/nlp/model_outputs.py new file mode 100644 index 00000000..dcb37145 --- /dev/null +++ b/modelscope/outputs/nlp/model_outputs.py @@ -0,0 +1,543 @@ +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +from modelscope.outputs.outputs import ModelOutputBase + +Tensor = Union['torch.Tensor', 'tf.Tensor'] + + +@dataclass +class TextClassificationModelOutput(ModelOutputBase): + """The output class for text classification models. 
+ + Args: + logits (`Tensor`): The logits output of the model. loss (`Tensor`, + *optional*) The loss of the model, available when training. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + + +@dataclass +class TokenClassificationModelOutput(ModelOutputBase): + """The output class for token classification models. + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + """ + + logits: Tensor = None + loss: Tensor = None + offset_mapping: Tensor = None + + +@dataclass +class FillMaskModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + input_ids (`Tensor`, *optional*) The input id tensor fed into the model. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + input_ids: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class TokenClassifierOutput(ModelOutputBase): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided) : + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + + """ + + loss: Tensor = None + logits: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + offset_mapping: Tensor = None + + +@dataclass +class TokenClassifierWithPredictionsOutput(ModelOutputBase): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided) : + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.num_labels)`): + Classification scores (before SoftMax). 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + predictions: A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + + """ + + loss: Tensor = None + logits: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + offset_mapping: Tensor = None + predictions: Tensor = None + + +@dataclass +class BaseModelOutput(ModelOutputBase): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + last_hidden_state: Tensor = None + hidden_states: Optional[Tuple[Tensor]] = None + attentions: Optional[Tuple[Tensor]] = None + + +@dataclass +class BackboneModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + last_hidden_state (`Tensor`, *optional*): Sequence of hidden-states at + the output of the last layer of the model. + pooler_output (`Tensor`, *optional*) The tensor of the pooled hidden state. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at + the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Tensor = None + pooler_output: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionBackboneModelOutput(BackboneModelOutput): + """The output class for backbones of attention based models. 
+
+    Args:
+        attentions (`tuple(Tensor)`, *optional*): Attentions weights after the
+            attention softmax, used to compute the weighted average in the
+            self-attention heads.
+    """
+    attentions: Tensor = None
+    past_key_values: Tensor = None
+    cross_attentions: Tensor = None
+
+
+@dataclass
+class AttentionTextClassificationModelOutput(TextClassificationModelOutput):
+    """The output class for text classification models with attention weights.
+
+    Args:
+        attentions (`tuple(Tensor)`, *optional*): Attentions weights after the
+            attention softmax, used to compute the weighted average in the
+            self-attention heads.
+    """
+    attentions: Tensor = None
+    hidden_states: Tensor = None
+
+
+@dataclass
+class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput):
+    """The output class for token classification models with attention weights.
+
+    Args:
+        attentions (`tuple(Tensor)`, *optional*): Attentions weights after the attention softmax,
+            used to compute the weighted average in the self-attention heads.
+    """
+    attentions: Tensor = None
+    hidden_states: Tensor = None
+
+
+@dataclass
+class AttentionFillMaskModelOutput(FillMaskModelOutput):
+    """The output class for fill mask models with attention weights.
+
+    Args:
+        attentions (`tuple(Tensor)`, *optional*): Attentions weights after the
+            attention softmax, used to compute the weighted average in the
+            self-attention heads.
+    """
+    attentions: Tensor = None
+
+
+@dataclass
+class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutputBase):
+    """
+    Base class for model's outputs that also contains a pooling of the last
+    hidden states.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size,
+            sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the
+            model.
+        pooler_output (`torch.FloatTensor` of shape `(batch_size,
+            hidden_size)`):
+            Last layer hidden-state of the first token of the sequence
+            (classification token) after further processing through the layers
+            used for the auxiliary pretraining task. E.g. for BERT-family of
+            models, this returns the classification token after processing
+            through a linear layer and a tanh activation function. The linear
+            layer weights are trained from the next sentence prediction
+            (classification) objective during pretraining.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when
+            `output_hidden_states=True` is passed or when
+            `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings,
+            if the model has an embedding layer, one for the output of each
+            layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the
+            optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when
+            `output_attentions=True` is passed or when
+            `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape
+            `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when
+            `output_attentions=True` and `config.add_cross_attention=True` is passed
+            or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape
+            `(batch_size, num_heads, sequence_length, sequence_length)`.
+ + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that + can be used (see `past_key_values` input) to speed up sequential + decoding. + """ + + last_hidden_state: Tensor = None + pooler_output: Tensor = None + hidden_states: Tensor = None + past_key_values: Tensor = None + attentions: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutputBase): + """ + Base class for model's outputs that may also contain a past key/values (to + speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + + If `past_key_values` is used only the last hidden-state of the + sequences of shape `(batch_size, 1, hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that + can be used (see `past_key_values` input) to speed up sequential + decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` and `config.add_cross_attention=True` is passed + or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. 
+ + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + """ + + last_hidden_state: Tensor = None + past_key_values: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutputBase): + """ + Base class for model encoder's outputs that also contains : pre-computed + hidden states that can speed up sequential decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + decoder of the model. + + If `past_key_values` is used only the last hidden-state of the + sequences of shape `(batch_size, 1, hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and in the cross-attention blocks) that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the + optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the + optional initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + """ + + last_hidden_state: Tensor = None + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tensor]] = None + decoder_attentions: Optional[Tuple[Tensor]] = None + cross_attentions: Optional[Tuple[Tensor]] = None + encoder_last_hidden_state: Optional[Tensor] = None + encoder_hidden_states: Optional[Tuple[Tensor]] = None + encoder_attentions: Optional[Tuple[Tensor]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutputBase): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.vocab_size)`): + Prediction scores of the language modeling head (scores for each + vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and in the cross-attention blocks) that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the + initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder of the model. 
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the + initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + """ + + loss: Optional[Tensor] = None + logits: Tensor = None + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tensor]] = None + decoder_attentions: Optional[Tuple[Tensor]] = None + cross_attentions: Optional[Tuple[Tensor]] = None + encoder_last_hidden_state: Optional[Tensor] = None + encoder_hidden_states: Optional[Tuple[Tensor]] = None + encoder_attentions: Optional[Tuple[Tensor]] = None diff --git a/modelscope/outputs.py b/modelscope/outputs/outputs.py similarity index 93% rename from modelscope/outputs.py rename to modelscope/outputs/outputs.py index 34bde76a..721fb271 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs/outputs.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from collections import OrderedDict, namedtuple +from dataclasses import dataclass, fields from modelscope.utils.constant import Tasks @@ -488,7 +490,6 @@ TASK_OUTPUTS = { # ] # } Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], - Tasks.part_of_speech: [OutputKeys.OUTPUT, OutputKeys.LABELS], # TODO @wenmeng.zwm support list of result check # named entity recognition result for single sample @@ -499,6 +500,7 @@ TASK_OUTPUTS = { # ] # } Tasks.named_entity_recognition: [OutputKeys.OUTPUT], + Tasks.part_of_speech: [OutputKeys.OUTPUT], # text_error_correction result for a single sample # { @@ -779,3 +781,60 @@ TASK_OUTPUTS = { # } Tasks.product_segmentation: [OutputKeys.MASKS], } + + +class ModelOutputBase(list): + + def __post_init__(self): + self.reconstruct() + self.post_init = True + + def reconstruct(self): + # Low performance, but low frequency. 
+ self.clear() + for idx, key in enumerate(self.keys()): + self.append(getattr(self, key)) + + def __getitem__(self, item): + if isinstance(item, str): + if hasattr(self, item): + return getattr(self, item) + elif isinstance(item, (int, slice)): + return super().__getitem__(item) + raise IndexError(f'No Index {item} found in the dataclass.') + + def __setitem__(self, key, value): + if isinstance(key, str): + if key in [f.name for f in fields(self)]: + if key not in self.keys(): + super().__setattr__(key, value) + self.reconstruct() + elif id(getattr(self, key)) != id(value): + super().__setattr__(key, value) + super().__setitem__(self.keys().index(key), value) + else: + super().__setattr__(key, value) + elif isinstance(key, int): + super().__setitem__(key, value) + key_name = self.keys()[key] + super().__setattr__(key_name, value) + + def __setattr__(self, key, value): + if getattr(self, 'post_init', False): + return self.__setitem__(key, value) + else: + return super().__setattr__(key, value) + + def keys(self): + return [ + f.name for f in fields(self) if getattr(self, f.name) is not None + ] + + def items(self): + return self.to_dict().items() + + def to_dict(self): + output = OrderedDict() + for key in self.keys(): + output[key] = getattr(self, key) + return output diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 644749fc..bca80502 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -33,7 +33,7 @@ if is_tf_available(): Tensor = Union['torch.Tensor', 'tf.Tensor'] Input = Union[str, tuple, MsDataset, 'Image.Image', 'numpy.ndarray'] -InputModel = Union[str, Model] +InputModel = Union[str, Model, 'torch.nn.Module'] logger = get_logger() @@ -49,13 +49,7 @@ class Pipeline(ABC): return Model.from_pretrained( model, model_prefetched=True, device=self.device_name) if is_model(model) else model - elif isinstance(model, Model): - return model else: - if model and not isinstance(model, str): - raise ValueError( - f'model type for single model is either str or Model, but got type {type(model)}' - ) return model def initiate_multiple_models(self, input_models: List[InputModel]): @@ -139,12 +133,10 @@ class Pipeline(ABC): def _get_framework(self) -> str: frameworks = [] for m in self.models: - if isinstance(m, Model): - model_dir = m.model_dir - else: - assert isinstance(m, - str), 'model should be either str or Model.' + if isinstance(m, str): model_dir = m + else: + model_dir = m.model_dir cfg_file = osp.join(model_dir, ModelFile.CONFIGURATION) cfg = Config.from_file(cfg_file) frameworks.append(cfg.framework) @@ -387,10 +379,13 @@ class DistributedPipeline(Pipeline): def _instantiate_one(cls, rank, model_dir, **kwargs): """Instantiate one model piece. - @param rank: The model rank. - @param model_dir: The model_dir in the node. - @param kwargs: Any extra args. - @return: None. The model handler should be kept in the class field. + Args: + rank: The model rank. + model_dir: The model_dir in the node. + kwargs: Any extra args. + + Returns: + None. The model handler should be kept in the class field. """ pass @@ -410,8 +405,11 @@ class DistributedPipeline(Pipeline): Use the model handler kept in the class field to forward. - @param inputs: The inputs after the preprocessing. - @return: The forward results. + Args: + inputs: The inputs after the preprocessing. + + Returns: + The forward results. 
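Since `ModelOutputBase` above is both a dataclass container and a `list` subclass, the new output types can be read by attribute, by key, or by position. A small usage sketch, assuming the classes are exported from `modelscope.outputs` as wired up in `outputs/__init__.py`:

import torch

from modelscope.outputs import TextClassificationModelOutput

out = TextClassificationModelOutput(logits=torch.randn(1, 2))
assert out['logits'] is out.logits is out[0]  # key, attribute and index access agree
print(out.keys())     # ['logits'] -- only fields that are not None are listed
print(out.to_dict())  # OrderedDict([('logits', tensor([[...]]))])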
""" pass @@ -429,7 +427,7 @@ def collate_fn(data, device): """ from torch.utils.data.dataloader import default_collate - from modelscope.preprocessors import InputFeatures + from modelscope.preprocessors.nlp import InputFeatures if isinstance(data, dict) or isinstance(data, Mapping): return type(data)({k: collate_fn(v, device) for k, v in data.items()}) elif isinstance(data, (tuple, list)): diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index e1583387..498c9ed8 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -285,9 +285,6 @@ def pipeline(task: str = None, if task is None and pipeline_name is None: raise ValueError('task or pipeline_name is required') - assert isinstance(model, (type(None), str, Model, list)), \ - f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}' - model = normalize_model_input(model, model_revision) if pipeline_name is None: # get default pipeline for this task @@ -304,8 +301,7 @@ def pipeline(task: str = None, else: # used for test case, when model is str and is not hub path pipeline_name = get_pipeline_by_model_name(task, model) - elif isinstance(model, Model) or \ - (isinstance(model, list) and isinstance(model[0], Model)): + elif model is not None: # get pipeline info from Model object first_model = model[0] if isinstance(model, list) else model if not hasattr(first_model, 'pipeline'): diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 677151c0..73bd0d8c 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .automatic_post_editing_pipeline import AutomaticPostEditingPipeline from .conversational_text_to_sql_pipeline import ConversationalTextToSqlPipeline + from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline from .dialog_modeling_pipeline import DialogModelingPipeline from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline @@ -14,16 +15,13 @@ if TYPE_CHECKING: from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline - from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .text_ranking_pipeline import TextRankingPipeline from .sentence_embedding_pipeline import SentenceEmbeddingPipeline - from .sequence_classification_pipeline import SequenceClassificationPipeline + from .text_classification_pipeline import TextClassificationPipeline from .summarization_pipeline import SummarizationPipeline - from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline - from .text_classification_pipeline import TextClassificationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline from .text_generation_pipeline import TextGenerationPipeline from .text2text_generation_pipeline import Text2TextGenerationPipeline @@ -47,13 +45,11 @@ else: 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], 'feature_extraction_pipeline': 
['FeatureExtractionPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], - 'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'], 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': ['NamedEntityRecognitionPipeline'], 'text_ranking_pipeline': ['TextRankingPipeline'], 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], - 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], 'table_question_answering_pipeline': ['TableQuestionAnsweringPipeline'], diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index 73c6429d..48df0c40 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -11,8 +11,6 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import ConversationalTextToSqlPreprocessor -from modelscope.preprocessors.star.fields import (SubPreprocessor, - process_tables) from modelscope.utils.constant import Tasks __all__ = ['ConversationalTextToSqlPipeline'] @@ -39,17 +37,6 @@ class ConversationalTextToSqlPipeline(Pipeline): if preprocessor is None: preprocessor = ConversationalTextToSqlPreprocessor(model.model_dir) - preprocessor.device = 'cuda' if \ - ('device' not in kwargs or kwargs['device'] == 'gpu') \ - and torch.cuda.is_available() else 'cpu' - use_device = True if preprocessor.device == 'cuda' else False - preprocessor.processor = \ - SubPreprocessor(model_dir=model.model_dir, - db_content=True, - use_gpu=use_device) - preprocessor.output_tables = \ - process_tables(preprocessor.processor, - preprocessor.tables) super().__init__(model=model, preprocessor=preprocessor, **kwargs) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 79d32ace..9520c06f 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -4,7 +4,7 @@ from typing import Any, Dict, Union from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SpaceForDialogStateTracking +from modelscope.models.nlp import SpaceForDST from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES @@ -20,7 +20,7 @@ __all__ = ['DialogStateTrackingPipeline'] class DialogStateTrackingPipeline(Pipeline): def __init__(self, - model: Union[SpaceForDialogStateTracking, str], + model: Union[SpaceForDST, str], preprocessor: DialogStateTrackingPreprocessor = None, **kwargs): """use `model` and `preprocessor` to create a dialog state tracking pipeline for @@ -33,8 +33,7 @@ class DialogStateTrackingPipeline(Pipeline): """ model = model if isinstance( - model, - SpaceForDialogStateTracking) else Model.from_pretrained(model) + model, SpaceForDST) else Model.from_pretrained(model) self.model = model if preprocessor is None: preprocessor = DialogStateTrackingPreprocessor(model.model_dir) diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py index e5c05e86..8499f7ff 100644 --- 
a/modelscope/pipelines/nlp/distributed_plug_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -27,7 +27,8 @@ class DistributedPlugPipeline(DistributedPipeline): **kwargs): """Create a plug pipeline instance. - @param model: The model_id of plug(damo/nlp_plug_text-generation_27B). + Args: + model: The model_id of plug(damo/nlp_plug_text-generation_27B). The default path to damo/nlp_plug_text-generation_27B can be obtained by function get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to this path before calling this class by model_id. @@ -52,11 +53,11 @@ class DistributedPlugPipeline(DistributedPipeline): |_ mp_rank_05_model_states.pt |_ mp_rank_06_model_states.pt |_ mp_rank_07_model_states.pt - @param preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will + preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will be used as default. - @param first_sequence: The first_sequence key name if the input format is a dict. - @param kwargs: - sequence_length: The input sequence_length. + first_sequence: The first_sequence key name if the input format is a dict. + kwargs: + sequence_length: The input sequence_length. """ if preprocessor is None: preprocessor = TextGenerationPreprocessor( diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 1d46d8fd..fd614e91 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -2,15 +2,12 @@ from typing import Any, Dict, Union -import torch - from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForFaqQuestionAnswering from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['FaqQuestionAnsweringPipeline'] @@ -21,19 +18,19 @@ __all__ = ['FaqQuestionAnsweringPipeline'] class FaqQuestionAnsweringPipeline(Pipeline): def __init__(self, - model: Union[str, SbertForFaqQuestionAnswering], - preprocessor: FaqQuestionAnsweringPreprocessor = None, + model: Union[str, Model], + preprocessor: Preprocessor = None, **kwargs): - model = model if isinstance( - model, - SbertForFaqQuestionAnswering) else Model.from_pretrained(model) - model.eval() + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = FaqQuestionAnsweringPreprocessor( + preprocessor = Preprocessor.from_pretrained( model.model_dir, **kwargs) - self.preprocessor = preprocessor - super(FaqQuestionAnsweringPipeline, self).__init__( - model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None: + from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + preprocessor = FaqQuestionAnsweringPreprocessor( + model.model_dir, **kwargs) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, pipeline_parameters, pipeline_parameters @@ -46,8 +43,7 @@ class FaqQuestionAnsweringPipeline(Pipeline): def forward(self, inputs: [list, Dict[str, Any]], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return 
self.model(inputs) + return self.model(inputs) def postprocess(self, inputs: [list, Dict[str, Any]], **postprocess_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 3d515e2d..0f3446e6 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -1,145 +1,103 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLPPreprocessor, Preprocessor -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Tasks __all__ = ['FillMaskPipeline'] -_type_map = { - 'veco': 'roberta', - 'sbert': 'bert', -} @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask) +@PIPELINES.register_module( + Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet) class FillMaskPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', + first_sequence: str = 'sentence', **kwargs): - """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction + """The inference pipeline for all the fill mask sub-tasks. Args: - model (str or Model): Supply either a local model dir which supported mlm task, or a - mlm model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + model (`str` or `Model` or module instance): A model instance or a model local dir + or a model id in the model hub. + preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. + first_sequence (`str`, `optional`): The key to read the sentence in. + sequence_length (`int`, `optional`): Max sequence length in the user's custom scenario, default 128. + + NOTE1: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' param will have no effect. - Example: + Example1: >>> from modelscope.pipelines import pipeline >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_structbert_fill-mask_english-large') >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' >>> print(pipeline_ins(input)) + Example2: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_ponet_fill-mask_english-base') + >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' + >>> print(pipeline_ins(input)) NOTE2: Please pay attention to the model's special tokens. If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. To view other examples plese check the tests/pipelines/test_fill_mask.py. 
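The rewritten postprocess a few lines below now relies on the preprocessor's mask_id and decode(); its core id-replacement step can be sketched in plain numpy with toy shapes and a made-up mask id (103), for illustration only.

import numpy as np

logits = np.random.rand(1, 6, 32000)                       # (batch, seq_len, vocab_size)
input_ids = np.array([[101, 2023, 103, 2003, 103, 102]])   # 103 stands in for preprocessor.mask_id
pred_ids = np.argmax(logits, axis=-1)                      # best token id per position
rst_ids = np.where(input_ids == 103, pred_ids, input_ids)  # only masked positions are replaced
# each row of rst_ids is then decoded back to text, e.g. via the preprocessor's decode()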
""" - fill_mask_model = model if isinstance( - model, Model) else Model.from_pretrained(model) + + fill_mask_model = Model.from_pretrained(model) if isinstance( + model, str) else model if preprocessor is None: - preprocessor = NLPPreprocessor( + preprocessor = Preprocessor.from_pretrained( fill_mask_model.model_dir, first_sequence=first_sequence, second_sequence=None, sequence_length=kwargs.pop('sequence_length', 128)) fill_mask_model.eval() + assert hasattr( + preprocessor, 'mask_id' + ), 'The input preprocessor should have the mask_id attribute.' super().__init__( model=fill_mask_model, preprocessor=preprocessor, **kwargs) - self.preprocessor = preprocessor - self.config = Config.from_file( - os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) - self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103, 'deberta_v2': 4} - - self.rep_map = { - 'bert': { - '[unused0]': '', - '[PAD]': '', - '[unused1]': '', - r' +': ' ', - '[SEP]': '', - '[unused2]': '', - '[CLS]': '', - '[UNK]': '' - }, - 'roberta': { - r' +': ' ', - '': '', - '': '', - '': '', - '': '', - '': ' ' - }, - 'deberta_v2': { - '[PAD]': '', - r' +': ' ', - '[SEP]': '', - '[CLS]': '', - '[UNK]': '' - }, - } - def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: """process the prediction results Args: - inputs (Dict[str, Any]): _description_ - + inputs (Dict[str, Any]): The model outputs. + The output should follow some rules: + 1. Values can be retrieved by keys(dict-like, or the __getitem__ method is overriden) + 2. 'logits' and 'input_ids' key exists. + Models in modelscope will return the output dataclass `modelscope.outputs.FillMaskModelOutput`. Returns: Dict[str, str]: the prediction results """ - import numpy as np logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) - if hasattr(self.model.config, 'backbone'): - model_type = self.model.config.backbone.type - else: - model_type = self.model.config.model_type - process_type = model_type if model_type in self.mask_id else _type_map[ - model_type] - rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, + rst_ids = np.where(input_ids == self.preprocessor.mask_id, pred_ids, input_ids) - def rep_tokens(string, rep_map): - for k, v in rep_map.items(): - string = string.replace(k, v) - return string.strip() - pred_strings = [] for ids in rst_ids: # batch - if 'language' in self.config.model and self.config.model.language == 'zh': - pred_string = self.tokenizer.convert_ids_to_tokens(ids) - pred_string = ''.join(pred_string) - else: - pred_string = self.tokenizer.decode(ids) - pred_string = rep_tokens(pred_string, self.rep_map[process_type]) + pred_string = self.preprocessor.decode( + ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=True) pred_strings.append(pred_string) return {OutputKeys.TEXT: pred_strings} diff --git a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py deleted file mode 100644 index 9770fc38..00000000 --- a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -import os -from typing import Any, Dict, Optional, Union - -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPoNetPreprocessor, Preprocessor -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile, Tasks - -__all__ = ['FillMaskPonetPipeline'] -_type_map = {'ponet': 'bert'} - - -@PIPELINES.register_module( - Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet) -class FillMaskPonetPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction - - Args: - model (str or Model): Supply either a local model dir which supported fill-mask task, - or a fill-mask model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the sentence in. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline( - 'fill-mask', model='damo/nlp_ponet_fill-mask_english-base') - >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' - >>> print(pipeline_ins(input)) - - NOTE2: Please pay attention to the model's special tokens. - If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. - If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. - To view other examples plese check the tests/pipelines/test_fill_mask.py. 
- """ - fill_mask_model = model if isinstance( - model, Model) else Model.from_pretrained(model) - - self.config = Config.from_file( - os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) - - if preprocessor is None: - preprocessor = FillMaskPoNetPreprocessor( - fill_mask_model.model_dir, - first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 512)) - - fill_mask_model.eval() - super().__init__( - model=fill_mask_model, preprocessor=preprocessor, **kwargs) - - self.preprocessor = preprocessor - - self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103} - - self.rep_map = { - 'bert': { - '[unused0]': '', - '[PAD]': '', - '[unused1]': '', - r' +': ' ', - '[SEP]': '', - '[unused2]': '', - '[CLS]': '', - '[UNK]': '' - }, - 'roberta': { - r' +': ' ', - '': '', - '': '', - '': '', - '': '', - '': ' ' - } - } - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - import numpy as np - logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() - input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() - pred_ids = np.argmax(logits, axis=-1) - model_type = self.model.config.model_type - process_type = model_type if model_type in self.mask_id else _type_map[ - model_type] - rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, - input_ids) - - def rep_tokens(string, rep_map): - for k, v in rep_map.items(): - string = string.replace(k, v) - return string.strip() - - pred_strings = [] - for ids in rst_ids: # batch - if 'language' in self.config.model and self.config.model.language == 'zh': - pred_string = self.tokenizer.convert_ids_to_tokens(ids) - pred_string = ''.join(pred_string) - else: - pred_string = self.tokenizer.decode(ids) - pred_string = rep_tokens(pred_string, self.rep_map[process_type]) - pred_strings.append(pred_string) - - return {OutputKeys.TEXT: pred_strings} diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 7275feca..8d8c4542 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -12,6 +12,8 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['NamedEntityRecognitionPipeline'] @@ -59,37 +61,68 @@ class NamedEntityRecognitionPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): - return super().forward(inputs, **forward_params) + return { + **self.model(**inputs, **forward_params), OutputKeys.TEXT: text + } def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): should be tensors from model + + Returns: + Dict[str, str]: the prediction results + """ text = inputs['text'] + if OutputKeys.PREDICTIONS not in inputs: + logits = 
inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + else: + predictions = inputs[OutputKeys.PREDICTIONS].squeeze( + 0).cpu().numpy() + predictions = torch_nested_numpify(torch_nested_detach(predictions)) offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] - labels = [self.id2label[x] for x in inputs['predicts']] - entities = [] - entity = {} + + labels = [self.id2label[x] for x in predictions] + chunks = [] + chunk = {} for label, offsets in zip(labels, offset_mapping): if label[0] in 'BS': - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - entity = { + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { 'type': label[2:], 'start': offsets[0], 'end': offsets[1] } if label[0] in 'IES': - if entity: - entity['end'] = offsets[1] + if chunk: + chunk['end'] = offsets[1] + if label[0] in 'ES': - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - entity = {} - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - outputs = {OutputKeys.OUTPUT: entities} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + + # for cws output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + # for ner outpus + else: + outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index 16dedb2e..cfa5c2f1 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -2,15 +2,14 @@ from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - SentenceEmbeddingPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['SentenceEmbeddingPipeline'] @@ -33,20 +32,18 @@ class SentenceEmbeddingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. 
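The postprocess of this sentence embedding pipeline, rewritten just below, scores the first sentence against the remaining ones with a raw dot product over the [CLS] embeddings taken from last_hidden_state[:, 0]; a standalone numpy sketch, with random embeddings as illustrative stand-ins:

import numpy as np

embs = np.random.rand(3, 768)   # row 0: source sentence, rows 1..n: candidate sentences
scores = np.dot(embs[0:1], np.transpose(embs[1:], (1, 0))).tolist()[0]
# one raw (un-normalized) dot-product score per candidate sentence

As in the pipeline code, the scores list is left empty when only a single sentence is supplied.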
""" - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = SentenceEmbeddingPreprocessor( + preprocessor = Preprocessor.from_pretrained( model.model_dir if isinstance(model, Model) else model, first_sequence=first_sequence, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return {**self.model(inputs, **forward_params)} + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """process the prediction results @@ -57,6 +54,11 @@ class SentenceEmbeddingPipeline(Pipeline): Returns: Dict[str, Any]: the predicted text representation """ - embs = inputs[OutputKeys.TEXT_EMBEDDING] - scores = inputs[OutputKeys.SCORES] + embs = inputs['last_hidden_state'][:, 0].cpu().numpy() + num_sent = embs.shape[0] + if num_sent >= 2: + scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ], + (1, 0))).tolist()[0] + else: + scores = [] return {OutputKeys.TEXT_EMBEDDING: embs, OutputKeys.SCORES: scores} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py deleted file mode 100644 index 69f6217a..00000000 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - SequenceClassificationPreprocessor) -from modelscope.utils.constant import Tasks - - -@PIPELINES.register_module( - Tasks.text_classification, module_name=Pipelines.sentiment_analysis) -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -@PIPELINES.register_module( - Tasks.text_classification, module_name=Pipelines.sentiment_classification) -class SequenceClassificationPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - **kwargs): - """This is the base class for all the sequence classification sub-tasks. - - Args: - model (str or Model): A model instance or a model local dir or a model id in the model hub. - preprocessor (Preprocessor): a preprocessor instance, must not be None. 
- """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - first_sequence = kwargs.pop('first_sequence', 'first_sequence') - second_sequence = kwargs.pop('second_sequence', None) - - if preprocessor is None: - preprocessor = SequenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - - assert preprocessor is not None - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - topk (int): The topk probs to take - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs[OutputKeys.PROBABILITIES][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index fc0d07b1..826e35a9 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -13,9 +13,9 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import TableQuestionAnsweringPreprocessor -from modelscope.preprocessors.space_T_cn.fields.database import Database -from modelscope.preprocessors.space_T_cn.fields.struct import (Constant, - SQLQuery) +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.struct import (Constant, + SQLQuery) from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index 13d9964d..9e00ad7f 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -1,43 +1,124 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict, Union +import numpy as np + from modelscope.metainfo import Pipelines +from modelscope.models.base import Model from modelscope.models.multi_modal import OfaForAllTasks -from modelscope.pipelines.base import Model, Pipeline +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import OfaPreprocessor, Preprocessor from modelscope.utils.constant import Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentiment_analysis) +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) @PIPELINES.register_module( Tasks.text_classification, module_name=Pipelines.text_classification) +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentiment_classification) +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentence_similarity) +@PIPELINES.register_module( + Tasks.sentiment_classification, + module_name=Pipelines.sentiment_classification) class TextClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: [Preprocessor] = None, + preprocessor: Preprocessor = None, **kwargs): + """The inference pipeline for all the text classification sub-tasks. + + Args: + model (`str` or `Model` or module instance): A model instance or a model local dir + or a model id in the model hub. + preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. + first_sequence (`str`, `optional`): The key of the first sentence. + second_sequence (`str`, `optional`): The key of the second sentence. + sequence_length (`int`, `optional`): The sequence length. + id2label (`dict`, `optional`): The id-label mapping. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', + model='damo/nlp_structbert_sentence-similarity_chinese-base') + >>> input = ('这是个测试', '这也是个测试') + >>> print(pipeline_ins(input)) + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' and 'second_sequence' + param will have no affection. 
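The postprocess added further down in this class converts the logits to probabilities with a softmax and maps a top-k partial sort through id2label; a standalone numpy sketch (the label names are hypothetical):

import numpy as np

logits = np.array([2.0, 0.5, -1.2])                      # single-sample logits
exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
probs = exp / exp.sum(axis=-1, keepdims=True)
topk = 2
top_indices = np.argpartition(probs, -topk)[-topk:]      # indices of the top-k probs (not fully sorted)
id2label = {0: 'positive', 1: 'neutral', 2: 'negative'}  # hypothetical mapping
print([float(probs[i]) for i in top_indices])
print([id2label[int(i)] for i in top_indices])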
""" - use `model` and `preprocessor` to create a kws pipeline for prediction + model = Model.from_pretrained(model) if isinstance(model, + str) else model + + if preprocessor is None: + if isinstance(model, OfaForAllTasks): + preprocessor = OfaPreprocessor(model_dir=model.model_dir) + else: + first_sequence = kwargs.pop('first_sequence', 'first_sequence') + second_sequence = kwargs.pop('second_sequence', None) + preprocessor = Preprocessor.from_pretrained( + model if isinstance(model, str) else model.model_dir, + first_sequence=first_sequence, + second_sequence=second_sequence, + sequence_length=kwargs.pop('sequence_length', 512)) + + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + if isinstance(self.model, OfaForAllTasks): + return super().forward(inputs, **forward_params) + return self.model(**inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + Args: - model: model id on modelscope hub. + inputs (`Dict[str, Any]` or `TextClassificationModelOutput`): The model output, please check + the `TextClassificationModelOutput` class for details. + topk (int): The topk probs to take + Returns: + Dict[str, str]: the prediction results. + scores: The probabilities of each label. + labels: The real labels. + Label at index 0 is the smallest probability. """ - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model + if isinstance(self.model, OfaForAllTasks): + return inputs else: - raise NotImplementedError - pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' 
+ logits = inputs[OutputKeys.LOGITS].cpu().numpy() + if logits.shape[0] == 1: + logits = logits[0] + + def softmax(logits): + exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True)) + return exp / exp.sum(axis=-1, keepdims=True) + + probs = softmax(logits) + num_classes = probs.shape[-1] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + probs = np.take_along_axis(probs, top_indices, axis=-1).tolist() + + def map_to_label(id): + return self.id2label[id] + + v_func = np.vectorize(map_to_label) + return { + OutputKeys.SCORES: probs, + OutputKeys.LABELS: v_func(top_indices).tolist() + } diff --git a/modelscope/pipelines/nlp/text_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py index 4aa57238..9cee327b 100644 --- a/modelscope/pipelines/nlp/text_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model @@ -32,20 +32,18 @@ class TextRankingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. """ - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = TextRankingPreprocessor( - model.model_dir if isinstance(model, Model) else model, + preprocessor = Preprocessor.from_pretrained( + model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return {**self.model(inputs, **forward_params)} + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """process the prediction results @@ -55,6 +53,10 @@ class TextRankingPipeline(Pipeline): Returns: Dict[str, Any]: the predicted text representation """ - pred_list = inputs[OutputKeys.SCORES] + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = inputs[OutputKeys.LOGITS].squeeze(-1).detach().cpu().numpy() + pred_list = sigmoid(logits).tolist() return {OutputKeys.SCORES: pred_list} diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 055a4b8a..c36f0dfc 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -7,17 +7,22 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - TokenClassificationPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['TokenClassificationPipeline'] @PIPELINES.register_module( Tasks.token_classification, module_name=Pipelines.part_of_speech) +@PIPELINES.register_module( + Tasks.token_classification, module_name=Pipelines.word_segmentation) +@PIPELINES.register_module( + 
Tasks.token_classification, module_name=Pipelines.named_entity_recognition) @PIPELINES.register_module( Tasks.part_of_speech, module_name=Pipelines.part_of_speech) class TokenClassificationPipeline(Pipeline): @@ -32,24 +37,18 @@ class TokenClassificationPipeline(Pipeline): model (str or Model): A model instance or a model local dir or a model id in the model hub. preprocessor (Preprocessor): a preprocessor instance, must not be None. """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model + if preprocessor is None: - preprocessor = TokenClassificationPreprocessor( + preprocessor = Model.from_pretrained( model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - if hasattr(model, 'id2label'): - self.id2label = getattr(model, 'id2label') - else: - model_config = getattr(model, 'config') - self.id2label = getattr(model_config, 'id2label') - - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: @@ -64,38 +63,59 @@ class TokenClassificationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): _description_ + inputs (Dict[str, Any]): should be tensors from model Returns: Dict[str, str]: the prediction results """ + text = inputs['text'] + if not hasattr(inputs, 'predictions'): + logits = inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + else: + predictions = inputs[OutputKeys.PREDICTIONS].squeeze( + 0).cpu().numpy() + predictions = torch_nested_numpify(torch_nested_detach(predictions)) + offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] - pred_list = inputs['predictions'] - labels = [] - for pre in pred_list: - labels.append(self.id2label[pre]) - labels = labels[1:-1] + labels = [self.id2label[x] for x in predictions] + if len(labels) > len(offset_mapping): + labels = labels[1:-1] chunks = [] - tags = [] - chunk = '' - assert len(inputs['text']) == len(labels) - for token, label in zip(inputs['text'], labels): - if label[0] == 'B' or label[0] == 'I': - chunk += token - else: - chunk += token - chunks.append(chunk) - chunk = '' - tags.append(label.split('-')[-1]) + chunk = {} + for label, offsets in zip(labels, offset_mapping): + if label[0] in 'BS': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { + 'type': label[2:], + 'start': offsets[0], + 'end': offsets[1] + } + if label[0] in 'IES': + if chunk: + chunk['end'] = offsets[1] + + if label[0] in 'ES': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - tags.append(label.split('-')[-1]) - pos_result = [] - seg_result = ' '.join(chunks) - for chunk, tag in zip(chunks, tags): - pos_result.append({OutputKeys.WORD: chunk, OutputKeys.LABEL: tag}) - outputs = { - OutputKeys.OUTPUT: seg_result, - OutputKeys.LABELS: pos_result - } + + # for cws 
output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for ner outputs + else: + outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index eb7f7f74..68a03631 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -34,7 +34,8 @@ class TranslationPipeline(Pipeline): def __init__(self, model: Model, **kwargs): """Build a translation pipeline with a model dir or a model id in the model hub. - @param model: A Model instance. + Args: + model: A Model instance. """ super().__init__(model=model, **kwargs) model = self.model.model_dir diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 9d4bb67f..0df8f1ad 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -12,6 +12,8 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['WordSegmentationPipeline'] @@ -72,28 +74,56 @@ class WordSegmentationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): _description_ + inputs (Dict[str, Any]): should be tensors from model Returns: Dict[str, str]: the prediction results """ - - pred_list = inputs['predictions'] - labels = [] - for pre in pred_list: - labels.append(self.id2label[pre]) - labels = labels[1:-1] + text = inputs['text'] + logits = inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + logits = torch_nested_numpify(torch_nested_detach(logits)) + predictions = torch_nested_numpify(torch_nested_detach(predictions)) + offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] + + labels = [self.id2label[x] for x in predictions] + if len(labels) > len(offset_mapping): + labels = labels[1:-1] chunks = [] - chunk = '' - assert len(inputs['text']) == len(labels) - for token, label in zip(inputs['text'], labels): - if label[0] == 'B' or label[0] == 'I': - chunk += token - else: - chunk += token - chunks.append(chunk) - chunk = '' + chunk = {} + for label, offsets in zip(labels, offset_mapping): + if label[0] in 'BS': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { + 'type': label[2:], + 'start': offsets[0], + 'end': offsets[1] + } + if label[0] in 'IES': + if chunk: + chunk['end'] = offsets[1] + + if label[0] in 'ES': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - seg_result = ' '.join(chunks) - return {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for cws output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for ner outpus + else: + outputs = {OutputKeys.OUTPUT: chunks} + return outputs diff --git 
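The span decoding shared by the rewritten NER, token classification and word segmentation postprocesses above folds BIOES labels and character offsets into typed spans; for word segmentation the spans are then joined with spaces. A self-contained sketch with made-up labels and offsets:

# text, labels and offset_mapping are illustrative stand-ins for real model output
text = 'Alibaba Group is based in Hangzhou'
labels = ['B-ORG', 'E-ORG', 'O', 'O', 'O', 'S-LOC']
offset_mapping = [(0, 7), (8, 13), (14, 16), (17, 22), (23, 25), (26, 34)]

chunks, chunk = [], {}
for label, (start, end) in zip(labels, offset_mapping):
    if label[0] in 'BS':                 # a new span starts; flush any unfinished one
        if chunk:
            chunk['span'] = text[chunk['start']:chunk['end']]
            chunks.append(chunk)
        chunk = {'type': label[2:], 'start': start, 'end': end}
    if label[0] in 'IES':                # the current span extends to this token
        if chunk:
            chunk['end'] = end
    if label[0] in 'ES':                 # the current span ends here
        if chunk:
            chunk['span'] = text[chunk['start']:chunk['end']]
            chunks.append(chunk)
        chunk = {}
if chunk:                                # flush a trailing unterminated span
    chunk['span'] = text[chunk['start']:chunk['end']]
    chunks.append(chunk)
print(chunks)
# [{'type': 'ORG', 'start': 0, 'end': 13, 'span': 'Alibaba Group'},
#  {'type': 'LOC', 'start': 26, 'end': 34, 'span': 'Hangzhou'}]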
a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index fc7051c7..88792b45 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -86,8 +86,7 @@ class ZeroShotClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -99,7 +98,7 @@ class ZeroShotClassificationPipeline(Pipeline): Returns: Dict[str, Any]: the prediction results """ - logits = inputs[OutputKeys.LOGITS] + logits = inputs[OutputKeys.LOGITS].cpu().numpy() if multi_label or len(candidate_labels) == 1: logits = logits[..., [self.contradiction_id, self.entailment_id]] scores = softmax(logits, axis=-1)[..., 1] diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 423b3f46..76c6d877 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,31 +16,20 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - DocumentSegmentationPreprocessor, - FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, - NLPPreprocessor, - NLPTokenizerPreprocessorBase, - TextRankingPreprocessor, - RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, - TextErrorCorrectionPreprocessor, - TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, - Tokenize, + DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, NLPPreprocessor, + NLPTokenizerPreprocessorBase, TextRankingPreprocessor, + RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, + SequenceClassificationPreprocessor, TokenClassificationPreprocessor, + TextErrorCorrectionPreprocessor, TextGenerationPreprocessor, + Text2TextGenerationPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, - TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, - ) - from .space import (DialogIntentPredictionPreprocessor, - DialogModelingPreprocessor, - DialogStateTrackingPreprocessor) + ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, + SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, + DialogModelingPreprocessor, DialogStateTrackingPreprocessor, + ConversationalTextToSqlPreprocessor, + TableQuestionAnsweringPreprocessor) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor - from .star import ConversationalTextToSqlPreprocessor - from .space_T_cn import TableQuestionAnsweringPreprocessor else: _import_structure = { @@ -58,30 +47,22 @@ else: 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', - 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', + 'FaqQuestionAnsweringPreprocessor', 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', + 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', 'SentenceEmbeddingPreprocessor', 'SequenceClassificationPreprocessor', 'TokenClassificationPreprocessor', - 
'TextErrorCorrectionPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'Text2TextGenerationPreprocessor', + 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', - 'TextGenerationJiebaPreprocessor', - 'SentencePiecePreprocessor', - ], - 'space': [ + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', - 'DialogStateTrackingPreprocessor', 'InputFeatures' + 'DialogStateTrackingPreprocessor', + 'ConversationalTextToSqlPreprocessor', + 'TableQuestionAnsweringPreprocessor' ], - 'star': ['ConversationalTextToSqlPreprocessor'], - 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index 6360a907..c2716a13 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -1,15 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os from abc import ABC, abstractmethod -from typing import Any, Dict +from copy import deepcopy +from typing import Any, Dict, Optional, Sequence -from modelscope.utils.constant import ModeKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys, Tasks +from modelscope.utils.hub import read_config, snapshot_download +from modelscope.utils.logger import get_logger +from .builder import build_preprocessor + +logger = get_logger(__name__) class Preprocessor(ABC): - def __init__(self, *args, **kwargs): - self._mode = ModeKeys.INFERENCE + def __init__(self, mode=ModeKeys.INFERENCE, *args, **kwargs): + self._mode = mode self.device = int( os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @@ -25,3 +32,61 @@ class Preprocessor(ABC): @mode.setter def mode(self, value): self._mode = value + + @classmethod + def from_pretrained(cls, + model_name_or_path: str, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cfg_dict: Config = None, + preprocessor_mode=ModeKeys.INFERENCE, + **kwargs): + """ Instantiate a model from local directory or remote model repo. Note + that when loading from remote, the model revision can be specified. 
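A usage sketch of this new entry point, assuming a hub model whose configuration.json declares a preprocessor section; the model id is the sentence-similarity model already used in the pipeline examples of this patch, and the pair input mirrors those examples:

from modelscope.preprocessors import Preprocessor
from modelscope.utils.constant import ModeKeys

# build the inference-time preprocessor declared in the model's configuration.json
preprocessor = Preprocessor.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
features = preprocessor(('这是个测试', '这也是个测试'))

# the training-time variant can be requested when the config defines a 'train' sub-section
train_preprocessor = Preprocessor.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base',
    preprocessor_mode=ModeKeys.TRAIN)

If the configuration carries no usable preprocessor entry, the method logs an error and returns None, so callers should check the return value.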
+ """ + if not os.path.exists(model_name_or_path): + model_dir = snapshot_download( + model_name_or_path, revision=revision) + else: + model_dir = model_name_or_path + if cfg_dict is None: + cfg = read_config(model_dir) + else: + cfg = cfg_dict + task = cfg.task + if 'task' in kwargs: + task = kwargs.pop('task') + field_name = Tasks.find_field_by_task(task) + if not hasattr(cfg, 'preprocessor'): + logger.error('No preprocessor field found in cfg.') + return None + + sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val' + + if 'type' not in cfg.preprocessor: + if sub_key in cfg.preprocessor: + sub_cfg = getattr(cfg.preprocessor, sub_key) + else: + logger.error( + f'No {sub_key} key and type key found in ' + f'preprocessor domain of configuration.json file.') + return None + else: + sub_cfg = cfg.preprocessor + + if len(sub_cfg): + if isinstance(sub_cfg, Sequence): + # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, + # and add mode for Compose or other plans + raise NotImplementedError('Not supported yet!') + sub_cfg = deepcopy(sub_cfg) + sub_cfg.update({'model_dir': model_dir}) + sub_cfg.update(kwargs) + preprocessor = build_preprocessor(sub_cfg, field_name) + else: + logger.error( + f'Cannot find available config to build preprocessor at mode {preprocessor_mode}, ' + f'please check the preprocessor field in the configuration.json file.' + ) + return None + preprocessor.mode = preprocessor_mode + return preprocessor diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index b95048ba..ea7b6bf4 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -5,50 +5,68 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor - from .nlp_base import ( - DocumentSegmentationPreprocessor, - FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, - NLPPreprocessor, - NLPTokenizerPreprocessorBase, - TextRankingPreprocessor, - RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, - TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, - Tokenize, - WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, - TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, - ) - + from .nlp_base import (NLPTokenizerPreprocessorBase, NLPBasePreprocessor) + from .text_generation_jieba_preprocessor import TextGenerationJiebaPreprocessor + from .sentence_piece_preprocessor import SentencePiecePreprocessor + from .bert_seq_cls_tokenizer import Tokenize + from .document_segmentation_preprocessor import DocumentSegmentationPreprocessor + from .faq_question_answering_preprocessor import FaqQuestionAnsweringPreprocessor + from .fill_mask_preprocessor import FillMaskPoNetPreprocessor, NLPPreprocessor + from .text_ranking_preprocessor import TextRankingPreprocessor + from .relation_extraction_preprocessor import RelationExtractionPreprocessor + from .sentence_classification_preprocessor import SequenceClassificationPreprocessor + from .sentence_embedding_preprocessor import SentenceEmbeddingPreprocessor + from .text_generation_preprocessor import TextGenerationPreprocessor + from .text2text_generation_preprocessor import Text2TextGenerationPreprocessor + from .token_classification_preprocessor import TokenClassificationPreprocessor, \ + WordSegmentationBlankSetToLabelPreprocessor + from 
.zero_shot_classification_reprocessor import ZeroShotClassificationPreprocessor + from .space import (DialogIntentPredictionPreprocessor, + DialogModelingPreprocessor, + DialogStateTrackingPreprocessor, InputFeatures, + MultiWOZBPETextField, IntentBPETextField) + from .space_T_en import ConversationalTextToSqlPreprocessor + from .space_T_cn import TableQuestionAnsweringPreprocessor else: _import_structure = { 'nlp_base': [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', + 'NLPBasePreprocessor', + ], + 'text_generation_jieba_preprocessor': + ['TextGenerationJiebaPreprocessor'], + 'sentence_piece_preprocessor': ['SentencePiecePreprocessor'], + 'bert_seq_cls_tokenizer': ['Tokenize'], + 'document_segmentation_preprocessor': + ['DocumentSegmentationPreprocessor'], + 'faq_question_answering_preprocessor': + ['FaqQuestionAnsweringPreprocessor'], + 'fill_mask_preprocessor': + ['FillMaskPoNetPreprocessor', 'NLPPreprocessor'], + 'text_ranking_preprocessor': ['TextRankingPreprocessor'], + 'relation_extraction_preprocessor': ['RelationExtractionPreprocessor'], + 'sentence_classification_preprocessor': + ['SequenceClassificationPreprocessor'], + 'sentence_embedding_preprocessor': ['SentenceEmbeddingPreprocessor'], + 'text_generation_preprocessor': ['TextGenerationPreprocessor'], + 'text2text_generation_preprocessor': + ['Text2TextGenerationPreprocessor'], + 'token_classification_preprocessor': [ 'TokenClassificationPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'Text2TextGenerationPreprocessor', - 'WordSegmentationBlankSetToLabelPreprocessor', - 'ZeroShotClassificationPreprocessor', - 'TextGenerationJiebaPreprocessor', - 'SentencePiecePreprocessor', + 'WordSegmentationBlankSetToLabelPreprocessor' ], + 'zero_shot_classification_reprocessor': + ['ZeroShotClassificationPreprocessor'], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], + 'space': [ + 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', + 'DialogStateTrackingPreprocessor', 'InputFeatures', + 'MultiWOZBPETextField', 'IntentBPETextField' + ], + 'space_T_en': ['ConversationalTextToSqlPreprocessor'], + 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py b/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py new file mode 100644 index 00000000..576687ce --- /dev/null +++ b/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +from typing import Any, Dict, Union + +from transformers import AutoTokenizer + +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, InputFields + + +@PREPROCESSORS.register_module(Fields.nlp) +class Tokenize(Preprocessor): + + def __init__(self, tokenizer_name) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + + def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: + if isinstance(data, str): + data = {InputFields.text: data} + token_dict = self.tokenizer(data[InputFields.text]) + data.update(token_dict) + return data diff --git a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py new file mode 100644 index 00000000..5ab0a0c6 --- /dev/null +++ b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py @@ -0,0 +1,220 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger +from .nlp_base import NLPBasePreprocessor + +logger = get_logger() + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.document_segmentation) +class DocumentSegmentationPreprocessor(NLPBasePreprocessor): + + def __init__(self, model_dir: str, config, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(model_dir, *args, **kwargs) + from transformers import BertTokenizerFast + self.tokenizer = BertTokenizerFast.from_pretrained( + model_dir, + use_fast=True, + ) + self.question_column_name = 'labels' + self.context_column_name = 'sentences' + self.example_id_column_name = 'example_id' + self.label_to_id = {'B-EOP': 0, 'O': 1} + self.target_specical_ids = set() + self.target_specical_ids.add(self.tokenizer.eos_token_id) + self.max_seq_length = config.max_position_embeddings + self.label_list = ['B-EOP', 'O'] + + def __call__(self, examples) -> Dict[str, Any]: + questions = examples[self.question_column_name] + contexts = examples[self.context_column_name] + example_ids = examples[self.example_id_column_name] + num_examples = len(questions) + + sentences = [] + for sentence_list in contexts: + sentence_list = [_ + '[EOS]' for _ in sentence_list] + sentences.append(sentence_list) + + try: + tokenized_examples = self.tokenizer( + sentences, + is_split_into_words=True, + add_special_tokens=False, + return_token_type_ids=True, + return_attention_mask=True, + ) + except Exception as e: + logger.error(e) + return {} + + segment_ids = [] + token_seq_labels = [] + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_labels = questions[example_index] + example_labels = [ + self.label_to_id[_] if _ in self.label_to_id else -100 + for _ in example_labels + ] + example_token_labels = [] + segment_id = [] + cur_seg_id = 1 + for token_index in range(len(example_input_ids)): + if example_input_ids[token_index] in self.target_specical_ids: + example_token_labels.append(example_labels[cur_seg_id - 1]) + segment_id.append(cur_seg_id) + cur_seg_id += 1 + else: + example_token_labels.append(-100) + segment_id.append(cur_seg_id) + + segment_ids.append(segment_id) + token_seq_labels.append(example_token_labels) + + 
tokenized_examples['segment_ids'] = segment_ids + tokenized_examples['token_seq_labels'] = token_seq_labels + + new_segment_ids = [] + new_token_seq_labels = [] + new_input_ids = [] + new_token_type_ids = [] + new_attention_mask = [] + new_example_ids = [] + new_sentences = [] + + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_token_type_ids = tokenized_examples['token_type_ids'][ + example_index] + example_attention_mask = tokenized_examples['attention_mask'][ + example_index] + example_segment_ids = tokenized_examples['segment_ids'][ + example_index] + example_token_seq_labels = tokenized_examples['token_seq_labels'][ + example_index] + example_sentences = contexts[example_index] + example_id = example_ids[example_index] + example_total_num_sentences = len(questions[example_index]) + example_total_num_tokens = len( + tokenized_examples['input_ids'][example_index]) + accumulate_length = [ + i for i, x in enumerate(tokenized_examples['input_ids'] + [example_index]) + if x == self.tokenizer.eos_token_id + ] + samples_boundary = [] + left_index = 0 + sent_left_index = 0 + sent_i = 0 + + # for sent_i, length in enumerate(accumulate_length): + while sent_i < len(accumulate_length): + length = accumulate_length[sent_i] + right_index = length + 1 + sent_right_index = sent_i + 1 + if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: + samples_boundary.append([left_index, right_index]) + + sample_input_ids = [ + self.tokenizer.cls_token_id + ] + example_input_ids[left_index:right_index] + sample_input_ids = sample_input_ids[:self.max_seq_length] + + sample_token_type_ids = [ + 0 + ] + example_token_type_ids[left_index:right_index] + sample_token_type_ids = sample_token_type_ids[:self. + max_seq_length] + + sample_attention_mask = [ + 1 + ] + example_attention_mask[left_index:right_index] + sample_attention_mask = sample_attention_mask[:self. + max_seq_length] + + sample_segment_ids = [ + 0 + ] + example_segment_ids[left_index:right_index] + sample_segment_ids = sample_segment_ids[:self. + max_seq_length] + + sample_token_seq_labels = [ + -100 + ] + example_token_seq_labels[left_index:right_index] + sample_token_seq_labels = sample_token_seq_labels[:self. 
+ max_seq_length] + + if sent_right_index - 1 == sent_left_index: + left_index = right_index + sample_input_ids[-1] = self.tokenizer.eos_token_id + sample_token_seq_labels[-1] = -100 + else: + left_index = accumulate_length[sent_i - 1] + 1 + if sample_token_seq_labels[-1] != -100: + sample_token_seq_labels[-1] = -100 + + if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index] + sent_left_index = sent_right_index + sent_i += 1 + else: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index - 1] + sent_left_index = sent_right_index - 1 + + if (len([_ for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences) - 1 and (len([ + _ + for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences): + tmp = [] + for w_i, w, l in zip( + sample_input_ids, + self.tokenizer.decode(sample_input_ids).split( + ' '), sample_token_seq_labels): + tmp.append((w_i, w, l)) + while len(sample_input_ids) < self.max_seq_length: + sample_input_ids.append(self.tokenizer.pad_token_id) + sample_token_type_ids.append(0) + sample_attention_mask.append(0) + sample_segment_ids.append(example_total_num_sentences + + 1) + sample_token_seq_labels.append(-100) + + new_input_ids.append(sample_input_ids) + new_token_type_ids.append(sample_token_type_ids) + new_attention_mask.append(sample_attention_mask) + new_segment_ids.append(sample_segment_ids) + new_token_seq_labels.append(sample_token_seq_labels) + new_example_ids.append(example_id) + new_sentences.append(sample_sentences) + else: + sent_i += 1 + continue + + output_samples = {} + + output_samples['input_ids'] = new_input_ids + output_samples['token_type_ids'] = new_token_type_ids + output_samples['attention_mask'] = new_attention_mask + + output_samples['segment_ids'] = new_segment_ids + output_samples['example_id'] = new_example_ids + output_samples['labels'] = new_token_seq_labels + output_samples['sentences'] = new_sentences + + return output_samples diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py new file mode 100644 index 00000000..72c8ed99 --- /dev/null +++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py @@ -0,0 +1,90 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) +class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + super(FaqQuestionAnsweringPreprocessor, self).__init__( + model_dir, mode=ModeKeys.INFERENCE, **kwargs) + from transformers import BertTokenizer + self.tokenizer = BertTokenizer.from_pretrained(model_dir) + preprocessor_config = Config.from_file( + os.path.join(model_dir, ModelFile.CONFIGURATION)).get( + ConfigFields.preprocessor, {}) + self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) + self.label_dict = None + + def pad(self, samples, max_len): + result = [] + for sample in samples: + pad_len = max_len - len(sample[:max_len]) + result.append(sample[:max_len] + + [self.tokenizer.pad_token_id] * pad_len) + return result + + def set_label_dict(self, label_dict): + self.label_dict = label_dict + + def get_label(self, label_id): + assert self.label_dict is not None and label_id < len(self.label_dict) + return self.label_dict[label_id] + + def encode_plus(self, text): + return [ + self.tokenizer.cls_token_id + ] + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id] + + @type_assert(object, Dict) + def __call__(self, data: Dict[str, Any], + **preprocessor_param) -> Dict[str, Any]: + TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) + queryset = data['query_set'] + if not isinstance(queryset, list): + queryset = [queryset] + supportset = data['support_set'] + supportset = sorted(supportset, key=lambda d: d['label']) + + queryset_tokenized = [self.encode_plus(text) for text in queryset] + supportset_tokenized = [ + self.encode_plus(item['text']) for item in supportset + ] + + max_len = max( + [len(seq) for seq in queryset_tokenized + supportset_tokenized]) + max_len = min(TMP_MAX_LEN, max_len) + queryset_padded = self.pad(queryset_tokenized, max_len) + supportset_padded = self.pad(supportset_tokenized, max_len) + + supportset_labels_ori = [item['label'] for item in supportset] + label_dict = [] + for label in supportset_labels_ori: + if label not in label_dict: + label_dict.append(label) + self.set_label_dict(label_dict) + supportset_labels_ids = [ + label_dict.index(label) for label in supportset_labels_ori + ] + return { + 'query': queryset_padded, + 'support': supportset_padded, + 'support_labels': supportset_labels_ids + } + + def batch_encode(self, sentence_list: list, max_length=None): + if not max_length: + max_length = self.MAX_LEN + return self.tokenizer.batch_encode_plus( + sentence_list, padding=True, max_length=max_length) diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py new file mode 100644 index 00000000..b0638dbc --- /dev/null +++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py @@ -0,0 +1,142 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os.path as osp +import re +from typing import Any, Dict, Tuple, Union + +import numpy as np +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config +from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.nlp import import_external_nltk_data +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class NLPPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + @property + def mask_id(self): + return self.tokenizer.mask_token_id + + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + return self.tokenizer.decode(token_ids, skip_special_tokens, + clean_up_tokenization_spaces, **kwargs) + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.fill_mask_ponet) +class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in PoNet model's MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 512) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + self.language = self.cfg.model.get('language', 'en') + if self.language == 'en': + from nltk.tokenize import sent_tokenize + import_external_nltk_data( + osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') + elif self.language in ['zh', 'cn']: + + def sent_tokenize(para): + para = re.sub(r'([。!!?\?])([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', + para) # noqa * + para = para.rstrip() + return [_ for _ in para.split('\n') if _] + else: + raise NotImplementedError + + self.sent_tokenize = sent_tokenize + self.max_length = kwargs['max_length'] + + def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = self.parse_text_and_label(data) + output = self.tokenizer( + text_a, + text_b, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + max_seq_length = self.max_length + + if text_b is None: + segment_ids = [] + seg_lens = list( + map( + len, + self.tokenizer( + self.sent_tokenize(text_a), + add_special_tokens=False, + truncation=True)['input_ids'])) + segment_id = [0] + sum( + [[i] * sl for i, sl in enumerate(seg_lens, start=1)], []) + segment_id = segment_id[:max_seq_length - 1] + segment_ids.append(segment_id + [segment_id[-1] + 1] + * (max_seq_length - len(segment_id))) + if self.mode == ModeKeys.INFERENCE: + segment_ids = torch.tensor(segment_ids) + output['segment_ids'] = segment_ids + + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + + self.labels_to_id(labels, output) + return output + + @property + def mask_id(self): + return self.tokenizer.mask_token_id + + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + return self.tokenizer.decode(token_ids, skip_special_tokens, + clean_up_tokenization_spaces, **kwargs) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 6075a4b3..48a04d7a 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -1,67 +1,41 @@ # Copyright (c) Alibaba, Inc. and its affiliates. + import os -import os.path as osp -import re -from typing import Any, Dict, Optional, Tuple, Union +from abc import ABC +from collections.abc import Mapping +from typing import Any, Dict, List, Tuple, Union import json import numpy as np -import sentencepiece as spm import torch from transformers import AutoTokenizer -from modelscope.metainfo import Models, Preprocessors +from modelscope.metainfo import Models from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields -from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.constant import ModeKeys from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.logger import get_logger -from modelscope.utils.nlp import import_external_nltk_data -from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'NLPPreprocessor', - 'FillMaskPoNetPreprocessor', + 'NLPBasePreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', - 'TokenClassificationPreprocessor', - 'Text2TextGenerationPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'WordSegmentationBlankSetToLabelPreprocessor', - 'ZeroShotClassificationPreprocessor', ] -@PREPROCESSORS.register_module(Fields.nlp) -class Tokenize(Preprocessor): - - def __init__(self, tokenizer_name) -> None: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - - def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: - if isinstance(data, str): - data = {InputFields.text: data} - token_dict = self.tokenizer(data[InputFields.text]) - data.update(token_dict) - return data - - 
-class NLPTokenizerPreprocessorBase(Preprocessor): - - def __init__(self, model_dir: str, mode: str, **kwargs): - """The NLP tokenizer preprocessor base class. +class NLPBasePreprocessor(Preprocessor, ABC): - Any nlp preprocessor which uses the hf tokenizer can inherit from this class. + def __init__(self, + model_dir: str, + first_sequence=None, + second_sequence=None, + label=None, + label2id=None, + mode=ModeKeys.INFERENCE, + **kwargs): + """The NLP preprocessor base class. Args: model_dir (str): The local model path @@ -71,18 +45,12 @@ class NLPTokenizerPreprocessorBase(Preprocessor): label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode - kwargs: These kwargs will be directly fed into the tokenizer. """ + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.label = label - super().__init__(**kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self._mode = mode - self.label = kwargs.pop('label', OutputKeys.LABEL) self.use_fast = kwargs.pop('use_fast', None) if self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): @@ -92,15 +60,82 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.use_fast = json_config.get('use_fast') self.use_fast = False if self.use_fast is None else self.use_fast - self.label2id = None - if 'label2id' in kwargs: - self.label2id = kwargs.pop('label2id') + self.label2id = label2id if self.label2id is None: self.label2id = parse_label_mapping(self.model_dir) + super().__init__(mode, **kwargs) - self.tokenize_kwargs = kwargs + @property + def mask_id(self): + """Child preprocessor can override this property to return the id of mask token. + Returns: + The id of mask token, default None. + """ + return None + + def decode(self, + token_ids: Union[int, List[int], 'np.ndarray', 'torch.Tensor', + 'tf.Tensor'], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. + """ + raise NotImplementedError() + + +class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): + + def __init__(self, + model_dir: str, + first_sequence: str = None, + second_sequence: str = None, + label: str = 'label', + label2id: dict = None, + mode: str = ModeKeys.INFERENCE, + **kwargs): + """The NLP tokenizer preprocessor base class. + + Any nlp preprocessor which uses the hf tokenizer can inherit from this class. 
+ + Args: + model_dir (str): The local model path + first_sequence: The key for the first sequence + second_sequence: The key for the second sequence + label: The key for the label + label2id: An optional label2id dict. + If label2id is None, the preprocessor will try to parse label-id mapping from: + - configuration.json model.label2id/model.id2label + - config.json label2id/id2label + - label_mapping.json + mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. + kwargs: These kwargs will be directly fed into the tokenizer. + """ + + super().__init__(model_dir, first_sequence, second_sequence, label, + label2id, mode) + self.model_dir = model_dir + self.tokenize_kwargs = kwargs self.tokenizer = self.build_tokenizer(model_dir) + logger.info(f'The key of sentence1: {self.first_sequence}, ' + f'The key of sentence2: {self.second_sequence}, ' + f'The key of label: {self.label}') + if self.first_sequence is None: + logger.warning('[Important] first_sequence attribute is not set, ' + 'this will cause an error if your input is a dict.') @property def id2label(self): @@ -118,8 +153,11 @@ class NLPTokenizerPreprocessorBase(Preprocessor): NOTE: This default implementation only returns slow tokenizer, because the fast tokenizers have a multi-thread problem. - @param model_dir: The local model dir. - @return: The initialized tokenizer. + Args: + model_dir: The local model dir. + + Returns: + The initialized tokenizer. """ self.is_transformer_based_model = 'lstm' not in model_dir # fast version lead to parallel inference failed @@ -180,8 +218,11 @@ class NLPTokenizerPreprocessorBase(Preprocessor): If the pair param is False, data will be parsed as the first_sentence and the label, else it will be parsed as the first_sentence and the second_sentence. - @param data: The input data. - @return: The sentences and labels tuple. + Args: + data: The input data. + + Returns: + The sentences and labels tuple. """ text_a, text_b, labels = None, None, None if isinstance(data, str): @@ -194,7 +235,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): text_a, text_b = data else: text_a, labels = data - elif isinstance(data, dict): + elif isinstance(data, Mapping): text_a = data.get(self.first_sequence) text_b = data.get(self.second_sequence) labels = data.get(self.label) @@ -208,1007 +249,34 @@ class NLPTokenizerPreprocessorBase(Preprocessor): If the original label's type is float, or the label2id mapping does not exist, the original label will be returned. - @param labels: The input labels. - @param output: The label id. - @return: The final labels. + Args: + labels: The input labels. + output: The label id. + + Returns: + The final labels. 
""" def label_can_be_mapped(label): return isinstance(label, str) or isinstance(label, int) - if labels is not None: + try: if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ and self.label2id is not None: output[OutputKeys.LABELS] = [ - self.label2id[str(label)] for label in labels + self.label2id[label] + if label in self.label2id else self.label2id[str(label)] + for label in labels ] elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABELS] = self.label2id[str(labels)] - else: + output[OutputKeys.LABELS] = self.label2id[ + labels] if labels in self.label2id else self.label2id[str( + labels)] + elif labels is not None: output[OutputKeys.LABELS] = labels - - -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.feature_extraction) -class NLPPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, mode=mode, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_ranking) -class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text-ranking model. - """ - - def __init__(self, - model_dir: str, - mode=ModeKeys.INFERENCE, - *args, - **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - super().__init__(model_dir, pair=True, mode=mode, *args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'source_sentence') - self.second_sequence = kwargs.pop('second_sequence', - 'sentences_to_compare') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: - if isinstance(data, tuple): - sentence1, sentence2 = data - elif isinstance(data, dict): - sentence1 = data.get(self.first_sequence) - sentence2 = data.get(self.second_sequence) - if isinstance(sentence2, str): - sentence2 = [sentence2] - if isinstance(sentence1, str): - sentence1 = [sentence1] - sentence1 = sentence1 * len(sentence2) - - max_seq_length = self.sequence_length - feature = self.tokenizer( - sentence1, - sentence2, - padding='max_length', - truncation=True, - max_length=max_seq_length, - return_tensors='pt') - if 'labels' in data: - labels = data['labels'] - feature['labels'] = labels - if 'qid' in data: - qid = data['qid'] - feature['qid'] = qid - return feature - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sequence classification. 
- """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sentence_embedding) -class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sentence embedding. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data Dict: - keys: "source_sentence" && "sentences_to_compare" - values: list of sentences - Example: - {"source_sentence": ["how long it take to get a master's degree"], - "sentences_to_compare": ["On average, students take about 18 to 24 months - to complete a master's degree.", - "On the other hand, some students prefer to go at a slower pace - and choose to take several years to complete their studies.", - "It can take anywhere from two semesters"]} - Returns: - Dict[str, Any]: the preprocessed data - """ - source_sentence = data['source_sentence'] - compare_sentences = data['sentences_to_compare'] - sentences = [] - sentences.append(source_sentence[0]) - for sent in compare_sentences: - sentences.append(sent) - - tokenized_inputs = self.tokenizer( - sentences, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - padding=True, - truncation=True) - return tokenized_inputs - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in zero shot classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict], hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str or dict): a sentence - Example: - 'you are so handsome.' - - Returns: - Dict[str, Any]: the preprocessed data - """ - if isinstance(data, dict): - data = data.get(self.first_sequence) - - pairs = [[data, hypothesis_template.format(label)] - for label in candidate_labels] - - features = self.tokenizer( - pairs, - padding=True, - truncation=True, - max_length=self.sequence_length, - truncation_strategy='only_first', - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) - return features - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) -class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. 
- """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') - kwargs['padding'] = kwargs.get('padding', False) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - text_a, _, _ = self.parse_text_and_label(data) - - inputs = self.tokenizer( - text_a, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - - # This is produced by tokenizers but is an invalid generate kwargs - if 'token_type_ids' in inputs: - del inputs['token_type_ids'] - return inputs - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. - """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - @staticmethod - def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: - import os - for name in os.listdir(model_dir): - full_name = os.path.join(model_dir, name) - if 'roberta' in name and os.path.isdir(full_name): - return full_name - - def build_tokenizer(self, model_dir: str): - roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) - if roberta_tokenizer_dir: - from transformers import RobertaTokenizer - return RobertaTokenizer.from_pretrained( - roberta_tokenizer_dir, do_lower_case=False) - return super().build_tokenizer(model_dir) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - if self._mode == ModeKeys.INFERENCE: - return super().__call__(data) - src_rst = super().__call__(data['src_txt']) - src_input_ids = src_rst['input_ids'] - src_attention_mask = src_rst['attention_mask'] - if 'tgt_txt' in data: - labels = super().__call__(data['tgt_txt'])['input_ids'] - else: - labels = src_input_ids[1:] - src_input_ids = src_input_ids[:-1] - src_attention_mask = src_attention_mask[:-1] - - return { - 'input_ids': src_input_ids, - 'attention_mask': src_attention_mask, - 'labels': labels, - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) -class TextGenerationJiebaPreprocessor(Preprocessor): - """The jieba tokenizer preprocessor used in text generation. 
- """ - - def __init__(self, model_dir: str, *args, **kwargs): - from modelscope.models.nlp.gpt3 import JiebaBPETokenizer - super().__init__(*args, **kwargs) - self.tokenizer = JiebaBPETokenizer( - osp.join(model_dir, 'tokenizer.json')) - - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' - Returns: - Dict[str, Any]: the preprocessed data - Example: - {'net_input': - {'src_tokens':tensor([1,2,3,4]), - 'src_lengths': tensor([4])} - } - """ - import torch - - return { - 'input_ids': - torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) - } - - -@PREPROCESSORS.register_module( - Fields.nlp, - module_name=Preprocessors.word_segment_text_to_label_preprocessor) -class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): - """The preprocessor used to turn a single sentence to a labeled token-classification dict. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.label = kwargs.pop('label', OutputKeys.LABELS) - - def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: - data = data.split(' ') - data = list(filter(lambda x: len(x) > 0, data)) - - def produce_train_sample(words): - chars = [] - labels = [] - for word in words: - chars.extend(list(word)) - if len(word) == 1: - labels.append('S-CWS') - else: - labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) - + ['E-CWS']) - assert len(chars) == len(labels) - return chars, labels - - chars, labels = produce_train_sample(data) - return { - self.first_sequence: chars, - self.label: labels, - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.ner_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in normal NER task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, mode=mode, **kwargs) - - if 'is_split_into_words' in kwargs: - self.is_split_into_words = kwargs.pop('is_split_into_words') - else: - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = None - labels_list = None - if isinstance(data, str): - text = data - elif isinstance(data, dict): - text = data.get(self.first_sequence) - labels_list = data.get(self.label) - - input_ids = [] - label_mask = [] - offset_mapping = [] - if self.is_split_into_words: - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)]) - else: - if self.tokenizer.is_fast: - encodings = self.tokenizer( - text, - add_special_tokens=False, - return_offsets_mapping=True, - **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - word_ids = encodings.word_ids() - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = ( - offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) - else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) - else: - encodings = self.tokenizer( - text, add_special_tokens=False, **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( - text) - - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - - if self._mode == ModeKeys.INFERENCE: - input_ids = torch.tensor(input_ids).unsqueeze(0) - attention_mask = torch.tensor(attention_mask).unsqueeze(0) - label_mask = torch.tensor( - label_mask, dtype=torch.bool).unsqueeze(0) - - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) - - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) - else: - label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels - return output - - def get_tokenizer_class(self): - tokenizer_class = self.tokenizer.__class__.__name__ - if 
tokenizer_class.endswith( - 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': - tokenizer_class = tokenizer_class[:-4] - return tokenizer_class - - def get_label_mask_and_offset_mapping(self, text): - label_mask = [] - offset_mapping = [] - tokens = self.tokenizer.tokenize(text) - offset = 0 - if self.get_tokenizer_class() == 'BertTokenizer': - for token in tokens: - is_start = (token[:2] != '##') - if is_start: - label_mask.append(True) - else: - token = token[2:] - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': - last_is_blank = False - for token in tokens: - is_start = (token[0] == '▁') - if is_start: - token = token[1:] - label_mask.append(True) - if len(token) == 0: - last_is_blank = True - continue - else: - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if last_is_blank or is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - last_is_blank = False - else: - raise NotImplementedError - - return label_mask, offset_mapping - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.re_tokenizer) -class RelationExtractionPreprocessor(Preprocessor): - """The relation extraction preprocessor used in normal RE task. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=True) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - output = self.tokenizer([text], return_tensors='pt') - return { - 'text': text, - 'input_ids': output['input_ids'], - 'attention_mask': output['attention_mask'], - 'offsets': output[0].offsets - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) -class FaqQuestionAnsweringPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, mode=ModeKeys.INFERENCE, **kwargs) - import os - from transformers import BertTokenizer - - from modelscope.utils.config import Config - from modelscope.utils.constant import ModelFile - self.tokenizer = BertTokenizer.from_pretrained(model_dir) - preprocessor_config = Config.from_file( - os.path.join(model_dir, ModelFile.CONFIGURATION)).get( - ConfigFields.preprocessor, {}) - self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) - self.label_dict = None - - def pad(self, samples, max_len): - result = [] - for sample in samples: - pad_len = max_len - len(sample[:max_len]) - result.append(sample[:max_len] - + [self.tokenizer.pad_token_id] * pad_len) - return result - - def set_label_dict(self, label_dict): - self.label_dict = label_dict - - def get_label(self, label_id): - assert self.label_dict is not None and label_id < len(self.label_dict) - return self.label_dict[label_id] - - def encode_plus(self, text): - return [ - self.tokenizer.cls_token_id - ] + self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id] - - @type_assert(object, Dict) - def __call__(self, data: Dict[str, Any], - **preprocessor_param) -> Dict[str, Any]: - TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) - queryset = data['query_set'] - if not isinstance(queryset, list): - queryset = [queryset] - supportset = data['support_set'] - supportset = sorted(supportset, key=lambda d: d['label']) - - queryset_tokenized = [self.encode_plus(text) for text in queryset] - supportset_tokenized = [ - self.encode_plus(item['text']) for item in supportset - ] - - max_len = max( - [len(seq) for seq in queryset_tokenized + supportset_tokenized]) - max_len = min(TMP_MAX_LEN, max_len) - queryset_padded = self.pad(queryset_tokenized, max_len) - supportset_padded = self.pad(supportset_tokenized, max_len) - - supportset_labels_ori = [item['label'] for item in supportset] - label_dict = [] - for label in supportset_labels_ori: - if label not in label_dict: - label_dict.append(label) - self.set_label_dict(label_dict) - supportset_labels_ids = [ - label_dict.index(label) for label in supportset_labels_ori - ] - return { - 'query': queryset_padded, - 'support': supportset_padded, - 'support_labels': supportset_labels_ids - } - - def batch_encode(self, sentence_list: list, max_length=None): - if not max_length: - max_length = self.MAX_LEN - return self.tokenizer.batch_encode_plus( - sentence_list, padding=True, max_length=max_length) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.document_segmentation) -class DocumentSegmentationPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, config, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - from transformers import BertTokenizerFast - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, - 
use_fast=True, - ) - self.question_column_name = 'labels' - self.context_column_name = 'sentences' - self.example_id_column_name = 'example_id' - self.label_to_id = {'B-EOP': 0, 'O': 1} - self.target_specical_ids = set() - self.target_specical_ids.add(self.tokenizer.eos_token_id) - self.max_seq_length = config.max_position_embeddings - self.label_list = ['B-EOP', 'O'] - - def __call__(self, examples) -> Dict[str, Any]: - questions = examples[self.question_column_name] - contexts = examples[self.context_column_name] - example_ids = examples[self.example_id_column_name] - num_examples = len(questions) - - sentences = [] - for sentence_list in contexts: - sentence_list = [_ + '[EOS]' for _ in sentence_list] - sentences.append(sentence_list) - - try: - tokenized_examples = self.tokenizer( - sentences, - is_split_into_words=True, - add_special_tokens=False, - return_token_type_ids=True, - return_attention_mask=True, - ) - except Exception as e: - logger.error(e) - return {} - - segment_ids = [] - token_seq_labels = [] - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_labels = questions[example_index] - example_labels = [ - self.label_to_id[_] if _ in self.label_to_id else -100 - for _ in example_labels - ] - example_token_labels = [] - segment_id = [] - cur_seg_id = 1 - for token_index in range(len(example_input_ids)): - if example_input_ids[token_index] in self.target_specical_ids: - example_token_labels.append(example_labels[cur_seg_id - 1]) - segment_id.append(cur_seg_id) - cur_seg_id += 1 - else: - example_token_labels.append(-100) - segment_id.append(cur_seg_id) - - segment_ids.append(segment_id) - token_seq_labels.append(example_token_labels) - - tokenized_examples['segment_ids'] = segment_ids - tokenized_examples['token_seq_labels'] = token_seq_labels - - new_segment_ids = [] - new_token_seq_labels = [] - new_input_ids = [] - new_token_type_ids = [] - new_attention_mask = [] - new_example_ids = [] - new_sentences = [] - - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_token_type_ids = tokenized_examples['token_type_ids'][ - example_index] - example_attention_mask = tokenized_examples['attention_mask'][ - example_index] - example_segment_ids = tokenized_examples['segment_ids'][ - example_index] - example_token_seq_labels = tokenized_examples['token_seq_labels'][ - example_index] - example_sentences = contexts[example_index] - example_id = example_ids[example_index] - example_total_num_sentences = len(questions[example_index]) - example_total_num_tokens = len( - tokenized_examples['input_ids'][example_index]) - accumulate_length = [ - i for i, x in enumerate(tokenized_examples['input_ids'] - [example_index]) - if x == self.tokenizer.eos_token_id - ] - samples_boundary = [] - left_index = 0 - sent_left_index = 0 - sent_i = 0 - - # for sent_i, length in enumerate(accumulate_length): - while sent_i < len(accumulate_length): - length = accumulate_length[sent_i] - right_index = length + 1 - sent_right_index = sent_i + 1 - if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: - samples_boundary.append([left_index, right_index]) - - sample_input_ids = [ - self.tokenizer.cls_token_id - ] + example_input_ids[left_index:right_index] - sample_input_ids = sample_input_ids[:self.max_seq_length] - - sample_token_type_ids = [ - 0 - ] + example_token_type_ids[left_index:right_index] - sample_token_type_ids = 
sample_token_type_ids[:self. - max_seq_length] - - sample_attention_mask = [ - 1 - ] + example_attention_mask[left_index:right_index] - sample_attention_mask = sample_attention_mask[:self. - max_seq_length] - - sample_segment_ids = [ - 0 - ] + example_segment_ids[left_index:right_index] - sample_segment_ids = sample_segment_ids[:self. - max_seq_length] - - sample_token_seq_labels = [ - -100 - ] + example_token_seq_labels[left_index:right_index] - sample_token_seq_labels = sample_token_seq_labels[:self. - max_seq_length] - - if sent_right_index - 1 == sent_left_index: - left_index = right_index - sample_input_ids[-1] = self.tokenizer.eos_token_id - sample_token_seq_labels[-1] = -100 - else: - left_index = accumulate_length[sent_i - 1] + 1 - if sample_token_seq_labels[-1] != -100: - sample_token_seq_labels[-1] = -100 - - if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index] - sent_left_index = sent_right_index - sent_i += 1 - else: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index - 1] - sent_left_index = sent_right_index - 1 - - if (len([_ for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences) - 1 and (len([ - _ - for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences): - tmp = [] - for w_i, w, l in zip( - sample_input_ids, - self.tokenizer.decode(sample_input_ids).split( - ' '), sample_token_seq_labels): - tmp.append((w_i, w, l)) - while len(sample_input_ids) < self.max_seq_length: - sample_input_ids.append(self.tokenizer.pad_token_id) - sample_token_type_ids.append(0) - sample_attention_mask.append(0) - sample_segment_ids.append(example_total_num_sentences - + 1) - sample_token_seq_labels.append(-100) - - new_input_ids.append(sample_input_ids) - new_token_type_ids.append(sample_token_type_ids) - new_attention_mask.append(sample_attention_mask) - new_segment_ids.append(sample_segment_ids) - new_token_seq_labels.append(sample_token_seq_labels) - new_example_ids.append(example_id) - new_sentences.append(sample_sentences) - else: - sent_i += 1 - continue - - output_samples = {} - - output_samples['input_ids'] = new_input_ids - output_samples['token_type_ids'] = new_token_type_ids - output_samples['attention_mask'] = new_attention_mask - - output_samples['segment_ids'] = new_segment_ids - output_samples['example_id'] = new_example_ids - output_samples['labels'] = new_token_seq_labels - output_samples['sentences'] = new_sentences - - return output_samples - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.fill_mask_ponet) -class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. 
- """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 512) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - self.cfg = Config.from_file( - osp.join(model_dir, ModelFile.CONFIGURATION)) - self.language = self.cfg.model.get('language', 'en') - if self.language == 'en': - from nltk.tokenize import sent_tokenize - import_external_nltk_data( - osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') - elif self.language in ['zh', 'cn']: - - def sent_tokenize(para): - para = re.sub(r'([。!!?\?])([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', - para) # noqa * - para = para.rstrip() - return [_ for _ in para.split('\n') if _] - else: - raise NotImplementedError - - self.sent_tokenize = sent_tokenize - self.max_length = kwargs['max_length'] - - def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b, labels = self.parse_text_and_label(data) - output = self.tokenizer( - text_a, - text_b, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - max_seq_length = self.max_length - - if text_b is None: - segment_ids = [] - seg_lens = list( - map( - len, - self.tokenizer( - self.sent_tokenize(text_a), - add_special_tokens=False, - truncation=True)['input_ids'])) - segment_id = [0] + sum( - [[i] * sl for i, sl in enumerate(seg_lens, start=1)], []) - segment_id = segment_id[:max_seq_length - 1] - segment_ids.append(segment_id + [segment_id[-1] + 1] - * (max_seq_length - len(segment_id))) - output['segment_ids'] = segment_ids - - output = { - k: np.array(v) if isinstance(v, list) else v - for k, v in output.items() - } - - self.labels_to_id(labels, output) - return output - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sentence_piece) -class SentencePiecePreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - import os - - super().__init__(*args, **kwargs) - self.tokenizer = None - for file_name in os.listdir(model_dir): - if file_name.endswith('.model'): - m_file = osp.join(model_dir, file_name) - self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) - break - assert self.tokenizer is not None, 'Can not find .model file' - - def __call__(self, data: str) -> Dict[str, Any]: - return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) + except KeyError as e: + logger.error( + f'Label {labels} cannot be found in the label mapping {self.label2id},' + f'which comes from the user input or the configuration files. 
' + f'Please consider matching your labels with this mapping.') + raise e diff --git a/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py new file mode 100644 index 00000000..9a426ab7 --- /dev/null +++ b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py @@ -0,0 +1,55 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from transformers import AutoTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.re_tokenizer) +class RelationExtractionPreprocessor(NLPBasePreprocessor): + """The relation extraction preprocessor used in normal RE task. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(model_dir, *args, **kwargs) + + self.model_dir: str = model_dir + self.sequence_length = kwargs.pop('sequence_length', 512) + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=True) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # preprocess the data for the model input + text = data + output = self.tokenizer([text], return_tensors='pt') + return { + 'text': text, + 'input_ids': output['input_ids'], + 'attention_mask': output['attention_mask'], + 'offsets': output[0].offsets + } diff --git a/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py new file mode 100644 index 00000000..f1295c50 --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sequence classification. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py new file mode 100644 index 00000000..519de60c --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_embedding) +class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sentence embedding. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data Dict: + keys: "source_sentence" && "sentences_to_compare" + values: list of sentences + Example: + {"source_sentence": ["how long it take to get a master's degree"], + "sentences_to_compare": ["On average, students take about 18 to 24 months + to complete a master's degree.", + "On the other hand, some students prefer to go at a slower pace + and choose to take several years to complete their studies.", + "It can take anywhere from two semesters"]} + Returns: + Dict[str, Any]: the preprocessed data + """ + source_sentence = data['source_sentence'] + compare_sentences = data['sentences_to_compare'] + sentences = [] + sentences.append(source_sentence[0]) + for sent in compare_sentences: + sentences.append(sent) + + tokenized_inputs = self.tokenizer( + sentences, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + padding=True, + truncation=True) + return tokenized_inputs diff --git a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py new file mode 100644 index 00000000..1d1ef19d --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py @@ -0,0 +1,32 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
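A short sketch of how the dict contract documented above is consumed; the model directory is hypothetical and only stands in for any directory the base class can build a tokenizer from:

from modelscope.preprocessors.nlp.sentence_embedding_preprocessor import \
    SentenceEmbeddingPreprocessor

preprocessor = SentenceEmbeddingPreprocessor('/path/to/embedding_model_dir')
inputs = preprocessor({
    'source_sentence': ["how long it take to get a master's degree"],
    'sentences_to_compare': [
        "On average, students take about 18 to 24 months to complete a master's degree.",
        'It can take anywhere from two semesters'
    ]
})
# One row per sentence: the source sentence followed by each candidate,
# padded to the longest sentence in the batch.
print(inputs['input_ids'].shape)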
+ +import os.path as osp +from typing import Any, Dict + +import sentencepiece as spm +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_piece) +class SentencePiecePreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + import os + + super().__init__(*args, **kwargs) + self.tokenizer = None + for file_name in os.listdir(model_dir): + if file_name.endswith('.model'): + m_file = osp.join(model_dir, file_name) + self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) + break + assert self.tokenizer is not None, 'Can not find .model file' + + def __call__(self, data: str) -> Dict[str, Any]: + return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/modelscope/preprocessors/space/__init__.py b/modelscope/preprocessors/nlp/space/__init__.py similarity index 100% rename from modelscope/preprocessors/space/__init__.py rename to modelscope/preprocessors/nlp/space/__init__.py diff --git a/modelscope/preprocessors/space/args.py b/modelscope/preprocessors/nlp/space/args.py similarity index 97% rename from modelscope/preprocessors/space/args.py rename to modelscope/preprocessors/nlp/space/args.py index d9e91e74..17c6828b 100644 --- a/modelscope/preprocessors/space/args.py +++ b/modelscope/preprocessors/nlp/space/args.py @@ -1,7 +1,4 @@ -""" -Parse argument. -""" - +# Copyright (c) Alibaba, Inc. and its affiliates. import argparse import json diff --git a/modelscope/preprocessors/space/batch.py b/modelscope/preprocessors/nlp/space/batch.py similarity index 96% rename from modelscope/preprocessors/space/batch.py rename to modelscope/preprocessors/nlp/space/batch.py index fe0ad0ec..d27776f5 100644 --- a/modelscope/preprocessors/space/batch.py +++ b/modelscope/preprocessors/nlp/space/batch.py @@ -1,3 +1,6 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + def batch(reader, batch_size, drop_last=False): """ This operator creates a batched reader which combines the data from the diff --git a/modelscope/preprocessors/space/data_loader.py b/modelscope/preprocessors/nlp/space/data_loader.py similarity index 87% rename from modelscope/preprocessors/space/data_loader.py rename to modelscope/preprocessors/nlp/space/data_loader.py index bd04a79c..290b64f3 100644 --- a/modelscope/preprocessors/space/data_loader.py +++ b/modelscope/preprocessors/nlp/space/data_loader.py @@ -1,18 +1,16 @@ -""" -DataLoader class -""" +# Copyright (c) Alibaba, Inc. and its affiliates. 
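The relocated SentencePiecePreprocessor keeps its behaviour: it picks up the first *.model file in the model directory and encodes raw text into a LongTensor. A hedged usage sketch, with the directory path as a placeholder:

import torch

from modelscope.preprocessors.nlp.sentence_piece_preprocessor import \
    SentencePiecePreprocessor

preprocessor = SentencePiecePreprocessor('/path/to/dir_with_spm_model')
token_ids = preprocessor('Hello world')
# encode([data]) wraps the single sentence in a list, so the result is (1, seq_len).
assert token_ids.dtype == torch.long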
import math import os import numpy as np -from modelscope.preprocessors.space.args import str2bool -from modelscope.preprocessors.space.batch import batch -from modelscope.preprocessors.space.lazy_dataset import LazyDataset -from modelscope.preprocessors.space.sampler import (RandomSampler, - SequentialSampler, - SortedSampler) +from modelscope.preprocessors.nlp.space.args import str2bool +from modelscope.preprocessors.nlp.space.batch import batch +from modelscope.preprocessors.nlp.space.lazy_dataset import LazyDataset +from modelscope.preprocessors.nlp.space.sampler import (RandomSampler, + SequentialSampler, + SortedSampler) def get_data_loader(batch_size, reader, hparams, file, collate_fn, is_test): diff --git a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py similarity index 64% rename from modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py index e2602eaa..2923157e 100644 --- a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py @@ -8,8 +8,7 @@ import json from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space.fields.intent_field import \ - IntentBPETextField +from modelscope.preprocessors.nlp import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert @@ -47,10 +46,25 @@ class DialogIntentPredictionPreprocessor(Preprocessor): Args: data (str): a sentence Example: - 'you are so handsome.' + 'What do I need to do for the card activation?' 
Returns: Dict[str, Any]: the preprocessed data + Example: + { + 'src_token': array([[13, 2054, 2079, 1045...]]), + 'src_pos': array([[ 0, 1, 2, 3...]]), + 'src_type': array([[1, 1, 1, 1...]]), + 'src_turn': array([[1, 1, 1, 1...]]), + 'src_mask': array([[1, 1, 1, 1...]]), + 'mlm_token': array([[13, 2054, 2079, 1045...]]), + 'mlm_label': array([[0, 0, 0, 0...]]), + 'mlm_mask': array([[0, 0, 0, 0...]]), + 'tgt_token': array([[29, 30, 31, 32...]]), + 'tgt_mask': array([[1, 1, 1, 1...]]), + 'ids': array([0]), + 'intent_label': array([-1]) + } """ samples = self.text_field.preprocessor([data]) samples, _ = self.text_field.collate_fn_multi_turn(samples) diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py similarity index 75% rename from modelscope/preprocessors/space/dialog_modeling_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py index c461ade1..ae3c214a 100644 --- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py @@ -6,8 +6,7 @@ from typing import Any, Dict from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert @@ -42,9 +41,19 @@ class DialogModelingPreprocessor(Preprocessor): """process the raw input data Args: - data (str): a sentence + data (Dict[str, Any]): A sentence and dialogue history info. Example: - 'you are so handsome.' + { + 'user_input': 'i want to leave after 17:15 .', + 'history': { + 'labels': [[13, 1045, 2052, 2066...]], + 'resp': [14, 1045, 2064, 2393...], + 'bspn': [15, 43, 7688, 10733...], + 'db': [19, 24, 20], + 'aspn': [16, 43, 48, 2681, 7180, 10], + 'output': ['i', 'can', 'help', 'with'...] + } + } Returns: Dict[str, Any]: the preprocessed data diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py similarity index 92% rename from modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py index 6eb17288..cff39577 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py @@ -31,13 +31,17 @@ class DialogStateTrackingPreprocessor(Preprocessor): self.processor = multiwoz22Processor() @type_assert(object, dict) - def __call__(self, data: Dict) -> Dict[str, Any]: + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data Args: - data (str): a sentence + data (Dict[str, Any]): a sentence Example: - 'you are so handsome.' 
+ { + 'utter': {'User-1': "Hi, I'm looking for a train that is going" + "to cambridge and arriving there by 20:45, is there anything like that?"}, + 'history_states': [{}] + } Returns: Dict[str, Any]: the preprocessed data diff --git a/modelscope/preprocessors/space/dst_processors.py b/modelscope/preprocessors/nlp/space/dst_processors.py similarity index 100% rename from modelscope/preprocessors/space/dst_processors.py rename to modelscope/preprocessors/nlp/space/dst_processors.py diff --git a/modelscope/preprocessors/nlp/space/fields/__init__.py b/modelscope/preprocessors/nlp/space/fields/__init__.py new file mode 100644 index 00000000..475a99dc --- /dev/null +++ b/modelscope/preprocessors/nlp/space/fields/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .gen_field import MultiWOZBPETextField + from .intent_field import IntentBPETextField +else: + _import_structure = { + 'gen_field': ['MultiWOZBPETextField'], + 'intent_field': ['IntentBPETextField'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/nlp/space/fields/gen_field.py similarity index 99% rename from modelscope/preprocessors/space/fields/gen_field.py rename to modelscope/preprocessors/nlp/space/fields/gen_field.py index 32346bd5..1d1879fe 100644 --- a/modelscope/preprocessors/space/fields/gen_field.py +++ b/modelscope/preprocessors/nlp/space/fields/gen_field.py @@ -9,7 +9,7 @@ from itertools import chain import json import numpy as np -from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.preprocessors.nlp.space.tokenizer import Tokenizer from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.nlp.space import ontology, utils diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/nlp/space/fields/intent_field.py similarity index 99% rename from modelscope/preprocessors/space/fields/intent_field.py rename to modelscope/preprocessors/nlp/space/fields/intent_field.py index 6d3b5fff..29ea915e 100644 --- a/modelscope/preprocessors/space/fields/intent_field.py +++ b/modelscope/preprocessors/nlp/space/fields/intent_field.py @@ -13,7 +13,7 @@ import json import numpy as np from tqdm import tqdm -from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.preprocessors.nlp.space.tokenizer import Tokenizer from modelscope.utils.constant import ModelFile from modelscope.utils.nlp.space import ontology from modelscope.utils.nlp.space.scores import hierarchical_set_score diff --git a/modelscope/preprocessors/space/lazy_dataset.py b/modelscope/preprocessors/nlp/space/lazy_dataset.py similarity index 93% rename from modelscope/preprocessors/space/lazy_dataset.py rename to modelscope/preprocessors/nlp/space/lazy_dataset.py index 8da21db7..536d9341 100644 --- a/modelscope/preprocessors/space/lazy_dataset.py +++ b/modelscope/preprocessors/nlp/space/lazy_dataset.py @@ -1,11 +1,6 @@ -""" -Dataset class -""" - +# Copyright (c) Alibaba, Inc. and its affiliates. 
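With the fields package moved under preprocessors/nlp/space, the text fields are resolved through LazyImportModule as declared in the new fields/__init__.py above; a sketch of how a caller would pick them up under the new layout:

# The heavy gen_field / intent_field modules are only imported when these
# names are first accessed, mirroring the _import_structure declared above.
from modelscope.preprocessors.nlp.space.fields import (IntentBPETextField,
                                                       MultiWOZBPETextField)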
import json -from modelscope.preprocessors.space.args import str2bool - class LazyDataset(object): """ diff --git a/modelscope/preprocessors/space/preprocess.py b/modelscope/preprocessors/nlp/space/preprocess.py similarity index 92% rename from modelscope/preprocessors/space/preprocess.py rename to modelscope/preprocessors/nlp/space/preprocess.py index bd8d64d1..8aab4711 100644 --- a/modelscope/preprocessors/space/preprocess.py +++ b/modelscope/preprocessors/nlp/space/preprocess.py @@ -1,12 +1,9 @@ -""" -Preprocess script. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import glob import os -from modelscope.preprocessors.space.args import parse_args -from modelscope.preprocessors.space.fields.intent_field import \ +from modelscope.preprocessors.nlp.space.fields.intent_field import \ IntentBPETextField FILE_NAME = 'train.json' diff --git a/modelscope/preprocessors/space/sampler.py b/modelscope/preprocessors/nlp/space/sampler.py similarity index 96% rename from modelscope/preprocessors/space/sampler.py rename to modelscope/preprocessors/nlp/space/sampler.py index 49a216d1..e549c343 100644 --- a/modelscope/preprocessors/space/sampler.py +++ b/modelscope/preprocessors/nlp/space/sampler.py @@ -1,6 +1,4 @@ -""" -Sampler class. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import numpy as np diff --git a/modelscope/preprocessors/space/tensorlistdataset.py b/modelscope/preprocessors/nlp/space/tensorlistdataset.py similarity index 100% rename from modelscope/preprocessors/space/tensorlistdataset.py rename to modelscope/preprocessors/nlp/space/tensorlistdataset.py diff --git a/modelscope/preprocessors/space/tokenizer.py b/modelscope/preprocessors/nlp/space/tokenizer.py similarity index 99% rename from modelscope/preprocessors/space/tokenizer.py rename to modelscope/preprocessors/nlp/space/tokenizer.py index 87f7e8c3..1bd0ce11 100644 --- a/modelscope/preprocessors/space/tokenizer.py +++ b/modelscope/preprocessors/nlp/space/tokenizer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from __future__ import (absolute_import, division, print_function, unicode_literals) import collections diff --git a/modelscope/preprocessors/space_T_cn/__init__.py b/modelscope/preprocessors/nlp/space_T_cn/__init__.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/__init__.py rename to modelscope/preprocessors/nlp/space_T_cn/__init__.py diff --git a/modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py b/modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/preprocessors/space_T_cn/fields/database.py b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py similarity index 98% rename from modelscope/preprocessors/space_T_cn/fields/database.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/database.py index 7ae38ee2..2fef8d7e 100644 --- a/modelscope/preprocessors/space_T_cn/fields/database.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py @@ -4,7 +4,7 @@ import sqlite3 import json import tqdm -from modelscope.preprocessors.space_T_cn.fields.struct import Trie +from .struct import Trie class Database: diff --git a/modelscope/preprocessors/space_T_cn/fields/schema_link.py b/modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py similarity index 99% rename from modelscope/preprocessors/space_T_cn/fields/schema_link.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py index 4b8f9d31..b62d03e4 100644 --- a/modelscope/preprocessors/space_T_cn/fields/schema_link.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import re -from modelscope.preprocessors.space_T_cn.fields.struct import TypeInfo +from .struct import TypeInfo class SchemaLinker: diff --git a/modelscope/preprocessors/space_T_cn/fields/struct.py b/modelscope/preprocessors/nlp/space_T_cn/fields/struct.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/fields/struct.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/struct.py diff --git a/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py similarity index 96% rename from modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py rename to modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py index 63e6fd57..3aabc6a9 100644 --- a/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py @@ -8,8 +8,9 @@ from transformers import BertTokenizer from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space_T_cn.fields.database import Database -from modelscope.preprocessors.space_T_cn.fields.schema_link import SchemaLinker +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.schema_link import \ + SchemaLinker from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert diff --git a/modelscope/preprocessors/star/__init__.py b/modelscope/preprocessors/nlp/space_T_en/__init__.py similarity index 100% rename from modelscope/preprocessors/star/__init__.py rename to 
modelscope/preprocessors/nlp/space_T_en/__init__.py diff --git a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py similarity index 84% rename from modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py rename to modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py index b5dd73a9..00c7bcd7 100644 --- a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py @@ -12,9 +12,10 @@ from text2sql_lgesql.utils.example import Example from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.star.fields.preprocess_dataset import \ +from modelscope.preprocessors.nlp.space_T_en.fields import SubPreprocessor +from modelscope.preprocessors.nlp.space_T_en.fields.preprocess_dataset import \ preprocess_dataset -from modelscope.preprocessors.star.fields.process_dataset import ( +from modelscope.preprocessors.nlp.space_T_en.fields.process_dataset import ( process_dataset, process_tables) from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile @@ -56,6 +57,18 @@ class ConversationalTextToSqlPreprocessor(Preprocessor): model_dir=self.model_dir, db_dir=os.path.join(model_dir, 'db')) + self.device = 'cuda' if \ + ('device' not in kwargs or kwargs['device'] == 'gpu') \ + and torch.cuda.is_available() else 'cpu' + use_device = True if self.device == 'cuda' else False + self.processor = \ + SubPreprocessor(model_dir=model_dir, + db_content=True, + use_gpu=use_device) + self.output_tables = \ + process_tables(self.processor, + self.tables) + @type_assert(object, dict) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data diff --git a/modelscope/preprocessors/star/fields/__init__.py b/modelscope/preprocessors/nlp/space_T_en/fields/__init__.py similarity index 100% rename from modelscope/preprocessors/star/fields/__init__.py rename to modelscope/preprocessors/nlp/space_T_en/fields/__init__.py diff --git a/modelscope/preprocessors/star/fields/common_utils.py b/modelscope/preprocessors/nlp/space_T_en/fields/common_utils.py similarity index 100% rename from modelscope/preprocessors/star/fields/common_utils.py rename to modelscope/preprocessors/nlp/space_T_en/fields/common_utils.py diff --git a/modelscope/preprocessors/star/fields/parse.py b/modelscope/preprocessors/nlp/space_T_en/fields/parse.py similarity index 100% rename from modelscope/preprocessors/star/fields/parse.py rename to modelscope/preprocessors/nlp/space_T_en/fields/parse.py diff --git a/modelscope/preprocessors/star/fields/preprocess_dataset.py b/modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py similarity index 95% rename from modelscope/preprocessors/star/fields/preprocess_dataset.py rename to modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py index 6c84c0e7..a0fd13d1 100644 --- a/modelscope/preprocessors/star/fields/preprocess_dataset.py +++ b/modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py @@ -3,7 +3,7 @@ from text2sql_lgesql.preprocess.parse_raw_json import Schema, get_schemas from text2sql_lgesql.process_sql import get_sql -from modelscope.preprocessors.star.fields.parse import get_label +from .parse import get_label def 
preprocess_dataset(processor, dataset, output_tables, database_id, tables): diff --git a/modelscope/preprocessors/star/fields/process_dataset.py b/modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py similarity index 94% rename from modelscope/preprocessors/star/fields/process_dataset.py rename to modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py index d8ac094a..88059351 100644 --- a/modelscope/preprocessors/star/fields/process_dataset.py +++ b/modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py @@ -1,17 +1,12 @@ # Copyright (c) rhythmcao modified from https://github.com/rhythmcao/text2sql-lgesql. -import argparse import os import pickle import sys -import time -import json from text2sql_lgesql.asdl.asdl import ASDLGrammar from text2sql_lgesql.asdl.transition_system import TransitionSystem -from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor - sys.path.append(os.path.dirname(os.path.dirname(__file__))) diff --git a/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py new file mode 100644 index 00000000..5693d36e --- /dev/null +++ b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) +class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. + """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') + kwargs['padding'] = kwargs.get('padding', False) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + text_a, _, _ = self.parse_text_and_label(data) + + inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + # This is produced by tokenizers but is an invalid generate kwargs + if 'token_type_ids' in inputs: + del inputs['token_type_ids'] + return inputs diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py index 357a946f..4e5ba3bd 100644 --- a/modelscope/preprocessors/nlp/text_error_correction.py +++ b/modelscope/preprocessors/nlp/text_error_correction.py @@ -7,11 +7,12 @@ from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields +from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_error_correction) -class TextErrorCorrectionPreprocessor(Preprocessor): +class TextErrorCorrectionPreprocessor(NLPBasePreprocessor): """The preprocessor used in text correction task. 
""" @@ -22,7 +23,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor): Args: model_dir (str): model path """ - super().__init__(*args, **kwargs) + super().__init__(model_dir, *args, **kwargs) self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) def __call__(self, data: str) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py new file mode 100644 index 00000000..1e972d64 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os.path as osp +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(Preprocessor): + """The jieba tokenizer preprocessor used in text generation. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(*args, **kwargs) + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' + Returns: + Dict[str, Any]: the preprocessed data + Example: + {'net_input': + {'src_tokens':tensor([1,2,3,4]), + 'src_lengths': tensor([4])} + } + """ + import torch + + return { + 'input_ids': + torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) + } diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py new file mode 100644 index 00000000..238e2972 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) +class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + @staticmethod + def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: + import os + for name in os.listdir(model_dir): + full_name = os.path.join(model_dir, name) + if 'roberta' in name and os.path.isdir(full_name): + return full_name + + def build_tokenizer(self, model_dir: str): + roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) + if roberta_tokenizer_dir: + from transformers import RobertaTokenizer + return RobertaTokenizer.from_pretrained( + roberta_tokenizer_dir, do_lower_case=False) + return super().build_tokenizer(model_dir) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + if self._mode == ModeKeys.INFERENCE: + return super().__call__(data) + src_rst = super().__call__(data['src_txt']) + src_input_ids = src_rst['input_ids'] + src_attention_mask = src_rst['attention_mask'] + if 'tgt_txt' in data: + labels = super().__call__(data['tgt_txt'])['input_ids'] + else: + labels = src_input_ids[1:] + src_input_ids = src_input_ids[:-1] + src_attention_mask = src_attention_mask[:-1] + + return { + 'input_ids': src_input_ids, + 'attention_mask': src_attention_mask, + 'labels': labels, + } diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py new file mode 100644 index 00000000..2ada6892 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py @@ -0,0 +1,67 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from transformers import AutoTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_ranking) +class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in passage ranking model. 
+ """ + + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + super().__init__(model_dir, mode=mode, *args, **kwargs) + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'source_sentence') + self.second_sequence = kwargs.pop('second_sequence', + 'sentences_to_compare') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: + if isinstance(data, tuple): + sentence1, sentence2 = data + elif isinstance(data, dict): + sentence1 = data.get(self.first_sequence) + sentence2 = data.get(self.second_sequence) + if isinstance(sentence2, str): + sentence2 = [sentence2] + if isinstance(sentence1, str): + sentence1 = [sentence1] + sentence1 = sentence1 * len(sentence2) + + max_seq_length = self.sequence_length + feature = self.tokenizer( + sentence1, + sentence2, + padding='max_length', + truncation=True, + max_length=max_seq_length, + return_tensors='pt') + if 'labels' in data: + labels = data['labels'] + feature['labels'] = labels + if 'qid' in data: + qid = data['qid'] + feature['qid'] = qid + return feature diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py new file mode 100644 index 00000000..2de0c806 --- /dev/null +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -0,0 +1,261 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Tuple, Union + +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor, NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, + module_name=Preprocessors.word_segment_text_to_label_preprocessor) +class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): + """The preprocessor used to turn a single sentence to a labeled token-classification dict. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.label = kwargs.pop('label', OutputKeys.LABELS) + + def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: + data = data.split(' ') + data = list(filter(lambda x: len(x) > 0, data)) + + def produce_train_sample(words): + chars = [] + labels = [] + for word in words: + chars.extend(list(word)) + if len(word) == 1: + labels.append('S-CWS') + else: + labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) + + ['E-CWS']) + assert len(chars) == len(labels) + return chars, labels + + chars, labels = produce_train_sample(data) + return { + self.first_sequence: chars, + self.label: labels, + } + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.ner_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) +class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in normal NER task. 
+ """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + self.sequence_length = kwargs['max_length'] + self.label_all_tokens = kwargs.pop('label_all_tokens', False) + super().__init__(model_dir, mode=mode, **kwargs) + + if 'is_split_into_words' in kwargs: + self.is_split_into_words = kwargs.pop('is_split_into_words') + else: + self.is_split_into_words = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) + if 'label2id' in kwargs: + kwargs.pop('label2id') + self.tokenize_kwargs = kwargs + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # preprocess the data for the model input + text = None + labels_list = None + if isinstance(data, str): + text = data + elif isinstance(data, dict): + text = data.get(self.first_sequence) + labels_list = data.get(self.label) + + input_ids = [] + label_mask = [] + offset_mapping = [] + if self.is_split_into_words: + for offset, token in enumerate(list(data)): + subtoken_ids = self.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + else: + if self.tokenizer.is_fast: + encodings = self.tokenizer( + text, + add_special_tokens=False, + return_offsets_mapping=True, + **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + word_ids = encodings.word_ids() + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(0) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(0) + offset_mapping[-1] = ( + offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) + else: + label_mask.append(1) + offset_mapping.append(encodings['offset_mapping'][i]) + else: + encodings = self.tokenizer( + text, add_special_tokens=False, **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( + text) + + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] + + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] + + if self._mode == ModeKeys.INFERENCE: + input_ids = torch.tensor(input_ids).unsqueeze(0) + attention_mask = torch.tensor(attention_mask).unsqueeze(0) + label_mask = torch.tensor( + label_mask, dtype=torch.bool).unsqueeze(0) + + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + + # align the labels with tokenized text + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label 
to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + output['labels'] = labels + return output + + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class + + def get_label_mask_and_offset_mapping(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.tokenizer.tokenize(text) + offset = 0 + if self.get_tokenizer_class() == 'BertTokenizer': + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) + else: + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + last_is_blank = False + else: + raise NotImplementedError + + return label_mask, offset_mapping diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py new file mode 100644 index 00000000..eb3c4b37 --- /dev/null +++ b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in zero shot classification. 
+ """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + self.sequence_length = kwargs.pop('sequence_length', 512) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict], hypothesis_template: str, + candidate_labels: list) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str or dict): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + features = self.tokenizer( + pairs, + padding=True, + truncation=True, + max_length=self.sequence_length, + truncation_strategy='only_first', + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) + return features diff --git a/modelscope/preprocessors/space/fields/__init__.py b/modelscope/preprocessors/space/fields/__init__.py deleted file mode 100644 index 925eac71..00000000 --- a/modelscope/preprocessors/space/fields/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .gen_field import MultiWOZBPETextField -from .intent_field import IntentBPETextField diff --git a/modelscope/preprocessors/space/fields/dst_processors.py b/modelscope/preprocessors/space/fields/dst_processors.py deleted file mode 100644 index 22e06eec..00000000 --- a/modelscope/preprocessors/space/fields/dst_processors.py +++ /dev/null @@ -1,1523 +0,0 @@ -# -# Copyright 2020 Heinrich Heine University Duesseldorf -# -# Part of this code is based on the source code of BERT-DST -# (arXiv:1907.03040) -# Part of this code is based on the source code of Transformers -# (arXiv:1910.03771) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import re - -import json -import numpy as np -import six -from tqdm import tqdm - -logger = logging.getLogger(__name__) -USER_NAME = 'User' -SYSTEM_NAME = 'System' -DIALOG_ACT = 'Dialog_Act' - -utter1 = { - 'User-1': - "I'd really like to take my client out to a nice restaurant that serves indian food." -} -history_states1 = [ - {}, -] -utter2 = { - 'User-1': - "I'd really like to take my client out to a nice restaurant that serves indian food.", - 'System-1': - 'I show many restaurants that serve Indian food in that price range. 
What area would you like to travel to?', - 'Dialog_Act-1': { - 'Restaurant-Inform': [['choice', 'many'], ['food', 'Indian'], - ['pricerange', 'that price range']] - }, - 'User-2': - 'I am looking for an expensive indian restaurant in the area of centre.', -} - -history_states2 = [{}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}] - -utter3 = { - 'User-1': - "I'd really like to take my client out to a nice restaurant that serves indian food.", - 'System-1': - 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?', - 'Dialog_Act-1': { - 'Restaurant-Inform': [['choice', 'many'], ['food', 'Indian'], - ['pricerange', 'that price range']] - }, - 'User-2': - 'I am looking for an expensive indian restaurant in the area of centre.', - 'System-2': - 'Might I recommend Saffron Brasserie? That is an expensive Indian restaurant ' - 'in the center of town. I can book a table for you, if you like.', - 'Dialog_Act-2': { - 'Restaurant-Recommend': [['area', 'center of town'], - ['food', 'Indian'], - ['name', 'Saffron Brasserie'], - ['pricerange', 'expensive']] - }, - 'User-3': - 'Sure thing, please book for 6 people at 19:30 on Saturday.' 
-} - -history_states3 = [{}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}] - - -class DSTProcessor(object): - ACTS_DICT = { - 'taxi-depart': 'taxi-departure', - 'taxi-dest': 'taxi-destination', - 'taxi-leaveat': 'taxi-leaveAt', - 'taxi-arriveby': 'taxi-arriveBy', - 'train-depart': 'train-departure', - 'train-dest': 'train-destination', - 'train-leaveat': 'train-leaveAt', - 'train-arriveby': 'train-arriveBy', - 'train-bookpeople': 'train-book_people', - 'restaurant-price': 'restaurant-pricerange', - 'restaurant-bookpeople': 'restaurant-book_people', - 'restaurant-bookday': 'restaurant-book_day', - 'restaurant-booktime': 'restaurant-book_time', - 'hotel-price': 'hotel-pricerange', - 'hotel-bookpeople': 'hotel-book_people', - 'hotel-bookday': 'hotel-book_day', - 'hotel-bookstay': 'hotel-book_stay', - 'booking-bookpeople': 'booking-book_people', - 'booking-bookday': 'booking-book_day', - 'booking-bookstay': 'booking-book_stay', - 'booking-booktime': 'booking-book_time', - } - - LABEL_MAPS = {} # Loaded from file - - def __init__(self): - # Required for mapping slot names in dialogue_acts.json file - # to proper designations. 
- pass - - def _convert_inputs_to_utterances(self, inputs: dict, - history_states: list): - """This method is to generate the utterances with user, sys, dialog_acts and metadata, - while metadata is from the history_states or the output from the inference pipline""" - - utterances = [] - user_inputs = [] - sys_gen_inputs = [] - dialog_acts_inputs = [] - for i, item in enumerate(inputs): - name, turn = item.split('-') - if name == USER_NAME: - user_inputs.insert(int(turn) - 1, inputs[item]) - elif name == SYSTEM_NAME: - sys_gen_inputs.insert(int(turn) - 1, inputs[item]) - else: - dialog_acts_inputs.insert(int(turn) - 1, inputs[item]) - - # user is leading the topic should aways larger than sys and dialog acts - assert len(user_inputs) - 1 == len(sys_gen_inputs) - assert len(user_inputs) - 1 == len(dialog_acts_inputs) - # the history states record both user and sys states - assert len(history_states) == len(user_inputs) + len(sys_gen_inputs) - - # the dialog_act at user turn is useless - for i, item in enumerate(history_states): - utterance = {} - # the dialog_act at user turn is useless - utterance['dialog_act'] = dialog_acts_inputs[ - i // 2] if i % 2 == 1 else {} - utterance['text'] = sys_gen_inputs[ - i // 2] if i % 2 == 1 else user_inputs[i // 2] - utterance['metadata'] = item - utterance['span_info'] = [] - utterances.append(utterance) - - return utterances - - def _load_acts(self, inputs: dict, dialog_id='example.json'): - dialog_acts_inputs = [] - for i, item in enumerate(inputs): - name, turn = item.split('-') - if name == DIALOG_ACT: - dialog_acts_inputs.insert(int(turn) - 1, inputs[item]) - s_dict = {} - - for j, item in enumerate(dialog_acts_inputs): - if isinstance(item, dict): - for a in item: - aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' or \ - aa[1] == 'select' or aa[1] == 'book': - for i in item[a]: - s = i[0].lower() - v = i[1].lower().strip() - if s == 'none' or v == '?' or v == 'none': - continue - slot = aa[0] + '-' + s - if slot in self.ACTS_DICT: - slot = self.ACTS_DICT[slot] - key = dialog_id, str(int(j) + 1), slot - # In case of multiple mentioned values... - # ... Option 1: Keep first informed value - if key not in s_dict: - s_dict[key] = list([v]) - # ... Option 2: Keep last informed value - # s_dict[key] = list([v]) - - return s_dict - - -class multiwoz22Processor(DSTProcessor): - - def __init__(self): - super().__init__() - - def normalize_time(self, text): - text = re.sub(r'(\d{1})(a\.?m\.?|p\.?m\.?)', r'\1 \2', - text) # am/pm without space - text = re.sub(r'(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)', r'\1\2:00 \3', - text) # am/pm short to long form - text = re.sub( - r'(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)', - r'\1\2 \3:\4\5', text) # Missing separator - text = re.sub(r'(^| )(\d{2})[;.,](\d{2})', r'\1\2:\3', - text) # Wrong separator - text = re.sub(r'(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)', - r'\1\2 \3:00\4', text) # normalize simple full hour time - text = re.sub(r'(^| )(\d{1}:\d{2})', r'\g<1>0\2', - text) # Add missing leading 0 - # Map 12 hour times to 24 hour times - text = \ - re.sub( - r'(\d{2})(:\d{2}) ?p\.?m\.?', - lambda x: str(int(x.groups()[0]) + 12 - if int(x.groups()[0]) < 12 else int(x.groups()[0])) + x.groups()[1], text) - text = re.sub(r'(^| )24:(\d{2})', r'\g<1>00:\2', - text) # Correct times that use 24 as hour - return text - - def normalize_text(self, text): - text = self.normalize_time(text) - text = re.sub("n't", ' not', text) - text = re.sub('(^| )zero(-| )star([s.,? 
]|$)', r'\g<1>0 star\3', text) - text = re.sub('(^| )one(-| )star([s.,? ]|$)', r'\g<1>1 star\3', text) - text = re.sub('(^| )two(-| )star([s.,? ]|$)', r'\g<1>2 star\3', text) - text = re.sub('(^| )three(-| )star([s.,? ]|$)', r'\g<1>3 star\3', text) - text = re.sub('(^| )four(-| )star([s.,? ]|$)', r'\g<1>4 star\3', text) - text = re.sub('(^| )five(-| )star([s.,? ]|$)', r'\g<1>5 star\3', text) - text = re.sub('archaelogy', 'archaeology', text) # Systematic typo - text = re.sub('guesthouse', 'guest house', text) # Normalization - text = re.sub('(^| )b ?& ?b([.,? ]|$)', r'\1bed and breakfast\2', - text) # Normalization - text = re.sub('bed & breakfast', 'bed and breakfast', - text) # Normalization - return text - - # Loads the dialogue_acts.json and returns a list - # of slot-value pairs. - def load_acts(self, input_file): - with open(input_file) as f: - acts = json.load(f) - s_dict = {} - for d in acts: - for t in acts[d]: - if int(t) % 2 == 0: - continue - # Only process, if turn has annotation - if isinstance(acts[d][t]['dialog_act'], dict): - for a in acts[d][t]['dialog_act']: - aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' \ - or aa[1] == 'select' or aa[1] == 'book': - for i in acts[d][t]['dialog_act'][a]: - s = i[0].lower() - v = i[1].lower().strip() - if s == 'none' or v == '?' or v == 'none': - continue - slot = aa[0] + '-' + s - if slot in self.ACTS_DICT: - slot = self.ACTS_DICT[slot] - key = d, str(int(t) // 2 + 1), slot - # In case of multiple mentioned values... - # ... Option 1: Keep first informed value - if key not in s_dict: - s_dict[key] = list([v]) - # ... Option 2: Keep last informed value - # s_dict[key] = list([v]) - return s_dict - - # This should only contain label normalizations. All other mappings should - # be defined in LABEL_MAPS. 
- def normalize_label(self, slot, value_label): - # Normalization of empty slots - if value_label == '' or value_label == 'not mentioned': - return 'none' - - # Normalization of time slots - if 'leaveAt' in slot or 'arriveBy' in slot or slot == 'restaurant-book_time': - return self.normalize_time(value_label) - - # Normalization - if 'type' in slot or 'name' in slot or 'destination' in slot or 'departure' in slot: - value_label = re.sub('guesthouse', 'guest house', value_label) - - # Map to boolean slots - if slot == 'hotel-parking' or slot == 'hotel-internet': - if value_label == 'yes' or value_label == 'free': - return 'true' - if value_label == 'no': - return 'false' - if slot == 'hotel-type': - if value_label == 'hotel': - return 'true' - if value_label == 'guest house': - return 'false' - - return value_label - - def tokenize(self, utt): - utt_lower = convert_to_unicode(utt).lower() - utt_lower = self.normalize_text(utt_lower) - utt_tok = [ - tok for tok in map(str.strip, re.split(r'(\W+)', utt_lower)) - if len(tok) > 0 - ] - return utt_tok - - def delex_utt(self, utt, values, unk_token='[UNK]'): - utt_norm = self.tokenize(utt) - for s, vals in values.items(): - for v in vals: - if v != 'none': - v_norm = self.tokenize(v) - v_len = len(v_norm) - for i in range(len(utt_norm) + 1 - v_len): - if utt_norm[i:i + v_len] == v_norm: - utt_norm[i:i + v_len] = [unk_token] * v_len - return utt_norm - - def get_token_pos(self, tok_list, value_label): - find_pos = [] - found = False - label_list = [ - item for item in map(str.strip, re.split(r'(\W+)', value_label)) - if len(item) > 0 - ] - len_label = len(label_list) - for i in range(len(tok_list) + 1 - len_label): - if tok_list[i:i + len_label] == label_list: - find_pos.append((i, i + len_label)) # start, exclusive_end - found = True - return found, find_pos - - def check_label_existence(self, value_label, usr_utt_tok): - in_usr, usr_pos = self.get_token_pos(usr_utt_tok, value_label) - # If no hit even though there should be one, check for value label variants - if not in_usr and value_label in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[value_label]: - in_usr, usr_pos = self.get_token_pos(usr_utt_tok, - value_label_variant) - if in_usr: - break - return in_usr, usr_pos - - def check_slot_referral(self, value_label, slot, seen_slots): - referred_slot = 'none' - if slot == 'hotel-stars' or slot == 'hotel-internet' or slot == 'hotel-parking': - return referred_slot - for s in seen_slots: - # Avoid matches for slots that share values with different meaning. - # hotel-internet and -parking are handled separately as Boolean slots. 
- if s == 'hotel-stars' or s == 'hotel-internet' or s == 'hotel-parking': - continue - if re.match('(hotel|restaurant)-book_people', - s) and slot == 'hotel-book_stay': - continue - if re.match('(hotel|restaurant)-book_people', - slot) and s == 'hotel-book_stay': - continue - if slot != s and (slot not in seen_slots - or seen_slots[slot] != value_label): - if seen_slots[s] == value_label: - referred_slot = s - break - elif value_label in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[value_label]: - if seen_slots[s] == value_label_variant: - referred_slot = s - break - return referred_slot - - def is_in_list(self, tok, value): - found = False - tok_list = [ - item for item in map(str.strip, re.split(r'(\W+)', tok)) - if len(item) > 0 - ] - value_list = [ - item for item in map(str.strip, re.split(r'(\W+)', value)) - if len(item) > 0 - ] - tok_len = len(tok_list) - value_len = len(value_list) - for i in range(tok_len + 1 - value_len): - if tok_list[i:i + value_len] == value_list: - found = True - break - return found - - # Fuzzy matching to label informed slot values - def check_slot_inform(self, value_label, inform_label): - result = False - informed_value = 'none' - vl = ' '.join(self.tokenize(value_label)) - for il in inform_label: - if vl == il: - result = True - elif self.is_in_list(il, vl): - result = True - elif self.is_in_list(vl, il): - result = True - elif il in self.LABEL_MAPS: - for il_variant in self.LABEL_MAPS[il]: - if vl == il_variant: - result = True - break - elif self.is_in_list(il_variant, vl): - result = True - break - elif self.is_in_list(vl, il_variant): - result = True - break - elif vl in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[vl]: - if value_label_variant == il: - result = True - break - elif self.is_in_list(il, value_label_variant): - result = True - break - elif self.is_in_list(value_label_variant, il): - result = True - break - if result: - informed_value = il - break - return result, informed_value - - def get_turn_label(self, value_label, inform_label, sys_utt_tok, - usr_utt_tok, slot, seen_slots, slot_last_occurrence): - usr_utt_tok_label = [0 for _ in usr_utt_tok] - informed_value = 'none' - referred_slot = 'none' - if value_label == 'none' or value_label == 'dontcare' or value_label == 'true' or value_label == 'false': - class_type = value_label - else: - in_usr, usr_pos = self.check_label_existence( - value_label, usr_utt_tok) - is_informed, informed_value = self.check_slot_inform( - value_label, inform_label) - if in_usr: - class_type = 'copy_value' - if slot_last_occurrence: - (s, e) = usr_pos[-1] - for i in range(s, e): - usr_utt_tok_label[i] = 1 - else: - for (s, e) in usr_pos: - for i in range(s, e): - usr_utt_tok_label[i] = 1 - elif is_informed: - class_type = 'inform' - else: - referred_slot = self.check_slot_referral( - value_label, slot, seen_slots) - if referred_slot != 'none': - class_type = 'refer' - else: - class_type = 'unpointable' - return informed_value, referred_slot, usr_utt_tok_label, class_type - - def _create_example(self, - utterances, - sys_inform_dict, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False, - dialog_id='example.json'): - - # Collects all slot changes throughout the dialog - cumulative_labels = {slot: 'none' for slot in slot_list} - - # First system utterance is empty, since multiwoz starts with user input - 
utt_tok_list = [[]] - mod_slots_list = [] - - # Collect all utterances and their metadata - usr_sys_switch = True - turn_itr = 0 - - for utt in utterances: - # Assert that system and user utterances alternate - is_sys_utt = utt['metadata'] != {} - if usr_sys_switch == is_sys_utt: - print( - 'WARN: Wrong order of system and user utterances. Skipping rest of the dialog %s' - % (dialog_id)) - break - usr_sys_switch = is_sys_utt - - if is_sys_utt: - turn_itr += 1 - - # Delexicalize sys utterance - if delexicalize_sys_utts and is_sys_utt: - inform_dict = {slot: 'none' for slot in slot_list} - for slot in slot_list: - if (str(dialog_id), str(turn_itr), - slot) in sys_inform_dict: - inform_dict[slot] = sys_inform_dict[(str(dialog_id), - str(turn_itr), - slot)] - utt_tok_list.append( - self.delex_utt(utt['text'], inform_dict, - unk_token)) # normalize utterances - else: - utt_tok_list.append(self.tokenize( - utt['text'])) # normalize utterances - - modified_slots = {} - - # If sys utt, extract metadata (identify and collect modified slots) - if is_sys_utt: - for d in utt['metadata']: - booked = utt['metadata'][d]['book']['booked'] - booked_slots = {} - # Check the booked section - if booked != []: - for s in booked[0]: - booked_slots[s] = self.normalize_label( - '%s-%s' % (d, s), - booked[0][s]) # normalize labels - # Check the semi and the inform slots - for category in ['book', 'semi']: - for s in utt['metadata'][d][category]: - cs = '%s-book_%s' % ( - d, s) if category == 'book' else '%s-%s' % (d, - s) - value_label = self.normalize_label( - cs, utt['metadata'][d][category] - [s]) # normalize labels - # Prefer the slot value as stored in the booked section - if s in booked_slots: - value_label = booked_slots[s] - # Remember modified slots and entire dialog state - if cs in slot_list and cumulative_labels[ - cs] != value_label: - modified_slots[cs] = value_label - cumulative_labels[cs] = value_label - - mod_slots_list.append(modified_slots.copy()) - - # Form proper (usr, sys) turns - turn_itr = 0 - diag_seen_slots_dict = {} - diag_seen_slots_value_dict = {slot: 'none' for slot in slot_list} - diag_state = {slot: 'none' for slot in slot_list} - sys_utt_tok = [] - usr_utt_tok = [] - hst_utt_tok = [] - hst_utt_tok_label_dict = {slot: [] for slot in slot_list} - new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy() - new_diag_state = diag_state.copy() - - for i in range(0, len(utt_tok_list) - 1, 2): - sys_utt_tok_label_dict = {} - usr_utt_tok_label_dict = {} - value_dict = {} - inform_dict = {} - inform_slot_dict = {} - referral_dict = {} - class_type_dict = {} - - # Collect turn data - if append_history: - if swap_utterances: - hst_utt_tok = usr_utt_tok + sys_utt_tok + hst_utt_tok - else: - hst_utt_tok = sys_utt_tok + usr_utt_tok + hst_utt_tok - sys_utt_tok = utt_tok_list[i] - usr_utt_tok = utt_tok_list[i + 1] - turn_slots = mod_slots_list[ - i + 1] if len(mod_slots_list) > 1 else {} - - guid = '%s-%s-%s' % (set_type, str(dialog_id), str(turn_itr)) - - if analyze: - print('%15s %2s %s ||| %s' % - (dialog_id, turn_itr, ' '.join(sys_utt_tok), - ' '.join(usr_utt_tok))) - print('%15s %2s [' % (dialog_id, turn_itr), end='') - - new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy() - new_diag_state = diag_state.copy() - for slot in slot_list: - value_label = 'none' - if slot in turn_slots: - value_label = turn_slots[slot] - # We keep the original labels so as to not - # overlook unpointable values, as well as to not - # modify any of the original labels for test sets, - # since this would make 
comparison difficult. - value_dict[slot] = value_label - elif label_value_repetitions and slot in diag_seen_slots_dict: - value_label = diag_seen_slots_value_dict[slot] - - # Get dialog act annotations - inform_label = list(['none']) - inform_slot_dict[slot] = 0 - if (str(dialog_id), str(turn_itr), slot) in sys_inform_dict: - inform_label = list([ - self.normalize_label(slot, i) - for i in sys_inform_dict[(str(dialog_id), - str(turn_itr), slot)] - ]) - inform_slot_dict[slot] = 1 - elif (str(dialog_id), str(turn_itr), - 'booking-' + slot.split('-')[1]) in sys_inform_dict: - inform_label = list([ - self.normalize_label(slot, i) - for i in sys_inform_dict[(str(dialog_id), - str(turn_itr), 'booking-' - + slot.split('-')[1])] - ]) - inform_slot_dict[slot] = 1 - - (informed_value, referred_slot, usr_utt_tok_label, - class_type) = self.get_turn_label( - value_label, - inform_label, - sys_utt_tok, - usr_utt_tok, - slot, - diag_seen_slots_value_dict, - slot_last_occurrence=True) - - inform_dict[slot] = informed_value - - # Generally don't use span prediction on sys utterance (but inform prediction instead). - sys_utt_tok_label = [0 for _ in sys_utt_tok] - - # Determine what to do with value repetitions. - # If value is unique in seen slots, then tag it, otherwise not, - # since correct slot assignment can not be guaranteed anymore. - if label_value_repetitions and slot in diag_seen_slots_dict: - if class_type == 'copy_value' and list( - diag_seen_slots_value_dict.values()).count( - value_label) > 1: - class_type = 'none' - usr_utt_tok_label = [0 for _ in usr_utt_tok_label] - - sys_utt_tok_label_dict[slot] = sys_utt_tok_label - usr_utt_tok_label_dict[slot] = usr_utt_tok_label - - if append_history: - if use_history_labels: - if swap_utterances: - new_hst_utt_tok_label_dict[ - slot] = usr_utt_tok_label + sys_utt_tok_label + new_hst_utt_tok_label_dict[ - slot] - else: - new_hst_utt_tok_label_dict[ - slot] = sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[ - slot] - else: - new_hst_utt_tok_label_dict[slot] = [ - 0 for _ in sys_utt_tok_label + usr_utt_tok_label - + new_hst_utt_tok_label_dict[slot] - ] - - # For now, we map all occurences of unpointable slot values - # to none. However, since the labels will still suggest - # a presence of unpointable slot values, the task of the - # DST is still to find those values. It is just not - # possible to do that via span prediction on the current input. - if class_type == 'unpointable': - class_type_dict[slot] = 'none' - referral_dict[slot] = 'none' - if analyze: - if slot not in diag_seen_slots_dict or value_label != diag_seen_slots_value_dict[ - slot]: - print('(%s): %s, ' % (slot, value_label), end='') - elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[slot] \ - and class_type != 'copy_value' and class_type != 'inform': - # If slot has seen before and its class type did not change, label this slot a not present, - # assuming that the slot has not actually been mentioned in this turn. - # Exceptions are copy_value and inform. If a seen slot has been tagged as copy_value or inform, - # this must mean there is evidence in the original labels, therefore consider - # them as mentioned again. - class_type_dict[slot] = 'none' - referral_dict[slot] = 'none' - else: - class_type_dict[slot] = class_type - referral_dict[slot] = referred_slot - # Remember that this slot was mentioned during this dialog already. 
- if class_type != 'none': - diag_seen_slots_dict[slot] = class_type - diag_seen_slots_value_dict[slot] = value_label - new_diag_state[slot] = class_type - # Unpointable is not a valid class, therefore replace with - # some valid class for now... - if class_type == 'unpointable': - new_diag_state[slot] = 'copy_value' - - if analyze: - print(']') - - if swap_utterances: - txt_a = usr_utt_tok - txt_b = sys_utt_tok - txt_a_lbl = usr_utt_tok_label_dict - txt_b_lbl = sys_utt_tok_label_dict - else: - txt_a = sys_utt_tok - txt_b = usr_utt_tok - txt_a_lbl = sys_utt_tok_label_dict - txt_b_lbl = usr_utt_tok_label_dict - - example = DSTExample( - guid=guid, - text_a=txt_a, - text_b=txt_b, - history=hst_utt_tok, - text_a_label=txt_a_lbl, - text_b_label=txt_b_lbl, - history_label=hst_utt_tok_label_dict, - values=diag_seen_slots_value_dict.copy(), - inform_label=inform_dict, - inform_slot_label=inform_slot_dict, - refer_label=referral_dict, - diag_state=diag_state, - class_label=class_type_dict) - # Update some variables. - hst_utt_tok_label_dict = new_hst_utt_tok_label_dict.copy() - diag_state = new_diag_state.copy() - - turn_itr += 1 - return example - - def create_example(self, - inputs, - history_states, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False, - dialog_id='0'): - utterances = self._convert_inputs_to_utterances(inputs, history_states) - sys_inform_dict = self._load_acts(inputs) - self.LABEL_MAPS = label_maps - example = self._create_example(utterances, sys_inform_dict, set_type, - slot_list, label_maps, append_history, - use_history_labels, swap_utterances, - label_value_repetitions, - delexicalize_sys_utts, unk_token, - analyze) - - return example - - def create_examples(self, - input_file, - acts_file, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False): - """Read a DST json file into a list of DSTExample.""" - - sys_inform_dict = self.load_acts(acts_file) - - with open(input_file, 'r', encoding='utf-8') as reader: - input_data = json.load(reader) - - self.LABEL_MAPS = label_maps - - examples = [] - for dialog_id in tqdm(input_data): - entry = input_data[dialog_id] - utterances = entry['log'] - - example = self._create_example( - utterances, sys_inform_dict, set_type, slot_list, label_maps, - append_history, use_history_labels, swap_utterances, - label_value_repetitions, delexicalize_sys_utts, unk_token, - analyze) - examples.append(example) - - return examples - - -class DSTExample(object): - """ - A single training/test example for the DST dataset. 
- """ - - def __init__(self, - guid, - text_a, - text_b, - history, - text_a_label=None, - text_b_label=None, - history_label=None, - values=None, - inform_label=None, - inform_slot_label=None, - refer_label=None, - diag_state=None, - class_label=None): - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.history = history - self.text_a_label = text_a_label - self.text_b_label = text_b_label - self.history_label = history_label - self.values = values - self.inform_label = inform_label - self.inform_slot_label = inform_slot_label - self.refer_label = refer_label - self.diag_state = diag_state - self.class_label = class_label - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = '' - s += 'guid: %s' % (self.guid) - s += ', text_a: %s' % (self.text_a) - s += ', text_b: %s' % (self.text_b) - s += ', history: %s' % (self.history) - if self.text_a_label: - s += ', text_a_label: %d' % (self.text_a_label) - if self.text_b_label: - s += ', text_b_label: %d' % (self.text_b_label) - if self.history_label: - s += ', history_label: %d' % (self.history_label) - if self.values: - s += ', values: %d' % (self.values) - if self.inform_label: - s += ', inform_label: %d' % (self.inform_label) - if self.inform_slot_label: - s += ', inform_slot_label: %d' % (self.inform_slot_label) - if self.refer_label: - s += ', refer_label: %d' % (self.refer_label) - if self.diag_state: - s += ', diag_state: %d' % (self.diag_state) - if self.class_label: - s += ', class_label: %d' % (self.class_label) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - input_ids, - input_ids_unmasked, - input_mask, - segment_ids, - start_pos=None, - end_pos=None, - values=None, - inform=None, - inform_slot=None, - refer_id=None, - diag_state=None, - class_label_id=None, - guid='NONE'): - self.guid = guid - self.input_ids = input_ids - self.input_ids_unmasked = input_ids_unmasked - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_pos = start_pos - self.end_pos = end_pos - self.values = values - self.inform = inform - self.inform_slot = inform_slot - self.refer_id = refer_id - self.diag_state = diag_state - self.class_label_id = class_label_id - - -def convert_examples_to_features(examples, - slot_list, - class_types, - model_type, - tokenizer, - max_seq_length, - slot_value_dropout=0.0): - """Loads a data file into a list of `InputBatch`s.""" - - if model_type == 'bert': - model_specs = { - 'MODEL_TYPE': 'bert', - 'CLS_TOKEN': '[CLS]', - 'UNK_TOKEN': '[UNK]', - 'SEP_TOKEN': '[SEP]', - 'TOKEN_CORRECTION': 4 - } - else: - logger.error('Unknown model type (%s). Aborting.' 
% (model_type)) - exit(1) - - def _tokenize_text_and_label(text, text_label_dict, slot, tokenizer, - model_specs, slot_value_dropout): - joint_text_label = [0 for _ in text_label_dict[slot] - ] # joint all slots' label - for slot_text_label in text_label_dict.values(): - for idx, label in enumerate(slot_text_label): - if label == 1: - joint_text_label[idx] = 1 - - text_label = text_label_dict[slot] - tokens = [] - tokens_unmasked = [] - token_labels = [] - for token, token_label, joint_label in zip(text, text_label, - joint_text_label): - token = convert_to_unicode(token) - sub_tokens = tokenizer.tokenize(token) # Most time intensive step - tokens_unmasked.extend(sub_tokens) - if slot_value_dropout == 0.0 or joint_label == 0: - tokens.extend(sub_tokens) - else: - rn_list = np.random.random_sample((len(sub_tokens), )) - for rn, sub_token in zip(rn_list, sub_tokens): - if rn > slot_value_dropout: - tokens.append(sub_token) - else: - tokens.append(model_specs['UNK_TOKEN']) - token_labels.extend([token_label for _ in sub_tokens]) - assert len(tokens) == len(token_labels) - assert len(tokens_unmasked) == len(token_labels) - return tokens, tokens_unmasked, token_labels - - def _truncate_seq_pair(tokens_a, tokens_b, history, max_length): - """Truncates a sequence pair in place to the maximum length. - Copied from bert/run_classifier.py - """ - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) + len(history) - if total_length <= max_length: - break - if len(history) > 0: - history.pop() - elif len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - def _truncate_length_and_warn(tokens_a, tokens_b, history, max_seq_length, - model_specs, guid): - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP], [SEP] with "- 4" (BERT) - if len(tokens_a) + len(tokens_b) + len( - history) > max_seq_length - model_specs['TOKEN_CORRECTION']: - logger.info('Truncate Example %s. Total len=%d.' % - (guid, len(tokens_a) + len(tokens_b) + len(history))) - input_text_too_long = True - else: - input_text_too_long = False - _truncate_seq_pair(tokens_a, tokens_b, history, - max_seq_length - model_specs['TOKEN_CORRECTION']) - return input_text_too_long - - def _get_token_label_ids(token_labels_a, token_labels_b, - token_labels_history, max_seq_length, - model_specs): - token_label_ids = [] - token_label_ids.append(0) # [CLS] - for token_label in token_labels_a: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - for token_label in token_labels_b: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - for token_label in token_labels_history: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - while len(token_label_ids) < max_seq_length: - token_label_ids.append(0) # padding - assert len(token_label_ids) == max_seq_length - return token_label_ids - - def _get_start_end_pos(class_type, token_label_ids, max_seq_length): - if class_type == 'copy_value' and 1 not in token_label_ids: - # logger.warn("copy_value label, but token_label not detected. 
Setting label to 'none'.") - class_type = 'none' - start_pos = 0 - end_pos = 0 - if 1 in token_label_ids: - start_pos = token_label_ids.index(1) - # Parsing is supposed to find only first location of wanted value - if 0 not in token_label_ids[start_pos:]: - end_pos = len(token_label_ids[start_pos:]) + start_pos - 1 - else: - end_pos = token_label_ids[start_pos:].index(0) + start_pos - 1 - for i in range(max_seq_length): - if i >= start_pos and i <= end_pos: - assert token_label_ids[i] == 1 - return class_type, start_pos, end_pos - - def _get_transformer_input(tokens_a, tokens_b, history, max_seq_length, - tokenizer, model_specs): - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append(model_specs['CLS_TOKEN']) - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(0) - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(1) - for token in history: - tokens.append(token) - segment_ids.append(1) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(1) - input_ids = tokenizer.convert_tokens_to_ids(tokens) - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - return tokens, input_ids, input_mask, segment_ids - - total_cnt = 0 - too_long_cnt = 0 - - refer_list = ['none'] + slot_list - - features = [] - # Convert single example - for (example_index, example) in enumerate(examples): - if example_index % 1000 == 0: - logger.info('Writing example %d of %d' % - (example_index, len(examples))) - - total_cnt += 1 - - value_dict = {} - inform_dict = {} - inform_slot_dict = {} - refer_id_dict = {} - diag_state_dict = {} - class_label_id_dict = {} - start_pos_dict = {} - end_pos_dict = {} - for slot in slot_list: - tokens_a, tokens_a_unmasked, token_labels_a = _tokenize_text_and_label( - example.text_a, example.text_a_label, slot, tokenizer, - model_specs, slot_value_dropout) - tokens_b, tokens_b_unmasked, token_labels_b = _tokenize_text_and_label( - example.text_b, example.text_b_label, slot, tokenizer, - model_specs, slot_value_dropout) - tokens_history, tokens_history_unmasked, token_labels_history = _tokenize_text_and_label( - example.history, example.history_label, slot, tokenizer, - model_specs, slot_value_dropout) - - input_text_too_long = _truncate_length_and_warn( - tokens_a, tokens_b, tokens_history, max_seq_length, - model_specs, example.guid) - - if input_text_too_long: - if example_index < 10: - if len(token_labels_a) > len(tokens_a): - logger.info(' tokens_a truncated labels: %s' - % str(token_labels_a[len(tokens_a):])) - if len(token_labels_b) > len(tokens_b): - logger.info(' tokens_b truncated labels: %s' - % str(token_labels_b[len(tokens_b):])) - if len(token_labels_history) > len(tokens_history): - logger.info( - ' tokens_history truncated labels: %s' - % str(token_labels_history[len(tokens_history):])) - - token_labels_a = token_labels_a[:len(tokens_a)] - token_labels_b = token_labels_b[:len(tokens_b)] - token_labels_history = token_labels_history[:len(tokens_history - )] - tokens_a_unmasked = tokens_a_unmasked[:len(tokens_a)] - tokens_b_unmasked = tokens_b_unmasked[:len(tokens_b)] - tokens_history_unmasked = tokens_history_unmasked[:len( - tokens_history)] - - assert len(token_labels_a) == len(tokens_a) - assert len(token_labels_b) == len(tokens_b) - assert len(token_labels_history) == len(tokens_history) - assert len(token_labels_a) == len(tokens_a_unmasked) - assert len(token_labels_b) == len(tokens_b_unmasked) - assert len(token_labels_history) == len(tokens_history_unmasked) - token_label_ids = _get_token_label_ids(token_labels_a, - token_labels_b, - token_labels_history, - max_seq_length, model_specs) - - value_dict[slot] = example.values[slot] - inform_dict[slot] = example.inform_label[slot] - - class_label_mod, start_pos_dict[slot], end_pos_dict[ - slot] = _get_start_end_pos(example.class_label[slot], - token_label_ids, max_seq_length) - if class_label_mod != example.class_label[slot]: - example.class_label[slot] = class_label_mod - inform_slot_dict[slot] = example.inform_slot_label[slot] - refer_id_dict[slot] = refer_list.index(example.refer_label[slot]) - diag_state_dict[slot] = class_types.index(example.diag_state[slot]) - class_label_id_dict[slot] = class_types.index( - example.class_label[slot]) - - if input_text_too_long: - too_long_cnt += 1 - - tokens, input_ids, input_mask, segment_ids = _get_transformer_input( - tokens_a, tokens_b, tokens_history, max_seq_length, tokenizer, - 
model_specs) - if slot_value_dropout > 0.0: - _, input_ids_unmasked, _, _ = _get_transformer_input( - tokens_a_unmasked, tokens_b_unmasked, tokens_history_unmasked, - max_seq_length, tokenizer, model_specs) - else: - input_ids_unmasked = input_ids - - assert (len(input_ids) == len(input_ids_unmasked)) - - if example_index < 10: - logger.info('*** Example ***') - logger.info('guid: %s' % (example.guid)) - logger.info('tokens: %s' % ' '.join(tokens)) - logger.info('input_ids: %s' % ' '.join([str(x) - for x in input_ids])) - logger.info('input_mask: %s' - % ' '.join([str(x) for x in input_mask])) - logger.info('segment_ids: %s' - % ' '.join([str(x) for x in segment_ids])) - logger.info('start_pos: %s' % str(start_pos_dict)) - logger.info('end_pos: %s' % str(end_pos_dict)) - logger.info('values: %s' % str(value_dict)) - logger.info('inform: %s' % str(inform_dict)) - logger.info('inform_slot: %s' % str(inform_slot_dict)) - logger.info('refer_id: %s' % str(refer_id_dict)) - logger.info('diag_state: %s' % str(diag_state_dict)) - logger.info('class_label_id: %s' % str(class_label_id_dict)) - - features.append( - InputFeatures( - guid=example.guid, - input_ids=input_ids, - input_ids_unmasked=input_ids_unmasked, - input_mask=input_mask, - segment_ids=segment_ids, - start_pos=start_pos_dict, - end_pos=end_pos_dict, - values=value_dict, - inform=inform_dict, - inform_slot=inform_slot_dict, - refer_id=refer_id_dict, - diag_state=diag_state_dict, - class_label_id=class_label_id_dict)) - - logger.info('========== %d out of %d examples have text too long' % - (too_long_cnt, total_cnt)) - - return features - - -# From bert.tokenization (TF code) -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode('utf-8', 'ignore') - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode('utf-8', 'ignore') - elif isinstance(text, unicode): - return text - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - else: - raise ValueError('Not running on Python2 or Python 3?') - - -if __name__ == '__main__': - processor = multiwoz22Processor() - set_type = 'test' - slot_list = [ - 'taxi-leaveAt', 'taxi-destination', 'taxi-departure', 'taxi-arriveBy', - 'restaurant-book_people', 'restaurant-book_day', - 'restaurant-book_time', 'restaurant-food', 'restaurant-pricerange', - 'restaurant-name', 'restaurant-area', 'hotel-book_people', - 'hotel-book_day', 'hotel-book_stay', 'hotel-name', 'hotel-area', - 'hotel-parking', 'hotel-pricerange', 'hotel-stars', 'hotel-internet', - 'hotel-type', 'attraction-type', 'attraction-name', 'attraction-area', - 'train-book_people', 'train-leaveAt', 'train-destination', 'train-day', - 'train-arriveBy', 'train-departure' - ] - append_history = True - use_history_labels = True - swap_utterances = True - label_value_repetitions = True - delexicalize_sys_utts = True, - unk_token = '[UNK]' - analyze = False - example = processor.create_example(utter1, history_states1, set_type, - slot_list, {}, append_history, - use_history_labels, swap_utterances, - label_value_repetitions, - delexicalize_sys_utts, unk_token, - analyze) - print(f'utterances is {example}') diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index dbfe5ba7..d914489c 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ 
-12,7 +12,7 @@ if TYPE_CHECKING: MovieSceneSegmentationTrainer, ImageInpaintingTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer, TextRankingTrainer - from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer + from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer, NlpTrainerArguments from .trainer import EpochBasedTrainer else: @@ -27,7 +27,8 @@ else: ], 'multi_modal': ['CLIPTrainer'], 'nlp': ['SequenceClassificationTrainer', 'TextRankingTrainer'], - 'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'], + 'nlp_trainer': + ['NlpEpochBasedTrainer', 'VecoTrainer', 'NlpTrainerArguments'], 'trainer': ['EpochBasedTrainer'] } diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py index c8f0c7b0..a02478b9 100644 --- a/modelscope/trainers/default_config.py +++ b/modelscope/trainers/default_config.py @@ -22,7 +22,8 @@ def merge_cfg(cfg: Config): This function will pop the default CheckpointHook when the BestCkptSaverHook exists in the input cfg. - @param cfg: The input cfg to be merged into. + Args: + cfg: The input cfg to be merged into. """ cfg.merge_from_dict(DEFAULT_CONFIG, force=False) # pop duplicate hook diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index 32fb0250..ed018fef 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -47,7 +47,8 @@ class LrSchedulerHook(Hook): return lr def before_train_iter(self, trainer): - if not self.by_epoch and trainer.iter > 0: + if not self.by_epoch and trainer.iter >= getattr( + trainer, 'cumulative_iters', 1): if self.warmup_lr_scheduler is not None: self.warmup_lr_scheduler.step() else: diff --git a/modelscope/trainers/hooks/optimizer/base.py b/modelscope/trainers/hooks/optimizer/base.py index 8c61dfdb..0f38c67a 100644 --- a/modelscope/trainers/hooks/optimizer/base.py +++ b/modelscope/trainers/hooks/optimizer/base.py @@ -44,6 +44,7 @@ class OptimizerHook(Hook): def before_run(self, trainer): trainer.optimizer.zero_grad() + trainer.cumulative_iters = self.cumulative_iters def after_train_iter(self, trainer): for k in self.loss_keys: diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py index 7f1bcd63..22f2cfe6 100644 --- a/modelscope/trainers/nlp/__init__.py +++ b/modelscope/trainers/nlp/__init__.py @@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .sequence_classification_trainer import SequenceClassificationTrainer from .csanmt_translation_trainer import CsanmtTranslationTrainer - from .text_ranking_trainer import TextRankingTranier + from .text_ranking_trainer import TextRankingTrainer else: _import_structure = { 'sequence_classification_trainer': ['SequenceClassificationTrainer'], diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py index 2e59cd80..4baaddfe 100644 --- a/modelscope/trainers/nlp/space/dialog_intent_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py @@ -1,23 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates.
import os -import time -from typing import Callable, Dict, Optional, Tuple, Union +from typing import Callable, Dict, Optional import numpy as np from modelscope.metainfo import Trainers from modelscope.models.nlp.space.model.generator import SpaceGenerator from modelscope.models.nlp.space.model.model_base import SpaceModelBase -from modelscope.preprocessors.space.data_loader import \ +from modelscope.preprocessors.nlp.space.data_loader import \ get_sequential_data_loader -from modelscope.preprocessors.space.fields.intent_field import \ +from modelscope.preprocessors.nlp.space.fields.intent_field import \ IntentBPETextField -from modelscope.preprocessors.space.preprocess import intent_preprocess +from modelscope.preprocessors.nlp.space.preprocess import intent_preprocess from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp.space.trainer.intent_trainer import IntentTrainer -from modelscope.utils.config import Config +from modelscope.utils.config import Config, ModelFile from modelscope.utils.logger import get_logger PATH = None @@ -34,14 +33,6 @@ class DialogIntentTrainer(BaseTrainer): **kwargs): super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name'])) - def to_tensor(array): - """ - numpy array -> tensor - """ - import torch - array = torch.tensor(array) - return array.cuda() if self.cfg.use_gpu else array - def setup_seed(seed): import random import torch @@ -59,56 +50,70 @@ class DialogIntentTrainer(BaseTrainer): # preprocess data intent_preprocess(self.cfg.Model.init_checkpoint, self.cfg) # set reader and evaluator - bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg) + self.bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg) - self.cfg.Model.num_token_embeddings = bpe.vocab_size - self.cfg.Model.num_turn_embeddings = bpe.max_ctx_turn + 1 + self.cfg.Model.num_token_embeddings = self.bpe.vocab_size + self.cfg.Model.num_turn_embeddings = self.bpe.max_ctx_turn + 1 dataset_paths = [ os.path.join(self.cfg.Dataset.data_dir, self.cfg.Dataset.trigger_data) ] # set data and data status - collate_fn = bpe.collate_fn_multi_turn + collate_fn = self.bpe.collate_fn_multi_turn self.train_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='train') self.valid_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='valid') self.test_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='test') # set generator - generator = SpaceGenerator.create(self.cfg, reader=bpe) + self.generator = SpaceGenerator.create(self.cfg, reader=self.bpe) + self._load_model(**kwargs) + + def _load_model(self, **kwargs): + + def to_tensor(array): + """ + numpy array -> tensor + """ + import torch + array = torch.tensor(array) + return array.cuda() if self.cfg.use_gpu else array + # construct model - self.model = SpaceModelBase.create( - self.cfg.Model.init_checkpoint, - self.cfg, - reader=bpe, - generator=generator) + if 'model' in kwargs: + self.model = kwargs['model'] + else: + self.model = SpaceModelBase.create( + kwargs['model_dir'], + self.cfg, + reader=self.bpe, + 
generator=self.generator) import torch - # multi-gpu if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1: self.model = torch.nn.DataParallel(self.model) # construct trainer self.trainer = IntentTrainer( - self.model, to_tensor, self.cfg, reader=bpe) + self.model, to_tensor, self.cfg, reader=self.bpe) num_batches = len(self.train_label_loader) self.trainer.set_optimizers(num_training_steps_per_epoch=num_batches) # load model, optimizer and lr_scheduler @@ -131,6 +136,16 @@ class DialogIntentTrainer(BaseTrainer): *args, **kwargs) -> Dict[str, float]: logger.info('Evaluate') + self.cfg.do_infer = True + + # get best checkpoint path + pos = checkpoint_path.rfind('/') + checkpoint_name = checkpoint_path[pos + 1:] + checkpoint_dir = checkpoint_path[:pos] + + assert checkpoint_name == ModelFile.TORCH_MODEL_BIN_FILE + kwargs['model_dir'] = checkpoint_dir + self._load_model(**kwargs) self.trainer.infer( data_iter=self.test_label_loader, ex_data_iter=self.train_label_loader) diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py index 726404d4..aa6bb69d 100644 --- a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py @@ -9,8 +9,7 @@ import numpy as np from modelscope.metainfo import Trainers from modelscope.models.nlp.space.model.generator import SpaceGenerator from modelscope.models.nlp.space.model.model_base import SpaceModelBase -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp.space.eval import MultiWOZEvaluator diff --git a/modelscope/trainers/nlp/space/trainer/gen_trainer.py b/modelscope/trainers/nlp/space/trainer/gen_trainer.py index 34cd2f9b..05efa138 100644 --- a/modelscope/trainers/nlp/space/trainer/gen_trainer.py +++ b/modelscope/trainers/nlp/space/trainer/gen_trainer.py @@ -1,9 +1,6 @@ -""" -Trainer class. -""" -import logging +# Copyright (c) Alibaba, Inc. and its affiliates. + import os -import sys import time from collections import OrderedDict @@ -61,7 +58,7 @@ class Trainer(object): self.evaluator = evaluator self.tokenizer = reader.tokenizer - self.logger = get_logger() + self.logger = logger or get_logger() self.batch_metrics_tracker = MetricsTracker() self.token_metrics_tracker = MetricsTracker() diff --git a/modelscope/trainers/nlp/space/trainer/intent_trainer.py b/modelscope/trainers/nlp/space/trainer/intent_trainer.py index 1e6f4a2d..dc6b317b 100644 --- a/modelscope/trainers/nlp/space/trainer/intent_trainer.py +++ b/modelscope/trainers/nlp/space/trainer/intent_trainer.py @@ -1,10 +1,6 @@ -""" -Trainer class. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. 
-import logging import os -import sys import time from collections import OrderedDict @@ -16,24 +12,8 @@ from transformers.optimization import AdamW, get_linear_schedule_with_warmup from modelscope.trainers.nlp.space.metrics.metrics_tracker import \ MetricsTracker - - -def get_logger(log_path, name='default'): - logger = logging.getLogger(name) - logger.propagate = False - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter('%(message)s') - - sh = logging.StreamHandler(sys.stdout) - sh.setFormatter(formatter) - logger.addHandler(sh) - - fh = logging.FileHandler(log_path, mode='w') - fh.setFormatter(formatter) - logger.addHandler(fh) - - return logger +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger class Trainer(object): @@ -76,11 +56,7 @@ class Trainer(object): self.lr_scheduler = lr_scheduler self.optimizer = optimizer - # if not os.path.exists(self.save_dir): - # os.makedirs(self.save_dir) - - # self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer") - self.logger = logger or get_logger('trainer.log', 'trainer') + self.logger = logger or get_logger() self.batch_metrics_tracker_label = MetricsTracker() self.token_metrics_tracker_label = MetricsTracker() @@ -201,9 +177,12 @@ class Trainer(object): # Save current best model if is_best: - best_model_file = os.path.join(self.save_dir, 'best.model') + best_model_file = os.path.join(self.save_dir, + ModelFile.TORCH_MODEL_BIN_FILE) torch.save(self.model.state_dict(), best_model_file) - best_train_file = os.path.join(self.save_dir, 'best.train') + best_train_file = os.path.join( + self.save_dir, + '{}.train'.format(ModelFile.TORCH_MODEL_BIN_FILE)) torch.save(train_state, best_train_file) self.logger.info( f"Saved best model state to '{best_model_file}' with new best valid metric " @@ -215,7 +194,7 @@ class Trainer(object): def _load_model_state(): model_state_dict = torch.load( - f'{self.func_model.init_checkpoint}.model', + f'{self.func_model.init_checkpoint}', map_location=lambda storage, loc: storage) if 'module.' in list(model_state_dict.keys())[0]: @@ -303,8 +282,13 @@ class Trainer(object): self.logger.info('Loaded no model !!!') return - _load_model_state() - _load_train_state() + if self.do_train: + _load_model_state() + return + + if self.do_infer: + _load_model_state() + _load_train_state() class IntentTrainer(Trainer): @@ -719,104 +703,3 @@ class IntentTrainer(Trainer): assert 'loss' in metrics return metrics['loss'], metrics - - def load(self): - """ load """ - - def _load_model_state(): - model_state_dict = torch.load( - f'{self.func_model.init_checkpoint}', - map_location=lambda storage, loc: storage) - - if 'module.' in list(model_state_dict.keys())[0]: - new_model_state_dict = OrderedDict() - for k, v in model_state_dict.items(): - assert k[:7] == 'module.' 
- new_model_state_dict[k[7:]] = v - model_state_dict = new_model_state_dict - - new_model_state_dict = OrderedDict() - parameters = { - name: param - for name, param in self.func_model.named_parameters() - } - for name, param in model_state_dict.items(): - if name in parameters: - if param.shape != parameters[name].shape: - assert hasattr(param, 'numpy') - arr = param.numpy() - z = np.random.normal( - scale=self.func_model.initializer_range, - size=parameters[name].shape).astype('float32') - if name == 'embedder.token_embedding.weight': - z[-param.shape[0]:] = arr - print( - f'part of parameter({name}) random normlize initialize' - ) - else: - if z.shape[0] < param.shape[0]: - z = arr[:z.shape[0]] - print(f'part of parameter({name}) are dropped') - else: - z[:param.shape[0]] = arr - print( - f'part of parameter({name}) random normlize initialize' - ) - dtype, device = param.dtype, param.device - z = torch.tensor(z, dtype=dtype, device=device) - new_model_state_dict[name] = z - else: - new_model_state_dict[name] = param - else: - print(f'parameter({name}) are dropped') - model_state_dict = new_model_state_dict - - for name in parameters: - if name not in model_state_dict: - if parameters[name].requires_grad: - print(f'parameter({name}) random normlize initialize') - z = np.random.normal( - scale=self.func_model.initializer_range, - size=parameters[name].shape).astype('float32') - dtype, device = parameters[name].dtype, parameters[ - name].device - model_state_dict[name] = torch.tensor( - z, dtype=dtype, device=device) - else: - model_state_dict[name] = parameters[name] - - self.func_model.load_state_dict(model_state_dict) - self.logger.info( - f"Loaded model state from '{self.func_model.init_checkpoint}.model'" - ) - - def _load_train_state(): - train_file = f'{self.func_model.init_checkpoint}.train' - if os.path.exists(train_file): - train_state_dict = torch.load( - train_file, map_location=lambda storage, loc: storage) - self.epoch = train_state_dict['epoch'] - self.best_valid_metric = train_state_dict['best_valid_metric'] - if self.optimizer is not None and 'optimizer' in train_state_dict: - self.optimizer.load_state_dict( - train_state_dict['optimizer']) - if self.lr_scheduler is not None and 'lr_scheduler' in train_state_dict: - self.lr_scheduler.load_state_dict( - train_state_dict['lr_scheduler']) - self.logger.info( - f"Loaded train state from '{train_file}' with (epoch-{self.epoch} " - f'best_valid_metric={self.best_valid_metric:.3f})') - else: - self.logger.info('Loaded no train state') - - if self.func_model.init_checkpoint is None: - self.logger.info('Loaded no model !!!') - return - - if self.do_train: - _load_model_state() - return - - if self.do_infer: - _load_model_state() - _load_train_state() diff --git a/modelscope/trainers/nlp/text_ranking_trainer.py b/modelscope/trainers/nlp/text_ranking_trainer.py index 5da9c76a..610c36b5 100644 --- a/modelscope/trainers/nlp/text_ranking_trainer.py +++ b/modelscope/trainers/nlp/text_ranking_trainer.py @@ -12,9 +12,9 @@ from tqdm import tqdm from modelscope.metainfo import Trainers from modelscope.models.base import Model, TorchModel +from modelscope.models.nlp import BertForTextRanking from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.preprocessors.base import Preprocessor -from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer from modelscope.utils.constant import DEFAULT_MODEL_REVISION @@ -118,7 +118,6 @@ 
class TextRankingTrainer(NlpEpochBasedTrainer): Example: {"accuracy": 0.5091743119266054, "f1": 0.673780487804878} """ - from modelscope.models.nlp import TextRanking # get the raw online dataset self.eval_dataloader = self._build_dataloader_with_dataset( self.eval_dataset, @@ -127,7 +126,7 @@ class TextRankingTrainer(NlpEpochBasedTrainer): # generate a standard dataloader # generate a model if checkpoint_path is not None: - model = TextRanking.from_pretrained(checkpoint_path) + model = BertForTextRanking.from_pretrained(checkpoint_path) else: model = self.model @@ -156,13 +155,16 @@ class TextRankingTrainer(NlpEpochBasedTrainer): with torch.no_grad(): label_ids = batch.pop('labels').detach().cpu().numpy() qids = batch.pop('qid').detach().cpu().numpy() - outputs = model(batch) + outputs = model(**batch) infer_end_time = time.time() total_spent_time += infer_end_time - infer_start_time total_samples += self.eval_dataloader.batch_size - assert 'scores' in outputs - logits = outputs['scores'] + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = outputs['logits'].squeeze(-1).detach().cpu().numpy() + logits = sigmoid(logits).tolist() label_list.extend(label_ids) logits_list.extend(logits) diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index b54aa666..a19e7c7b 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -1,7 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os -from typing import Callable, Optional, Tuple, Union +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -13,15 +15,416 @@ from modelscope.metainfo import Trainers from modelscope.metrics.builder import build_metric from modelscope.models.base import Model, TorchModel from modelscope.msdatasets import MsDataset -from modelscope.preprocessors import Preprocessor, build_preprocessor -from modelscope.utils.config import Config +from modelscope.preprocessors import Preprocessor +from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys, - ModelFile, Tasks) + ModelFile) from modelscope.utils.hub import parse_label_mapping from .base import TRAINERS from .trainer import EpochBasedTrainer +@dataclass +class NlpTrainerArguments: + """The arguments for the nlp trainer. + + All the arguments listed here have None default values, which means follow the default value in the input + cfg dict. + """ + + work_dir: Optional[str] = field( + default=None, metadata={'help': 'The work dir(key: train.work_dir)'}) + + task: Optional[str] = field( + default=None, metadata={'help': 'The task type(key: task)'}) + + preprocessor_type: Optional[str] = field( + default=None, + metadata={'help': 'The preprocessor type(key: preprocessor.type)'}) + + train_first_sequence: str = field( + default=None, + metadata={ + 'help': + 'The key of first sentence for the training dataset(key:preprocessor.train.' + 'first_sequence/dataset.train.first_sequence)' + }) + + train_second_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of second sentence for the training dataset(key:preprocessor.train.' + 'second_sequence/dataset.train.second_sequence)' + }) + + train_label: str = field( + default=None, + metadata={ + 'help': + 'The key of label for the training dataset(key:preprocessor.train.' 
+ 'label/dataset.train.label)' + }) + + eval_first_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of first sentence for the eval dataset(key:preprocessor.val.' + 'first_sequence/dataset.val.first_sequence), ' + 'if not provided, the trainer will use the train_first_sequence for evaluation' + }) + + eval_second_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of second sentence for the eval dataset(key:preprocessor.val.' + 'second_sequence/dataset.val.second_sequence),' + 'if not provided, the trainer will use the train_second_sequence for evaluation' + }) + + eval_label: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of label for the eval dataset(key:preprocessor.val.' + 'label/dataset.val.label),' + 'if not provided, the trainer will use the train_label for evaluation' + }) + + labels: Optional[List] = field( + default=None, + metadata={ + 'help': + 'The labels list of the dataset(key:dataset.train.labels),' + 'This parameter has the same effect as "label2id"' + }) + + max_epochs: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The max_epochs of the training loop(key: train.max_epochs)' + }) + + train_batch_size_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The train batch size per gpu(key: train.dataloader.batch_size_per_gpu)' + }) + + train_workers_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The number of workers per gpu(key: train.dataloader.workers_per_gpu)' + }) + + train_shuffle: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Shuffle the train dataset or not(key: train.dataloader.shuffle)' + }) + + eval_batch_size_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The eval batch size per gpu(key: evaluation.dataloader.batch_size_per_gpu)' + }) + + eval_workers_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The number of workers per gpu(key: evaluation.dataloader.workers_per_gpu)' + }) + + eval_shuffle: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Shuffle the eval dataset or not(key: evaluation.dataloader.shuffle)' + }) + + optimizer_args: Optional[Dict] = field( + default=None, + metadata={'help': 'The optimizer config dict(key: train.optimizer)'}) + + lr_scheduler_args: Optional[Dict] = field( + default=None, + metadata={ + 'help': 'The lr_scheduler config dict(key: train.lr_scheduler)' + }) + + checkpoint_saving_type: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The checkpoint saving type(key: The ckpt hook dict in train.hooks), ' + 'valid options: "BestCkptSaverHook", "CheckpointHook"' + }) + + checkpoint_by_epoch: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Saving checkpoint by epoch or not(key: The by_epoch key in ' + 'ckpt hook dict in train.hooks)' + }) + + checkpoint_interval: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The checkpoint saving interval(key: The interval key in ' + 'ckpt hook dict in train.hooks)' + }) + + metric_key: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The metric key for the BestCkptSaverHook(key: The metric_key key in ' + 'ckpt hook dict in train.hooks), if the checkpoint_saving_type is "CheckpointHook" or ' + '"None", the metric_key key has no effects' + }) + + evaluation_type: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The evaluation type(key: The
evaluation hook dict in train.hooks), ' + 'valid options: "EvaluationHook", "None"' + }) + + evaluation_by_epoch: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Evaluating by epoch or not(key: The by_epoch key in ' + 'evaluation hook dict in train.hooks)' + }) + + evaluation_interval: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The evaluating interval(key: The interval key in ' + 'evaluation hook dict in train.hooks)' + }) + + metrics: Optional[List[str]] = field( + default=None, + metadata={'help': 'The metrics class keys(key: evaluation.metrics)'}) + + default_train_config = ConfigDict({ + 'work_dir': + '/tmp', + 'max_epochs': + 5, + 'dataloader': { + 'batch_size_per_gpu': 32, + 'workers_per_gpu': 0 + }, + 'optimizer': { + 'type': 'AdamW', + 'lr': 2e-5, + 'options': {} + }, + 'lr_scheduler': { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': 10000, + 'options': { + 'by_epoch': False + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'by_epoch': False, + 'interval': 100 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + }) + + def __call__(self, cfg): + """ + + Args: + cfg(`Config`): The cfg to be modified. + + Returns: + The cfg after modification. + """ + + if self.task is not None: + cfg.task = self.task + + if self.preprocessor_type is not None: + if not hasattr(cfg, 'preprocessor'): + cfg.preprocessor = ConfigDict() + cfg.preprocessor.type = self.preprocessor_type + + if self.train_first_sequence is not None or self.train_second_sequence \ + is not None or self.train_label is not None or self.labels is not None: + if not hasattr(cfg, 'dataset'): + cfg.dataset = ConfigDict() + if not hasattr(cfg.dataset, 'train'): + cfg.dataset.train = ConfigDict() + if self.train_first_sequence is not None: + cfg.dataset.train.first_sequence = self.train_first_sequence + if self.train_second_sequence is not None: + cfg.dataset.train.second_sequence = self.train_second_sequence + if self.train_label is not None: + cfg.dataset.train.label = self.train_label + if self.labels is not None: + cfg.dataset.train.labels = self.labels + + if self.eval_first_sequence is not None or self.eval_second_sequence \ + is not None or self.eval_label is not None: + if not hasattr(cfg, 'dataset'): + cfg.dataset = ConfigDict() + if not hasattr(cfg.dataset, 'val'): + cfg.dataset.val = ConfigDict() + if self.eval_first_sequence is not None: + cfg.dataset.val.first_sequence = self.eval_first_sequence + if self.eval_second_sequence is not None: + cfg.dataset.val.second_sequence = self.eval_second_sequence + if self.eval_label is not None: + cfg.dataset.val.label = self.eval_label + + if self.max_epochs is not None or self.train_batch_size_per_gpu is not None \ + or self.train_shuffle is not None or self.optimizer_args is not None \ + or self.work_dir is not None or self.lr_scheduler_args is not None\ + or self.train_workers_per_gpu is not None: + if not hasattr(cfg, 'train'): + cfg.train = deepcopy(self.default_train_config) + if not hasattr(cfg.train, 'dataloader'): + cfg.train.dataloader = deepcopy( + self.default_train_config.dataloader) + if not hasattr(cfg.train, 'optimizer'): + cfg.train.optimizer = deepcopy( + self.default_train_config.optimizer) + if not hasattr(cfg.train, 'lr_scheduler'): + cfg.train.lr_scheduler = deepcopy( + self.default_train_config.lr_scheduler) + if self.work_dir is not None: + cfg.train.work_dir = 
self.work_dir + if self.max_epochs is not None: + cfg.train.max_epochs = self.max_epochs + if self.train_batch_size_per_gpu is not None: + cfg.train.dataloader.batch_size_per_gpu = self.train_batch_size_per_gpu + if self.train_workers_per_gpu is not None: + cfg.train.dataloader.workers_per_gpu = self.train_workers_per_gpu + if self.train_shuffle is not None: + cfg.train.dataloader.shuffle = self.train_shuffle + if self.optimizer_args is not None: + if cfg.train.optimizer.type != self.optimizer_args.get( + 'type', cfg.train.optimizer.type): + cfg.train.optimizer = ConfigDict( + deepcopy(self.optimizer_args)) + else: + cfg.train.optimizer = Config._merge_a_into_b( + self.optimizer_args, cfg.train.optimizer, force=True) + if self.lr_scheduler_args is not None: + if cfg.train.lr_scheduler.type != self.lr_scheduler_args.get( + 'type', cfg.train.lr_scheduler.type): + cfg.train.lr_scheduler = ConfigDict( + deepcopy(self.lr_scheduler_args)) + else: + cfg.train.lr_scheduler = Config._merge_a_into_b( + self.lr_scheduler_args, + cfg.train.lr_scheduler, + force=True) + + if self.checkpoint_saving_type is not None or self.checkpoint_by_epoch is not None \ + or self.checkpoint_interval is not None or self.metric_key is not None: + if not any([ + self.checkpoint_saving_type == hook['type'] + for hook in cfg.train.hooks + ]): + cfg.train.hooks = list( + filter( + lambda hook: hook['type'] not in + ['CheckpointHook', 'BestCkptSaverHook'], + cfg.train.hooks)) + cfg.train.hooks.append( + deepcopy(self.default_train_config.hooks[0])) + cfg.train.hooks[-1].type = self.checkpoint_saving_type + checkpoint_hook = list( + filter( + lambda hook: hook[ + 'type'] in ['CheckpointHook', 'BestCkptSaverHook'], + cfg.train.hooks))[0] + if self.checkpoint_by_epoch is not None: + checkpoint_hook['by_epoch'] = self.checkpoint_by_epoch + if self.checkpoint_interval is not None: + checkpoint_hook['interval'] = self.checkpoint_interval + if checkpoint_hook['type'] == 'BestCkptSaverHook': + assert self.metric_key is not None, 'The metric_key must be provided ' \ + 'if the ckpt saving hook is "BestCkptSaverHook"' + checkpoint_hook['metric_key'] = self.metric_key + + if self.evaluation_type is not None or self.evaluation_by_epoch is not None \ + or self.evaluation_interval is not None or self.eval_batch_size_per_gpu is not None or \ + self.eval_shuffle is not None or self.metrics is not None: + if self.evaluation_type is not None and not any([ + self.evaluation_type == hook['type'] + for hook in cfg.train.hooks + ]): + cfg.train.hooks = list( + filter(lambda hook: hook['type'] not in ['EvaluationHook'], + cfg.train.hooks)) + if self.evaluation_type != 'None': + cfg.train.hooks.append( + deepcopy(self.default_train_config.hooks[3])) + cfg.train.hooks[-1].type = self.evaluation_type + + evaluation_hook = list( + filter(lambda hook: hook['type'] in ['EvaluationHook'], + cfg.train.hooks)) + evaluation_hook = evaluation_hook[0] if len( + evaluation_hook) > 0 else None + + if evaluation_hook is not None and self.evaluation_by_epoch is not None: + evaluation_hook['by_epoch'] = self.evaluation_by_epoch + if evaluation_hook is not None and self.evaluation_interval is not None: + evaluation_hook['interval'] = self.evaluation_interval + + if not hasattr(cfg, 'evaluation'): + cfg.evaluation = ConfigDict({ + 'dataloader': { + 'batch_size_per_gpu': 32, + 'workers_per_gpu': 0, + 'shuffle': False + } + }) + + if self.metrics is not None: + cfg.evaluation.metrics = self.metrics + if self.eval_batch_size_per_gpu is not None: + 
cfg.evaluation.dataloader.batch_size_per_gpu = self.eval_batch_size_per_gpu + if self.eval_workers_per_gpu is not None: + cfg.evaluation.dataloader.workers_per_gpu = self.eval_workers_per_gpu + if self.eval_shuffle is not None: + cfg.evaluation.dataloader.shuffle = self.eval_shuffle + + return cfg + + @TRAINERS.register_module(module_name=Trainers.nlp_base_trainer) class NlpEpochBasedTrainer(EpochBasedTrainer): @@ -80,9 +483,10 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): model) else: model_dir = snapshot_download(model, revision=model_revision) - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + if cfg_file is None: + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) else: - assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' model_dir = os.path.dirname(cfg_file) self.label2id = None @@ -91,26 +495,17 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.cfg_modify_fn = cfg_modify_fn self.cfg = self.rebuild_config(Config.from_file(cfg_file)) - label2id = parse_label_mapping(model_dir) - if label2id is not None: - self.label2id = label2id - self.id2label = {id: label for label, id in label2id.items()} - self.num_labels = len(label2id) - else: - try: - labels = self.cfg.dataset.train.labels - if labels is not None and len(labels) > 0: - self.label2id = { - label: idx - for idx, label in enumerate(labels) - } - self.id2label = { - idx: label - for idx, label in enumerate(labels) - } - self.num_labels = len(labels) - except AttributeError: - pass + try: + labels = self.cfg.dataset.train.labels + self.label2id = {label: idx for idx, label in enumerate(labels)} + self.id2label = {idx: label for idx, label in enumerate(labels)} + self.num_labels = len(labels) + except AttributeError: + label2id = parse_label_mapping(model_dir) + if label2id is not None: + self.label2id = label2id + self.id2label = {id: label for label, id in label2id.items()} + self.num_labels = len(label2id) def build_dataset_keys(cfg): if cfg is not None: @@ -185,36 +580,20 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): 'label2id': self.label2id } - field_name = Tasks.find_field_by_task(self.cfg.task) - train_preprocessor, eval_preprocessor = None, None - _train_cfg, _eval_cfg = {}, {} - - if 'type' not in self.cfg.preprocessor and ( - 'train' in self.cfg.preprocessor - or 'val' in self.cfg.preprocessor): - if 'train' in self.cfg.preprocessor: - _train_cfg = self.cfg.preprocessor.train - if 'val' in self.cfg.preprocessor: - _eval_cfg = self.cfg.preprocessor.val - else: - _train_cfg = self.cfg.preprocessor - _eval_cfg = self.cfg.preprocessor - - if len(_train_cfg): - _train_cfg.update({ - 'model_dir': self.model_dir, - **model_args, - **self.train_keys, 'mode': ModeKeys.TRAIN - }) - train_preprocessor = build_preprocessor(_train_cfg, field_name) - if len(_eval_cfg): - _eval_cfg.update({ - 'model_dir': self.model_dir, - **model_args, - **self.eval_keys, 'mode': ModeKeys.EVAL - }) - eval_preprocessor = build_preprocessor(_eval_cfg, field_name) - + train_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.TRAIN, + **model_args, + **self.train_keys, + mode=ModeKeys.TRAIN) + eval_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.EVAL, + **model_args, + **self.eval_keys, + mode=ModeKeys.EVAL) return train_preprocessor, eval_preprocessor diff 
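The dataclass whose __call__ is shown above behaves like an ordinary cfg_modify_fn: calling an instance with a Config fills any missing train/evaluation sections from default_train_config and then overrides only the fields that were explicitly set. A minimal usage sketch, assuming the dataclass is exposed as NLPTrainingArguments and that a local configuration.json exists; the field values below are purely illustrative:

>>> from modelscope.utils.config import Config
>>> args = NLPTrainingArguments(max_epochs=3, train_batch_size_per_gpu=16, work_dir='/tmp/demo')
>>> cfg = Config.from_file('configuration.json')  # hypothetical local config file
>>> cfg = args(cfg)  # same effect as passing the instance to the trainer as cfg_modify_fn
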
--git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 61d11aa6..0dc6ece4 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -4,7 +4,7 @@ import time from collections.abc import Mapping from distutils.version import LooseVersion from functools import partial -from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import json import torch @@ -22,18 +22,18 @@ from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.msdatasets.task_datasets.builder import build_task_dataset from modelscope.msdatasets.task_datasets.torch_base_dataset import \ TorchTaskDataset +from modelscope.outputs import ModelOutputBase from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import build_preprocessor from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, - ConfigKeys, Hubs, ModeKeys, ModelFile, - Tasks, TrainerStages) + ConfigKeys, ModeKeys, ModelFile, + TrainerStages) from modelscope.utils.data_utils import to_device -from modelscope.utils.device import create_device, verify_device +from modelscope.utils.device import create_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg @@ -146,7 +146,8 @@ class EpochBasedTrainer(BaseTrainer): if ConfigKeys.val in preprocessor: assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) self.eval_preprocessor = preprocessor[ConfigKeys.val] - elif hasattr(self.cfg, ConfigFields.preprocessor): + elif hasattr(self.cfg, ConfigFields.preprocessor + ) and self.cfg.preprocessor is not None: self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( ) @@ -344,23 +345,32 @@ class EpochBasedTrainer(BaseTrainer): preprocessors=preprocessor) for d in datasets ] cfg = ConfigDict( - type=self.cfg.task, mode=mode, datasets=datasets) - return build_task_dataset(cfg, self.cfg.task) + type=self.cfg.model.type, mode=mode, datasets=datasets) + task_dataset = build_task_dataset(cfg, self.cfg.task) + task_dataset.trainer = self + return task_dataset else: # avoid add no str value datasets, preprocessors in cfg task_data_build_config = ConfigDict( - mode=mode, datasets=datasets, preprocessor=preprocessor) + type=self.cfg.model.type, + mode=mode, + datasets=datasets, + preprocessor=preprocessor) task_data_build_config.update(task_data_config) - return build_task_dataset(task_data_build_config, - self.cfg.task) + task_dataset = build_task_dataset(task_data_build_config, + self.cfg.task) + task_dataset.trainer = self + return task_dataset except Exception: if isinstance(datasets, (List, Tuple)) or preprocessor is not None: - return TorchTaskDataset( + task_dataset = TorchTaskDataset( datasets, mode=mode, preprocessor=preprocessor, **(dict(type=self.cfg.model.type) if hasattr( self.cfg, 'model') else {})) + task_dataset.trainer = self + return task_dataset else: return datasets @@ -372,35 +382,12 @@ class EpochBasedTrainer(BaseTrainer): Returns: The train preprocessor and eval preprocessor instance. 
""" - field_name = Tasks.find_field_by_task(self.cfg.task) - train_preprocessor, eval_preprocessor = None, None - _train_cfg, _eval_cfg = {}, {} - _dafault_args = {'model_dir': self.model_dir} - - if 'type' not in self.cfg.preprocessor and ( - 'train' in self.cfg.preprocessor - or 'val' in self.cfg.preprocessor): - if 'train' in self.cfg.preprocessor: - _train_cfg = self.cfg.preprocessor.train - if 'val' in self.cfg.preprocessor: - _eval_cfg = self.cfg.preprocessor.val - else: - _train_cfg = self.cfg.preprocessor - _eval_cfg = self.cfg.preprocessor - - if len(_train_cfg): - if isinstance(_train_cfg, Sequence): - # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, - # and add mode for Compose or other plans - raise NotImplementedError('Not supported yet!') - _train_cfg.update(_dafault_args) - train_preprocessor = build_preprocessor(_train_cfg, field_name) - if len(_eval_cfg): - if isinstance(_eval_cfg, Sequence): - raise NotImplementedError('Not supported yet!') - _eval_cfg.update(_dafault_args) - eval_preprocessor = build_preprocessor(_eval_cfg, field_name) - + train_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.TRAIN) + eval_preprocessor = Preprocessor.from_pretrained( + self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.EVAL) return train_preprocessor, eval_preprocessor def get_metrics(self) -> List[Union[str, Dict]]: @@ -547,6 +534,8 @@ class EpochBasedTrainer(BaseTrainer): else: train_outputs = model.forward(inputs) + if isinstance(train_outputs, ModelOutputBase): + train_outputs = train_outputs.to_dict() if not isinstance(train_outputs, dict): raise TypeError('"model.forward()" must return a dict') @@ -650,8 +639,9 @@ class EpochBasedTrainer(BaseTrainer): """ # TODO: support MsDataset load for cv if hasattr(data_cfg, 'name'): + dataset_name = data_cfg.pop('name') dataset = MsDataset.load( - dataset_name=data_cfg.pop('name'), + dataset_name=dataset_name, **data_cfg, ) cfg = ConfigDict(type=self.cfg.model.type, mode=mode) diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index a9d7f396..2a7520f2 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -207,6 +207,6 @@ def save_pretrained(model, # Dump the config to the configuration.json if ConfigFields.pipeline not in config: config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} - cfg_str = json.dumps(config, cls=JSONIteratorEncoder) + cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) config_file = os.path.join(target_folder, ModelFile.CONFIGURATION) storage.write(cfg_str.encode(), config_file) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 50a1c016..6a9d6fd5 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -115,7 +115,6 @@ class NLPTasks(object): dialog_intent_prediction = 'dialog-intent-prediction' dialog_state_tracking = 'dialog-state-tracking' table_question_answering = 'table-question-answering' - sentence_embedding = 'sentence-embedding' fill_mask = 'fill-mask' text_summarization = 'text-summarization' question_answering = 'question-answering' diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 2dbe7045..105b3ffa 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -82,7 +82,8 @@ def get_model_type(model_dir): this file does not exist, the method will try to get the 'model_type' field from the config.json. - @param model_dir: The local model dir to use. 
@return: The model type + Args: + model_dir: The local model dir to use. @return: The model type string, returns None if nothing is found. """ try: @@ -112,8 +113,11 @@ def parse_label_mapping(model_dir): 2. Try to read label-id mapping from the configuration.json 3. Try to read label-id mapping from the config.json - @param model_dir: The local model dir to use. - @return: The label2id mapping if found. + Args: + model_dir: The local model dir to use. + + Returns: + The label2id mapping if found. """ import json import os diff --git a/modelscope/utils/nlp/space/args.py b/modelscope/utils/nlp/space/args.py index d9e91e74..c92401c5 100644 --- a/modelscope/utils/nlp/space/args.py +++ b/modelscope/utils/nlp/space/args.py @@ -1,6 +1,4 @@ -""" -Parse argument. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import argparse diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py index 4578ccc4..2c971b10 100644 --- a/modelscope/utils/nlp/space/clean_dataset.py +++ b/modelscope/utils/nlp/space/clean_dataset.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import re diff --git a/modelscope/utils/nlp/space/criterions.py b/modelscope/utils/nlp/space/criterions.py index 60f98457..82ef4ba5 100644 --- a/modelscope/utils/nlp/space/criterions.py +++ b/modelscope/utils/nlp/space/criterions.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import torch import torch.nn.functional as F from torch.nn.modules.loss import _Loss diff --git a/modelscope/utils/nlp/space/db_ops.py b/modelscope/utils/nlp/space/db_ops.py index 880b018b..d1d14ef9 100644 --- a/modelscope/utils/nlp/space/db_ops.py +++ b/modelscope/utils/nlp/space/db_ops.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import random import sqlite3 diff --git a/modelscope/utils/nlp/space/ontology.py b/modelscope/utils/nlp/space/ontology.py index 99b084bb..c55d12e1 100644 --- a/modelscope/utils/nlp/space/ontology.py +++ b/modelscope/utils/nlp/space/ontology.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + all_domains = [ 'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'police', 'hospital' ] diff --git a/modelscope/utils/nlp/space/scores.py b/modelscope/utils/nlp/space/scores.py index fe0a8a17..eb6dd41c 100644 --- a/modelscope/utils/nlp/space/scores.py +++ b/modelscope/utils/nlp/space/scores.py @@ -1,3 +1,6 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + def hierarchical_set_score(frame1, frame2): # deal with empty frame if not (frame1 and frame2): diff --git a/modelscope/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py index 81d1b1c5..56e67671 100644 --- a/modelscope/utils/nlp/space/utils.py +++ b/modelscope/utils/nlp/space/utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import logging from collections import OrderedDict diff --git a/modelscope/utils/nlp/space/utils_dst.py b/modelscope/utils/nlp/space/utils_dst.py index 2a7e67d7..6277172e 100644 --- a/modelscope/utils/nlp/space/utils_dst.py +++ b/modelscope/utils/nlp/space/utils_dst.py @@ -1,3 +1,29 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import List + +from modelscope.outputs import OutputKeys +from modelscope.pipelines.nlp import DialogStateTrackingPipeline + + +def tracking_and_print_dialog_states( + test_case, pipelines: List[DialogStateTrackingPipeline]): + import json + pipelines_len = len(pipelines) + history_states = [{}] + utter = {} + for step, item in enumerate(test_case): + utter.update(item) + result = pipelines[step % pipelines_len]({ + 'utter': + utter, + 'history_states': + history_states + }) + print(json.dumps(result)) + + history_states.extend([result[OutputKeys.OUTPUT], {}]) + + def batch_to_device(batch, device): batch_on_device = [] for element in batch: diff --git a/modelscope/utils/nlp/space_T_en/__init__.py b/modelscope/utils/nlp/space_T_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/space_T_en/utils.py similarity index 52% rename from modelscope/utils/nlp/nlp_utils.py rename to modelscope/utils/nlp/space_T_en/utils.py index bfeaf924..d884c241 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/space_T_en/utils.py @@ -1,8 +1,9 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import List from modelscope.outputs import OutputKeys -from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, - DialogStateTrackingPipeline) +from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline def text2sql_tracking_and_print_results( @@ -22,22 +23,3 @@ def text2sql_tracking_and_print_results( print(results) last_sql = results[OutputKeys.OUTPUT][OutputKeys.TEXT] history.append(item) - - -def tracking_and_print_dialog_states( - test_case, pipelines: List[DialogStateTrackingPipeline]): - import json - pipelines_len = len(pipelines) - history_states = [{}] - utter = {} - for step, item in enumerate(test_case): - utter.update(item) - result = pipelines[step % pipelines_len]({ - 'utter': - utter, - 'history_states': - history_states - }) - print(json.dumps(result)) - - history_states.extend([result[OutputKeys.OUTPUT], {}]) diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index d6994bd3..5284aa43 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -74,6 +74,7 @@ class Registry(object): raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') self._modules[group_key][module_name] = module_cls + module_cls.group_key = group_key def register_module(self, group_key: str = default_group, diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 3c1e5c1c..8045d3e9 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -7,6 +7,7 @@ import pickle import random import shutil import tempfile +from collections import OrderedDict from collections.abc import Mapping from pathlib import Path from types import FunctionType @@ -14,6 +15,7 @@ from typing import Any, Dict, Union import json import numpy as np +import torch import torch.optim from torch import nn @@ -69,9 +71,10 @@ class RegressTool: **kwargs): """Monitor a pytorch module in a single forward. - @param module: A torch module - @param file_name: The file_name to store or load file - @param compare_fn: A custom fn used to compare the results manually. + Args: + module: A torch module + file_name: The file_name to store or load file + compare_fn: A custom fn used to compare the results manually. 
>>> def compare_fn(v1, v2, key, type): >>> return None @@ -80,6 +83,10 @@ class RegressTool: v2 is the value of current version key is the key of submodules type is in one of 'input', 'output' + + kwargs: + atol: The absolute gap between two np arrays. + rtol: The relative gap between two np arrays. """ baseline = os.getenv('REGRESSION_BASELINE') if baseline is None or self.baseline is None: @@ -144,20 +151,24 @@ class RegressTool: This is usually useful when you try to change some dangerous code which has the risk of affecting the training loop. - @param trainer: A dict or an object contains the model/optimizer/lr_scheduler - @param file_name: The file_name to store or load file - @param level: The regression level. + Args: + trainer: A dict or an object contains the model/optimizer/lr_scheduler + file_name: The file_name to store or load file + level: The regression level. 'strict' for matching every single tensor. Please make sure the parameters of head are fixed and the drop-out rate is zero. 'config' for matching the initial config, like cfg file, optimizer param_groups, lr_scheduler params and the random seed. 'metric' for compare the best metrics in the evaluation loop. - @param compare_fn: A custom fn used to compare the results manually. - @param ignore_keys: The keys to ignore of the named_parameters. - @param compare_random: If to compare random setttings, default True. - @param reset_dropout: Reset all dropout modules to 0.0. - @param lazy_stop_callback: A callback passed in, when the moniting is over, this callback will be called. + compare_fn: A custom fn used to compare the results manually. + ignore_keys: The keys to ignore of the named_parameters. + compare_random: Whether to compare random settings, default True. + reset_dropout: Reset all dropout modules to 0.0. + lazy_stop_callback: A callback passed in; when the monitoring is over, this callback will be called. + kwargs: + atol: The absolute gap between two np arrays. + rtol: The relative gap between two np arrays. >>> def compare_fn(v1, v2, key, type): >>> return None @@ -353,16 +364,22 @@ def compare_module(module1: nn.Module, module2: nn.Module): def numpify_tensor_nested(tensors, reduction=None, clip_value=10000): - import torch + try: + from modelscope.outputs import ModelOutputBase + except ImportError: + ModelOutputBase = dict "Numpify `tensors` (even if it's a nested list/tuple of tensors)." - if isinstance(tensors, (list, tuple)): - return type(tensors)( - numpify_tensor_nested(t, reduction, clip_value) for t in tensors) - if isinstance(tensors, Mapping): - return { + if isinstance(tensors, (Mapping, ModelOutputBase)): + return OrderedDict({ k: numpify_tensor_nested(t, reduction, clip_value) for k, t in tensors.items() - } + }) + if isinstance(tensors, list): + return list( + numpify_tensor_nested(t, reduction, clip_value) for t in tensors) + if isinstance(tensors, tuple): + return tuple( + numpify_tensor_nested(t, reduction, clip_value) for t in tensors) if isinstance(tensors, torch.Tensor): t: np.ndarray = tensors.cpu().numpy() if clip_value is not None: @@ -377,12 +394,19 @@ def detach_tensor_nested(tensors): - import torch + try: + from modelscope.outputs import ModelOutputBase + except ImportError: + ModelOutputBase = dict "Detach `tensors` (even if it's a nested list/tuple of tensors)." 
- if isinstance(tensors, (list, tuple)): - return type(tensors)(detach_tensor_nested(t) for t in tensors) - if isinstance(tensors, Mapping): - return {k: detach_tensor_nested(t) for k, t in tensors.items()} + if isinstance(tensors, (Mapping, ModelOutputBase)): + return OrderedDict( + {k: detach_tensor_nested(t) + for k, t in tensors.items()}) + if isinstance(tensors, list): + return list(detach_tensor_nested(t) for t in tensors) + if isinstance(tensors, tuple): + return tuple(detach_tensor_nested(t) for t in tensors) if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index 406d671f..8f580d19 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -8,8 +8,11 @@ def torch_nested_numpify(tensors): NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. - @param tensors: Nested torch tensors. - @return: The numpify tensors. + Args: + tensors: Nested torch tensors. + + Returns: + The numpify tensors. """ import torch @@ -30,8 +33,11 @@ def torch_nested_detach(tensors): NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. - @param tensors: Nested torch tensors. - @return: The detached tensors. + Args: + tensors: Nested torch tensors. + + Returns: + The detached tensors. """ import torch diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py index 97926539..0e4f8349 100644 --- a/tests/export/test_export_sbert_sequence_classification.py +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -3,9 +3,10 @@ import os import shutil import tempfile import unittest +from collections import OrderedDict from modelscope.exporters import Exporter, TorchModelExporter -from modelscope.models.base import Model +from modelscope.models import Model from modelscope.utils.test_utils import test_level @@ -27,10 +28,42 @@ class TestExportSbertSequenceClassification(unittest.TestCase): model = Model.from_pretrained(self.model_id) print( Exporter.from_model(model).export_onnx( - shape=(2, 256), outputs=self.tmp_dir)) + shape=(2, 256), output_dir=self.tmp_dir)) print( TorchModelExporter.from_model(model).export_torch_script( - shape=(2, 256), outputs=self.tmp_dir)) + shape=(2, 256), output_dir=self.tmp_dir)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_export_outer_module(self): + from transformers import BertForSequenceClassification, BertTokenizerFast + model = BertForSequenceClassification.from_pretrained( + 'bert-base-uncased') + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + dummy_inputs = tokenizer( + tokenizer.unk_token, + padding='max_length', + max_length=256, + return_tensors='pt') + dynamic_axis = {0: 'batch', 1: 'sequence'} + inputs = OrderedDict([ + ('input_ids', dynamic_axis), + ('attention_mask', dynamic_axis), + ('token_type_ids', dynamic_axis), + ]) + outputs = OrderedDict({'logits': {0: 'batch'}}) + output_files = TorchModelExporter().export_onnx( + model=model, + dummy_inputs=dummy_inputs, + inputs=inputs, + outputs=outputs, + output_dir='/tmp') + print(output_files) + output_files = TorchModelExporter().export_torch_script( + model=model, + dummy_inputs=dummy_inputs, + output_dir='/tmp', + strict=False) + print(output_files) if __name__ == '__main__': diff --git 
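The regress-test helpers changed above now route any Mapping (and ModelOutputBase, when it is importable) through an OrderedDict while preserving list/tuple nesting, and only leaf torch.Tensor values are detached and converted to numpy. A small round-trip sketch with toy tensors that are not taken from the repository:

>>> import torch
>>> from modelscope.utils.regress_test_utils import detach_tensor_nested, numpify_tensor_nested
>>> nested = {'logits': torch.ones(2, 2), 'hidden': [torch.zeros(3), torch.zeros(3)]}
>>> detached = detach_tensor_nested(nested)   # the Mapping comes back as an OrderedDict
>>> arrays = numpify_tensor_nested(detached)  # tensors become numpy arrays, clipped at clip_value
>>> arrays['logits'].shape
(2, 2)
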
a/tests/hub/test_download_dataset.py b/tests/hub/test_download_dataset.py new file mode 100644 index 00000000..29b5d1ab --- /dev/null +++ b/tests/hub/test_download_dataset.py @@ -0,0 +1,709 @@ +import unittest + +from modelscope.msdatasets import MsDataset +from modelscope.utils.test_utils import test_level + + +class DownloadDatasetTest(unittest.TestCase): + + def setUp(self): + self.subset_count = 10 + + def download_subset(self, dataset, subset_name): + dataset = MsDataset.load(dataset, subset_name=subset_name) + if isinstance(dataset, MsDataset): + lens = len(dataset) + print(f'dataset {subset_name} len: {lens}') + self.assertTrue(lens > 0) + else: + assert isinstance(dataset, dict) + lens = {key: len(subset) for key, subset in dataset.items()} + print(f'dataset {subset_name} len: {lens}') + self.assertTrue(all([_len > 0 for _len in lens.values()])) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_glue(self): + subset = [ + 'cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', + 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax' + ] + for subset_name in subset: + self.download_subset('glue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_super_glue(self): + subset = [ + 'boolq', 'cb', 'copa', 'multirc', 'record', 'rte', 'wic', 'wsc', + 'wsc.fixed', 'axb', 'axg' + ] + for subset_name in subset: + self.download_subset('super_glue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_nllb(self): + subset = [ + 'ace_Latn-ban_Latn', 'ace_Latn-bjn_Latn', 'ace_Latn-bug_Latn', + 'ace_Latn-ceb_Latn', 'ace_Latn-eng_Latn', 'ace_Latn-fij_Latn', + 'ace_Latn-ilo_Latn', 'ace_Latn-jav_Latn', 'ace_Latn-min_Latn', + 'ace_Latn-mri_Latn', 'ace_Latn-pag_Latn', 'ace_Latn-plt_Latn', + 'ace_Latn-smo_Latn', 'ace_Latn-sun_Latn', 'ace_Latn-war_Latn', + 'afr_Latn-aka_Latn', 'afr_Latn-amh_Ethi', 'afr_Latn-bam_Latn', + 'afr_Latn-bem_Latn', 'afr_Latn-cjk_Latn', 'afr_Latn-dik_Latn', + 'afr_Latn-dyu_Latn', 'afr_Latn-eng_Latn', 'afr_Latn-ewe_Latn', + 'afr_Latn-fon_Latn', 'afr_Latn-fra_Latn', 'afr_Latn-fuv_Latn', + 'afr_Latn-gaz_Latn', 'afr_Latn-hau_Latn', 'afr_Latn-ibo_Latn', + 'afr_Latn-kam_Latn', 'afr_Latn-kik_Latn', 'afr_Latn-kin_Latn', + 'afr_Latn-kmb_Latn', 'afr_Latn-knc_Arab', 'afr_Latn-knc_Latn', + 'afr_Latn-kon_Latn', 'afr_Latn-lin_Latn', 'afr_Latn-lua_Latn', + 'afr_Latn-lug_Latn', 'afr_Latn-luo_Latn', 'afr_Latn-nso_Latn', + 'afr_Latn-nus_Latn', 'afr_Latn-nya_Latn', 'afr_Latn-run_Latn', + 'afr_Latn-sna_Latn', 'afr_Latn-som_Latn', 'afr_Latn-sot_Latn', + 'afr_Latn-ssw_Latn', 'afr_Latn-swh_Latn', 'afr_Latn-tir_Ethi', + 'afr_Latn-tsn_Latn', 'afr_Latn-tso_Latn', 'afr_Latn-tum_Latn', + 'afr_Latn-twi_Latn', 'afr_Latn-umb_Latn', 'afr_Latn-wol_Latn', + 'afr_Latn-xho_Latn', 'afr_Latn-yor_Latn', 'afr_Latn-zul_Latn', + 'aka_Latn-amh_Ethi', 'aka_Latn-bam_Latn', 'aka_Latn-bem_Latn', + 'aka_Latn-cjk_Latn', 'aka_Latn-dik_Latn', 'aka_Latn-dyu_Latn', + 'aka_Latn-eng_Latn', 'aka_Latn-ewe_Latn', 'aka_Latn-fon_Latn', + 'aka_Latn-fra_Latn', 'aka_Latn-fuv_Latn', 'aka_Latn-gaz_Latn', + 'aka_Latn-hau_Latn', 'aka_Latn-ibo_Latn', 'aka_Latn-kam_Latn', + 'aka_Latn-kik_Latn', 'aka_Latn-kin_Latn', 'aka_Latn-kmb_Latn', + 'aka_Latn-knc_Arab', 'aka_Latn-knc_Latn', 'aka_Latn-kon_Latn', + 'aka_Latn-lin_Latn', 'aka_Latn-lua_Latn', 'aka_Latn-lug_Latn', + 'aka_Latn-luo_Latn', 'aka_Latn-nso_Latn', 'aka_Latn-nus_Latn', + 'aka_Latn-nya_Latn', 'aka_Latn-run_Latn', 
'aka_Latn-sna_Latn', + 'aka_Latn-som_Latn', 'aka_Latn-sot_Latn', 'aka_Latn-ssw_Latn', + 'aka_Latn-swh_Latn', 'aka_Latn-tir_Ethi', 'aka_Latn-tsn_Latn', + 'aka_Latn-tso_Latn', 'aka_Latn-tum_Latn', 'aka_Latn-twi_Latn', + 'aka_Latn-umb_Latn', 'aka_Latn-wol_Latn', 'aka_Latn-xho_Latn', + 'aka_Latn-yor_Latn', 'aka_Latn-zul_Latn', 'amh_Ethi-bam_Latn', + 'amh_Ethi-bem_Latn', 'amh_Ethi-cjk_Latn', 'amh_Ethi-dik_Latn', + 'amh_Ethi-dyu_Latn', 'amh_Ethi-eng_Latn', 'amh_Ethi-ewe_Latn', + 'amh_Ethi-fon_Latn', 'amh_Ethi-fra_Latn', 'amh_Ethi-fuv_Latn', + 'amh_Ethi-gaz_Latn', 'amh_Ethi-hau_Latn', 'amh_Ethi-ibo_Latn', + 'amh_Ethi-kam_Latn', 'amh_Ethi-kik_Latn', 'amh_Ethi-kin_Latn', + 'amh_Ethi-kmb_Latn', 'amh_Ethi-knc_Arab', 'amh_Ethi-knc_Latn', + 'amh_Ethi-kon_Latn', 'amh_Ethi-lin_Latn', 'amh_Ethi-lua_Latn', + 'amh_Ethi-lug_Latn', 'amh_Ethi-luo_Latn', 'amh_Ethi-nso_Latn', + 'amh_Ethi-nus_Latn', 'amh_Ethi-nya_Latn', 'amh_Ethi-run_Latn', + 'amh_Ethi-sna_Latn', 'amh_Ethi-som_Latn', 'amh_Ethi-sot_Latn', + 'amh_Ethi-ssw_Latn', 'amh_Ethi-swh_Latn', 'amh_Ethi-tir_Ethi', + 'amh_Ethi-tsn_Latn', 'amh_Ethi-tso_Latn', 'amh_Ethi-tum_Latn', + 'amh_Ethi-twi_Latn', 'amh_Ethi-umb_Latn', 'amh_Ethi-wol_Latn', + 'amh_Ethi-xho_Latn', 'amh_Ethi-yor_Latn', 'amh_Ethi-zul_Latn', + 'arb_Arab-ckb_Arab', 'arb_Arab-crh_Latn', 'arb_Arab-dik_Latn', + 'arb_Arab-diq_Latn', 'arb_Arab-fuv_Latn', 'arb_Arab-kmr_Latn', + 'arb_Arab-knc_Latn', 'arb_Arab-nus_Latn', 'arb_Arab-som_Latn', + 'arb_Arab-tat_Cyrl', 'arb_Arab-tzm_Tfng', 'arb_Arab-urd_Arab', + 'arb_Arab-wol_Latn', 'asm_Beng-awa_Deva', 'asm_Beng-ben_Beng', + 'asm_Beng-bho_Deva', 'asm_Beng-eng_Latn', 'asm_Beng-guj_Gujr', + 'asm_Beng-hin_Deva', 'asm_Beng-hne_Deva', 'asm_Beng-kan_Knda', + 'asm_Beng-kas_Arab', 'asm_Beng-kas_Deva', 'asm_Beng-mag_Deva', + 'asm_Beng-mai_Deva', 'asm_Beng-mal_Mlym', 'asm_Beng-mar_Deva', + 'asm_Beng-npi_Deva', 'asm_Beng-ory_Orya', 'asm_Beng-pan_Guru', + 'asm_Beng-san_Deva', 'asm_Beng-sat_Beng', 'asm_Beng-sin_Sinh', + 'asm_Beng-snd_Arab', 'asm_Beng-tam_Taml', 'asm_Beng-tel_Telu', + 'asm_Beng-urd_Arab', 'awa_Deva-ben_Beng', 'awa_Deva-bho_Deva', + 'awa_Deva-eng_Latn', 'awa_Deva-guj_Gujr', 'awa_Deva-hin_Deva', + 'awa_Deva-hne_Deva', 'awa_Deva-kan_Knda', 'awa_Deva-kas_Arab', + 'awa_Deva-kas_Deva', 'awa_Deva-mag_Deva', 'awa_Deva-mai_Deva', + 'awa_Deva-mal_Mlym', 'awa_Deva-mar_Deva', 'awa_Deva-npi_Deva', + 'awa_Deva-ory_Orya', 'awa_Deva-pan_Guru', 'awa_Deva-san_Deva', + 'awa_Deva-sat_Beng', 'awa_Deva-sin_Sinh', 'awa_Deva-snd_Arab', + 'awa_Deva-tam_Taml', 'awa_Deva-tel_Telu', 'awa_Deva-urd_Arab', + 'ayr_Latn-eng_Latn', 'ayr_Latn-spa_Latn', 'azb_Arab-eng_Latn', + 'azj_Latn-eng_Latn', 'azj_Latn-rus_Cyrl', 'bak_Cyrl-crh_Latn', + 'bak_Cyrl-eng_Latn', 'bak_Cyrl-kir_Cyrl', 'bak_Cyrl-rus_Cyrl', + 'bak_Cyrl-tat_Cyrl', 'bak_Cyrl-tuk_Latn', 'bak_Cyrl-uig_Arab', + 'bak_Cyrl-uzn_Latn', 'bam_Latn-bem_Latn', 'bam_Latn-cjk_Latn', + 'bam_Latn-dik_Latn', 'bam_Latn-dyu_Latn', 'bam_Latn-eng_Latn', + 'bam_Latn-ewe_Latn', 'bam_Latn-fon_Latn', 'bam_Latn-fra_Latn', + 'bam_Latn-fuv_Latn', 'bam_Latn-gaz_Latn', 'bam_Latn-hau_Latn', + 'bam_Latn-ibo_Latn', 'bam_Latn-kam_Latn', 'bam_Latn-kik_Latn', + 'bam_Latn-kin_Latn', 'bam_Latn-kmb_Latn', 'bam_Latn-knc_Arab', + 'bam_Latn-knc_Latn', 'bam_Latn-kon_Latn', 'bam_Latn-lin_Latn', + 'bam_Latn-lua_Latn', 'bam_Latn-lug_Latn', 'bam_Latn-luo_Latn', + 'bam_Latn-nso_Latn', 'bam_Latn-nus_Latn', 'bam_Latn-nya_Latn', + 'bam_Latn-run_Latn', 'bam_Latn-sna_Latn', 'bam_Latn-som_Latn', + 'bam_Latn-sot_Latn', 'bam_Latn-ssw_Latn', 'bam_Latn-swh_Latn', + 'bam_Latn-tir_Ethi', 
'bam_Latn-tsn_Latn', 'bam_Latn-tso_Latn', + 'bam_Latn-tum_Latn', 'bam_Latn-twi_Latn', 'bam_Latn-umb_Latn', + 'bam_Latn-wol_Latn', 'bam_Latn-xho_Latn', 'bam_Latn-yor_Latn', + 'bam_Latn-zul_Latn', 'ban_Latn-bjn_Latn', 'ban_Latn-bug_Latn', + 'ban_Latn-ceb_Latn', 'ban_Latn-eng_Latn', 'ban_Latn-fij_Latn', + 'ban_Latn-ilo_Latn', 'ban_Latn-jav_Latn', 'ban_Latn-min_Latn', + 'ban_Latn-mri_Latn', 'ban_Latn-pag_Latn', 'ban_Latn-plt_Latn', + 'ban_Latn-smo_Latn', 'ban_Latn-sun_Latn', 'ban_Latn-war_Latn', + 'bel_Cyrl-eng_Latn', 'bel_Cyrl-rus_Cyrl', 'bem_Latn-cjk_Latn', + 'bem_Latn-dik_Latn', 'bem_Latn-dyu_Latn', 'bem_Latn-eng_Latn', + 'bem_Latn-ewe_Latn', 'bem_Latn-fon_Latn', 'bem_Latn-fra_Latn', + 'bem_Latn-fuv_Latn', 'bem_Latn-gaz_Latn', 'bem_Latn-hau_Latn', + 'bem_Latn-ibo_Latn', 'bem_Latn-kam_Latn', 'bem_Latn-kik_Latn', + 'bem_Latn-kin_Latn', 'bem_Latn-kmb_Latn', 'bem_Latn-knc_Arab', + 'bem_Latn-knc_Latn', 'bem_Latn-kon_Latn', 'bem_Latn-lin_Latn', + 'bem_Latn-lua_Latn', 'bem_Latn-lug_Latn', 'bem_Latn-luo_Latn', + 'bem_Latn-nso_Latn', 'bem_Latn-nus_Latn', 'bem_Latn-nya_Latn', + 'bem_Latn-run_Latn', 'bem_Latn-sna_Latn', 'bem_Latn-som_Latn', + 'bem_Latn-sot_Latn', 'bem_Latn-ssw_Latn', 'bem_Latn-swh_Latn', + 'bem_Latn-tir_Ethi', 'bem_Latn-tsn_Latn', 'bem_Latn-tso_Latn', + 'bem_Latn-tum_Latn', 'bem_Latn-twi_Latn', 'bem_Latn-umb_Latn', + 'bem_Latn-wol_Latn', 'bem_Latn-xho_Latn', 'bem_Latn-yor_Latn', + 'bem_Latn-zul_Latn', 'ben_Beng-bho_Deva', 'ben_Beng-eng_Latn', + 'ben_Beng-guj_Gujr', 'ben_Beng-hin_Deva', 'ben_Beng-hne_Deva', + 'ben_Beng-kan_Knda', 'ben_Beng-kas_Arab', 'ben_Beng-kas_Deva', + 'ben_Beng-mag_Deva', 'ben_Beng-mai_Deva', 'ben_Beng-mal_Mlym', + 'ben_Beng-mar_Deva', 'ben_Beng-npi_Deva', 'ben_Beng-ory_Orya', + 'ben_Beng-pan_Guru', 'ben_Beng-pbt_Arab', 'ben_Beng-san_Deva', + 'ben_Beng-sat_Beng', 'ben_Beng-sin_Sinh', 'ben_Beng-snd_Arab', + 'ben_Beng-tam_Taml', 'ben_Beng-tel_Telu', 'ben_Beng-urd_Arab', + 'bho_Deva-eng_Latn', 'bho_Deva-guj_Gujr', 'bho_Deva-hin_Deva', + 'bho_Deva-hne_Deva', 'bho_Deva-kan_Knda', 'bho_Deva-kas_Arab', + 'bho_Deva-kas_Deva', 'bho_Deva-mag_Deva', 'bho_Deva-mai_Deva', + 'bho_Deva-mal_Mlym', 'bho_Deva-mar_Deva', 'bho_Deva-npi_Deva', + 'bho_Deva-ory_Orya', 'bho_Deva-pan_Guru', 'bho_Deva-san_Deva', + 'bho_Deva-sat_Beng', 'bho_Deva-sin_Sinh', 'bho_Deva-snd_Arab', + 'bho_Deva-tam_Taml', 'bho_Deva-tel_Telu', 'bho_Deva-urd_Arab', + 'bjn_Latn-bug_Latn', 'bjn_Latn-ceb_Latn', 'bjn_Latn-eng_Latn', + 'bjn_Latn-fij_Latn', 'bjn_Latn-ilo_Latn', 'bjn_Latn-ind_Latn', + 'bjn_Latn-jav_Latn', 'bjn_Latn-min_Latn', 'bjn_Latn-mri_Latn', + 'bjn_Latn-pag_Latn', 'bjn_Latn-plt_Latn', 'bjn_Latn-smo_Latn', + 'bjn_Latn-sun_Latn', 'bjn_Latn-war_Latn', 'bod_Tibt-eng_Latn', + 'bos_Latn-eng_Latn', 'bug_Latn-ceb_Latn', 'bug_Latn-eng_Latn', + 'bug_Latn-fij_Latn', 'bug_Latn-ilo_Latn', 'bug_Latn-jav_Latn', + 'bug_Latn-min_Latn', 'bug_Latn-mri_Latn', 'bug_Latn-pag_Latn', + 'bug_Latn-plt_Latn', 'bug_Latn-smo_Latn', 'bug_Latn-sun_Latn', + 'bug_Latn-war_Latn', 'ceb_Latn-eng_Latn', 'ceb_Latn-fij_Latn', + 'ceb_Latn-ilo_Latn', 'ceb_Latn-jav_Latn', 'ceb_Latn-min_Latn', + 'ceb_Latn-mri_Latn', 'ceb_Latn-pag_Latn', 'ceb_Latn-plt_Latn', + 'ceb_Latn-smo_Latn', 'ceb_Latn-sun_Latn', 'ceb_Latn-war_Latn', + 'cjk_Latn-dik_Latn', 'cjk_Latn-dyu_Latn', 'cjk_Latn-eng_Latn', + 'cjk_Latn-ewe_Latn', 'cjk_Latn-fon_Latn', 'cjk_Latn-fra_Latn', + 'cjk_Latn-fuv_Latn', 'cjk_Latn-gaz_Latn', 'cjk_Latn-hau_Latn', + 'cjk_Latn-ibo_Latn', 'cjk_Latn-kam_Latn', 'cjk_Latn-kik_Latn', + 'cjk_Latn-kin_Latn', 'cjk_Latn-kmb_Latn', 'cjk_Latn-knc_Arab', + 
'cjk_Latn-knc_Latn', 'cjk_Latn-kon_Latn', 'cjk_Latn-lin_Latn', + 'cjk_Latn-lua_Latn', 'cjk_Latn-lug_Latn', 'cjk_Latn-luo_Latn', + 'cjk_Latn-nso_Latn', 'cjk_Latn-nus_Latn', 'cjk_Latn-nya_Latn', + 'cjk_Latn-por_Latn', 'cjk_Latn-run_Latn', 'cjk_Latn-sna_Latn', + 'cjk_Latn-som_Latn', 'cjk_Latn-sot_Latn', 'cjk_Latn-ssw_Latn', + 'cjk_Latn-swh_Latn', 'cjk_Latn-tir_Ethi', 'cjk_Latn-tsn_Latn', + 'cjk_Latn-tso_Latn', 'cjk_Latn-tum_Latn', 'cjk_Latn-twi_Latn', + 'cjk_Latn-umb_Latn', 'cjk_Latn-wol_Latn', 'cjk_Latn-xho_Latn', + 'cjk_Latn-yor_Latn', 'cjk_Latn-zul_Latn', 'ckb_Arab-diq_Latn', + 'ckb_Arab-eng_Latn', 'ckb_Arab-kmr_Latn', 'ckb_Arab-pbt_Arab', + 'ckb_Arab-prs_Arab', 'ckb_Arab-tgk_Cyrl', 'crh_Latn-eng_Latn', + 'crh_Latn-kir_Cyrl', 'crh_Latn-rus_Cyrl', 'crh_Latn-tat_Cyrl', + 'crh_Latn-tuk_Latn', 'crh_Latn-uig_Arab', 'crh_Latn-uzn_Latn', + 'cym_Latn-eng_Latn', 'dik_Latn-dyu_Latn', 'dik_Latn-eng_Latn', + 'dik_Latn-ewe_Latn', 'dik_Latn-fon_Latn', 'dik_Latn-fra_Latn', + 'dik_Latn-fuv_Latn', 'dik_Latn-gaz_Latn', 'dik_Latn-hau_Latn', + 'dik_Latn-ibo_Latn', 'dik_Latn-kam_Latn', 'dik_Latn-kik_Latn', + 'dik_Latn-kin_Latn', 'dik_Latn-kmb_Latn', 'dik_Latn-knc_Arab', + 'dik_Latn-knc_Latn', 'dik_Latn-kon_Latn', 'dik_Latn-lin_Latn', + 'dik_Latn-lua_Latn', 'dik_Latn-lug_Latn', 'dik_Latn-luo_Latn', + 'dik_Latn-nso_Latn', 'dik_Latn-nus_Latn', 'dik_Latn-nya_Latn', + 'dik_Latn-run_Latn', 'dik_Latn-sna_Latn', 'dik_Latn-som_Latn', + 'dik_Latn-sot_Latn', 'dik_Latn-ssw_Latn', 'dik_Latn-swh_Latn', + 'dik_Latn-tir_Ethi', 'dik_Latn-tsn_Latn', 'dik_Latn-tso_Latn', + 'dik_Latn-tum_Latn', 'dik_Latn-twi_Latn', 'dik_Latn-umb_Latn', + 'dik_Latn-wol_Latn', 'dik_Latn-xho_Latn', 'dik_Latn-yor_Latn', + 'dik_Latn-zul_Latn', 'diq_Latn-eng_Latn', 'diq_Latn-kmr_Latn', + 'diq_Latn-pbt_Arab', 'diq_Latn-prs_Arab', 'diq_Latn-tgk_Cyrl', + 'dyu_Latn-eng_Latn', 'dyu_Latn-ewe_Latn', 'dyu_Latn-fon_Latn', + 'dyu_Latn-fra_Latn', 'dyu_Latn-fuv_Latn', 'dyu_Latn-gaz_Latn', + 'dyu_Latn-hau_Latn', 'dyu_Latn-ibo_Latn', 'dyu_Latn-kam_Latn', + 'dyu_Latn-kik_Latn', 'dyu_Latn-kin_Latn', 'dyu_Latn-kmb_Latn', + 'dyu_Latn-knc_Arab', 'dyu_Latn-knc_Latn', 'dyu_Latn-kon_Latn', + 'dyu_Latn-lin_Latn', 'dyu_Latn-lua_Latn', 'dyu_Latn-lug_Latn', + 'dyu_Latn-luo_Latn', 'dyu_Latn-nso_Latn', 'dyu_Latn-nus_Latn', + 'dyu_Latn-nya_Latn', 'dyu_Latn-run_Latn', 'dyu_Latn-sna_Latn', + 'dyu_Latn-som_Latn', 'dyu_Latn-sot_Latn', 'dyu_Latn-ssw_Latn', + 'dyu_Latn-swh_Latn', 'dyu_Latn-tir_Ethi', 'dyu_Latn-tsn_Latn', + 'dyu_Latn-tso_Latn', 'dyu_Latn-tum_Latn', 'dyu_Latn-twi_Latn', + 'dyu_Latn-umb_Latn', 'dyu_Latn-wol_Latn', 'dyu_Latn-xho_Latn', + 'dyu_Latn-yor_Latn', 'dyu_Latn-zul_Latn', 'dzo_Tibt-eng_Latn', + 'eng_Latn-als_Latn', 'eng_Latn-epo_Latn', 'eng_Latn-ewe_Latn', + 'eng_Latn-fao_Latn', 'eng_Latn-fij_Latn', 'eng_Latn-fon_Latn', + 'eng_Latn-fur_Latn', 'eng_Latn-fuv_Latn', 'eng_Latn-gaz_Latn', + 'eng_Latn-gla_Latn', 'eng_Latn-gle_Latn', 'eng_Latn-grn_Latn', + 'eng_Latn-guj_Gujr', 'eng_Latn-hat_Latn', 'eng_Latn-hau_Latn', + 'eng_Latn-hin_Deva', 'eng_Latn-hne_Deva', 'eng_Latn-hye_Armn', + 'eng_Latn-ibo_Latn', 'eng_Latn-ilo_Latn', 'eng_Latn-jav_Latn', + 'eng_Latn-kab_Latn', 'eng_Latn-kac_Latn', 'eng_Latn-kam_Latn', + 'eng_Latn-kan_Knda', 'eng_Latn-kas_Arab', 'eng_Latn-kas_Deva', + 'eng_Latn-kat_Geor', 'eng_Latn-kaz_Cyrl', 'eng_Latn-kbp_Latn', + 'eng_Latn-kea_Latn', 'eng_Latn-khk_Cyrl', 'eng_Latn-khm_Khmr', + 'eng_Latn-kik_Latn', 'eng_Latn-kin_Latn', 'eng_Latn-kir_Cyrl', + 'eng_Latn-kmb_Latn', 'eng_Latn-kmr_Latn', 'eng_Latn-knc_Arab', + 'eng_Latn-knc_Latn', 'eng_Latn-kon_Latn', 
'eng_Latn-lao_Laoo', + 'eng_Latn-lij_Latn', 'eng_Latn-lim_Latn', 'eng_Latn-lin_Latn', + 'eng_Latn-lmo_Latn', 'eng_Latn-ltg_Latn', 'eng_Latn-ltz_Latn', + 'eng_Latn-lua_Latn', 'eng_Latn-lug_Latn', 'eng_Latn-luo_Latn', + 'eng_Latn-lus_Latn', 'eng_Latn-mag_Deva', 'eng_Latn-mai_Deva', + 'eng_Latn-mal_Mlym', 'eng_Latn-mar_Deva', 'eng_Latn-min_Latn', + 'eng_Latn-mlt_Latn', 'eng_Latn-mni_Beng', 'eng_Latn-mos_Latn', + 'eng_Latn-mri_Latn', 'eng_Latn-mya_Mymr', 'eng_Latn-npi_Deva', + 'eng_Latn-nso_Latn', 'eng_Latn-nus_Latn', 'eng_Latn-nya_Latn', + 'eng_Latn-ory_Orya', 'eng_Latn-pag_Latn', 'eng_Latn-pan_Guru', + 'eng_Latn-pap_Latn', 'eng_Latn-pbt_Arab', 'eng_Latn-plt_Latn', + 'eng_Latn-prs_Arab', 'eng_Latn-quy_Latn', 'eng_Latn-run_Latn', + 'eng_Latn-sag_Latn', 'eng_Latn-san_Deva', 'eng_Latn-sat_Beng', + 'eng_Latn-scn_Latn', 'eng_Latn-shn_Mymr', 'eng_Latn-sin_Sinh', + 'eng_Latn-smo_Latn', 'eng_Latn-sna_Latn', 'eng_Latn-snd_Arab', + 'eng_Latn-som_Latn', 'eng_Latn-sot_Latn', 'eng_Latn-srd_Latn', + 'eng_Latn-ssw_Latn', 'eng_Latn-sun_Latn', 'eng_Latn-swh_Latn', + 'eng_Latn-szl_Latn', 'eng_Latn-tam_Taml', 'eng_Latn-taq_Latn', + 'eng_Latn-tat_Cyrl', 'eng_Latn-tel_Telu', 'eng_Latn-tgk_Cyrl', + 'eng_Latn-tgl_Latn', 'eng_Latn-tir_Ethi', 'eng_Latn-tpi_Latn', + 'eng_Latn-tsn_Latn', 'eng_Latn-tso_Latn', 'eng_Latn-tuk_Latn', + 'eng_Latn-tum_Latn', 'eng_Latn-twi_Latn', 'eng_Latn-tzm_Tfng', + 'eng_Latn-uig_Arab', 'eng_Latn-umb_Latn', 'eng_Latn-urd_Arab', + 'eng_Latn-uzn_Latn', 'eng_Latn-vec_Latn', 'eng_Latn-war_Latn', + 'eng_Latn-wol_Latn', 'eng_Latn-xho_Latn', 'eng_Latn-ydd_Hebr', + 'eng_Latn-yor_Latn', 'eng_Latn-zho_Hant', 'eng_Latn-zsm_Latn', + 'eng_Latn-zul_Latn', 'epo_Latn-fra_Latn', 'ewe_Latn-fon_Latn', + 'ewe_Latn-fra_Latn', 'ewe_Latn-fuv_Latn', 'ewe_Latn-gaz_Latn', + 'ewe_Latn-hau_Latn', 'ewe_Latn-ibo_Latn', 'ewe_Latn-kam_Latn', + 'ewe_Latn-kik_Latn', 'ewe_Latn-kin_Latn', 'ewe_Latn-kmb_Latn', + 'ewe_Latn-knc_Arab', 'ewe_Latn-knc_Latn', 'ewe_Latn-kon_Latn', + 'ewe_Latn-lin_Latn', 'ewe_Latn-lua_Latn', 'ewe_Latn-lug_Latn', + 'ewe_Latn-luo_Latn', 'ewe_Latn-nso_Latn', 'ewe_Latn-nus_Latn', + 'ewe_Latn-nya_Latn', 'ewe_Latn-run_Latn', 'ewe_Latn-sna_Latn', + 'ewe_Latn-som_Latn', 'ewe_Latn-sot_Latn', 'ewe_Latn-ssw_Latn', + 'ewe_Latn-swh_Latn', 'ewe_Latn-tir_Ethi', 'ewe_Latn-tsn_Latn', + 'ewe_Latn-tso_Latn', 'ewe_Latn-tum_Latn', 'ewe_Latn-twi_Latn', + 'ewe_Latn-umb_Latn', 'ewe_Latn-wol_Latn', 'ewe_Latn-xho_Latn', + 'ewe_Latn-yor_Latn', 'ewe_Latn-zul_Latn', 'fij_Latn-hin_Deva', + 'fij_Latn-ilo_Latn', 'fij_Latn-jav_Latn', 'fij_Latn-min_Latn', + 'fij_Latn-mri_Latn', 'fij_Latn-pag_Latn', 'fij_Latn-plt_Latn', + 'fij_Latn-smo_Latn', 'fij_Latn-sun_Latn', 'fij_Latn-war_Latn', + 'fon_Latn-fra_Latn', 'fon_Latn-fuv_Latn', 'fon_Latn-gaz_Latn', + 'fon_Latn-hau_Latn', 'fon_Latn-ibo_Latn', 'fon_Latn-kam_Latn', + 'fon_Latn-kik_Latn', 'fon_Latn-kin_Latn', 'fon_Latn-kmb_Latn', + 'fon_Latn-knc_Arab', 'fon_Latn-knc_Latn', 'fon_Latn-kon_Latn', + 'fon_Latn-lin_Latn', 'fon_Latn-lua_Latn', 'fon_Latn-lug_Latn', + 'fon_Latn-luo_Latn', 'fon_Latn-nso_Latn', 'fon_Latn-nus_Latn', + 'fon_Latn-nya_Latn', 'fon_Latn-run_Latn', 'fon_Latn-sna_Latn', + 'fon_Latn-som_Latn', 'fon_Latn-sot_Latn', 'fon_Latn-ssw_Latn', + 'fon_Latn-swh_Latn', 'fon_Latn-tir_Ethi', 'fon_Latn-tsn_Latn', + 'fon_Latn-tso_Latn', 'fon_Latn-tum_Latn', 'fon_Latn-twi_Latn', + 'fon_Latn-umb_Latn', 'fon_Latn-wol_Latn', 'fon_Latn-xho_Latn', + 'fon_Latn-yor_Latn', 'fon_Latn-zul_Latn', 'fra_Latn-fuv_Latn', + 'fra_Latn-gaz_Latn', 'fra_Latn-glg_Latn', 'fra_Latn-hat_Latn', + 'fra_Latn-hau_Latn', 
'fra_Latn-ibo_Latn', 'fra_Latn-kab_Latn', + 'fra_Latn-kam_Latn', 'fra_Latn-kik_Latn', 'fra_Latn-kin_Latn', + 'fra_Latn-kmb_Latn', 'fra_Latn-knc_Arab', 'fra_Latn-knc_Latn', + 'fra_Latn-kon_Latn', 'fra_Latn-lin_Latn', 'fra_Latn-ltz_Latn', + 'fra_Latn-lua_Latn', 'fra_Latn-lug_Latn', 'fra_Latn-luo_Latn', + 'fra_Latn-nso_Latn', 'fra_Latn-nus_Latn', 'fra_Latn-nya_Latn', + 'fra_Latn-oci_Latn', 'fra_Latn-plt_Latn', 'fra_Latn-run_Latn', + 'fra_Latn-sag_Latn', 'fra_Latn-scn_Latn', 'fra_Latn-sna_Latn', + 'fra_Latn-som_Latn', 'fra_Latn-sot_Latn', 'fra_Latn-ssw_Latn', + 'fra_Latn-swh_Latn', 'fra_Latn-tir_Ethi', 'fra_Latn-tsn_Latn', + 'fra_Latn-tso_Latn', 'fra_Latn-tum_Latn', 'fra_Latn-twi_Latn', + 'fra_Latn-tzm_Tfng', 'fra_Latn-umb_Latn', 'fra_Latn-wol_Latn', + 'fra_Latn-xho_Latn', 'fra_Latn-yor_Latn', 'fra_Latn-zul_Latn', + 'fuv_Latn-gaz_Latn', 'fuv_Latn-hau_Latn', 'fuv_Latn-ibo_Latn', + 'fuv_Latn-kam_Latn', 'fuv_Latn-kik_Latn', 'fuv_Latn-kin_Latn', + 'fuv_Latn-kmb_Latn', 'fuv_Latn-knc_Arab', 'fuv_Latn-knc_Latn', + 'fuv_Latn-kon_Latn', 'fuv_Latn-lin_Latn', 'fuv_Latn-lua_Latn', + 'fuv_Latn-lug_Latn', 'fuv_Latn-luo_Latn', 'fuv_Latn-nso_Latn', + 'fuv_Latn-nus_Latn', 'fuv_Latn-nya_Latn', 'fuv_Latn-run_Latn', + 'fuv_Latn-sna_Latn', 'fuv_Latn-som_Latn', 'fuv_Latn-sot_Latn', + 'fuv_Latn-ssw_Latn', 'fuv_Latn-swh_Latn', 'fuv_Latn-tir_Ethi', + 'fuv_Latn-tsn_Latn', 'fuv_Latn-tso_Latn', 'fuv_Latn-tum_Latn', + 'fuv_Latn-twi_Latn', 'fuv_Latn-umb_Latn', 'fuv_Latn-wol_Latn', + 'fuv_Latn-xho_Latn', 'fuv_Latn-yor_Latn', 'fuv_Latn-zul_Latn', + 'gaz_Latn-run_Latn', 'gaz_Latn-sna_Latn', 'gaz_Latn-som_Latn', + 'gaz_Latn-sot_Latn', 'gaz_Latn-ssw_Latn', 'gaz_Latn-swh_Latn', + 'gaz_Latn-tir_Ethi', 'gaz_Latn-tsn_Latn', 'gaz_Latn-tso_Latn', + 'gaz_Latn-tum_Latn', 'gaz_Latn-twi_Latn', 'gaz_Latn-umb_Latn', + 'gaz_Latn-wol_Latn', 'gaz_Latn-xho_Latn', 'gaz_Latn-yor_Latn', + 'gaz_Latn-zul_Latn', 'glg_Latn-por_Latn', 'grn_Latn-por_Latn', + 'guj_Gujr-hin_Deva', 'guj_Gujr-hne_Deva', 'guj_Gujr-kan_Knda', + 'guj_Gujr-kas_Arab', 'guj_Gujr-kas_Deva', 'guj_Gujr-mag_Deva', + 'guj_Gujr-mai_Deva', 'guj_Gujr-mal_Mlym', 'guj_Gujr-mar_Deva', + 'guj_Gujr-npi_Deva', 'guj_Gujr-ory_Orya', 'guj_Gujr-pan_Guru', + 'guj_Gujr-san_Deva', 'guj_Gujr-sat_Beng', 'guj_Gujr-sin_Sinh', + 'guj_Gujr-snd_Arab', 'guj_Gujr-tam_Taml', 'guj_Gujr-tel_Telu', + 'guj_Gujr-urd_Arab', 'hau_Latn-gaz_Latn', 'hau_Latn-ibo_Latn', + 'hau_Latn-kam_Latn', 'hau_Latn-kik_Latn', 'hau_Latn-kin_Latn', + 'hau_Latn-kmb_Latn', 'hau_Latn-knc_Arab', 'hau_Latn-knc_Latn', + 'hau_Latn-kon_Latn', 'hau_Latn-lin_Latn', 'hau_Latn-lua_Latn', + 'hau_Latn-lug_Latn', 'hau_Latn-luo_Latn', 'hau_Latn-nso_Latn', + 'hau_Latn-nus_Latn', 'hau_Latn-nya_Latn', 'hau_Latn-run_Latn', + 'hau_Latn-sna_Latn', 'hau_Latn-som_Latn', 'hau_Latn-sot_Latn', + 'hau_Latn-ssw_Latn', 'hau_Latn-swh_Latn', 'hau_Latn-tir_Ethi', + 'hau_Latn-tsn_Latn', 'hau_Latn-tso_Latn', 'hau_Latn-tum_Latn', + 'hau_Latn-twi_Latn', 'hau_Latn-umb_Latn', 'hau_Latn-wol_Latn', + 'hau_Latn-xho_Latn', 'hau_Latn-yor_Latn', 'hau_Latn-zul_Latn', + 'hin_Deva-hne_Deva', 'hin_Deva-kan_Knda', 'hin_Deva-kas_Arab', + 'hin_Deva-kas_Deva', 'hin_Deva-mag_Deva', 'hin_Deva-mai_Deva', + 'hin_Deva-mal_Mlym', 'hin_Deva-mar_Deva', 'hin_Deva-npi_Deva', + 'hin_Deva-ory_Orya', 'hin_Deva-pan_Guru', 'hin_Deva-pbt_Arab', + 'hin_Deva-san_Deva', 'hin_Deva-sat_Beng', 'hin_Deva-sin_Sinh', + 'hin_Deva-snd_Arab', 'hin_Deva-tam_Taml', 'hin_Deva-tel_Telu', + 'hin_Deva-urd_Arab', 'hne_Deva-kan_Knda', 'hne_Deva-kas_Arab', + 'hne_Deva-kas_Deva', 'hne_Deva-mag_Deva', 'hne_Deva-mai_Deva', + 
'hne_Deva-mal_Mlym', 'hne_Deva-mar_Deva', 'hne_Deva-npi_Deva', + 'hne_Deva-ory_Orya', 'hne_Deva-pan_Guru', 'hne_Deva-san_Deva', + 'hne_Deva-sat_Beng', 'hne_Deva-sin_Sinh', 'hne_Deva-snd_Arab', + 'hne_Deva-tam_Taml', 'hne_Deva-tel_Telu', 'hne_Deva-urd_Arab', + 'hye_Armn-rus_Cyrl', 'ibo_Latn-gaz_Latn', 'ibo_Latn-kam_Latn', + 'ibo_Latn-kik_Latn', 'ibo_Latn-kin_Latn', 'ibo_Latn-kmb_Latn', + 'ibo_Latn-knc_Arab', 'ibo_Latn-knc_Latn', 'ibo_Latn-kon_Latn', + 'ibo_Latn-lin_Latn', 'ibo_Latn-lua_Latn', 'ibo_Latn-lug_Latn', + 'ibo_Latn-luo_Latn', 'ibo_Latn-nso_Latn', 'ibo_Latn-nus_Latn', + 'ibo_Latn-nya_Latn', 'ibo_Latn-run_Latn', 'ibo_Latn-sna_Latn', + 'ibo_Latn-som_Latn', 'ibo_Latn-sot_Latn', 'ibo_Latn-ssw_Latn', + 'ibo_Latn-swh_Latn', 'ibo_Latn-tir_Ethi', 'ibo_Latn-tsn_Latn', + 'ibo_Latn-tso_Latn', 'ibo_Latn-tum_Latn', 'ibo_Latn-twi_Latn', + 'ibo_Latn-umb_Latn', 'ibo_Latn-wol_Latn', 'ibo_Latn-xho_Latn', + 'ibo_Latn-yor_Latn', 'ibo_Latn-zul_Latn', 'ilo_Latn-jav_Latn', + 'ilo_Latn-min_Latn', 'ilo_Latn-mri_Latn', 'ilo_Latn-pag_Latn', + 'ilo_Latn-plt_Latn', 'ilo_Latn-smo_Latn', 'ilo_Latn-sun_Latn', + 'ilo_Latn-war_Latn', 'ind_Latn-ace_Latn', 'ind_Latn-ban_Latn', + 'ind_Latn-jav_Latn', 'ind_Latn-khm_Khmr', 'ind_Latn-lao_Laoo', + 'ind_Latn-min_Latn', 'ind_Latn-mya_Mymr', 'ind_Latn-shn_Mymr', + 'ind_Latn-sun_Latn', 'jav_Latn-min_Latn', 'jav_Latn-mri_Latn', + 'jav_Latn-pag_Latn', 'jav_Latn-plt_Latn', 'jav_Latn-smo_Latn', + 'jav_Latn-sun_Latn', 'jav_Latn-war_Latn', 'kam_Latn-gaz_Latn', + 'kam_Latn-kik_Latn', 'kam_Latn-kin_Latn', 'kam_Latn-kmb_Latn', + 'kam_Latn-knc_Arab', 'kam_Latn-knc_Latn', 'kam_Latn-kon_Latn', + 'kam_Latn-lin_Latn', 'kam_Latn-lua_Latn', 'kam_Latn-lug_Latn', + 'kam_Latn-luo_Latn', 'kam_Latn-nso_Latn', 'kam_Latn-nus_Latn', + 'kam_Latn-nya_Latn', 'kam_Latn-run_Latn', 'kam_Latn-sna_Latn', + 'kam_Latn-som_Latn', 'kam_Latn-sot_Latn', 'kam_Latn-ssw_Latn', + 'kam_Latn-swh_Latn', 'kam_Latn-tir_Ethi', 'kam_Latn-tsn_Latn', + 'kam_Latn-tso_Latn', 'kam_Latn-tum_Latn', 'kam_Latn-twi_Latn', + 'kam_Latn-umb_Latn', 'kam_Latn-wol_Latn', 'kam_Latn-xho_Latn', + 'kam_Latn-yor_Latn', 'kam_Latn-zul_Latn', 'kan_Knda-kas_Arab', + 'kan_Knda-kas_Deva', 'kan_Knda-mag_Deva', 'kan_Knda-mai_Deva', + 'kan_Knda-mal_Mlym', 'kan_Knda-mar_Deva', 'kan_Knda-npi_Deva', + 'kan_Knda-ory_Orya', 'kan_Knda-pan_Guru', 'kan_Knda-san_Deva', + 'kan_Knda-sat_Beng', 'kan_Knda-sin_Sinh', 'kan_Knda-snd_Arab', + 'kan_Knda-tam_Taml', 'kan_Knda-tel_Telu', 'kan_Knda-urd_Arab', + 'kas_Arab-kas_Deva', 'kas_Arab-mag_Deva', 'kas_Arab-mai_Deva', + 'kas_Arab-mal_Mlym', 'kas_Arab-mar_Deva', 'kas_Arab-npi_Deva', + 'kas_Arab-ory_Orya', 'kas_Arab-pan_Guru', 'kas_Arab-san_Deva', + 'kas_Arab-sat_Beng', 'kas_Arab-sin_Sinh', 'kas_Arab-snd_Arab', + 'kas_Arab-tam_Taml', 'kas_Arab-tel_Telu', 'kas_Arab-urd_Arab', + 'kas_Deva-mag_Deva', 'kas_Deva-mai_Deva', 'kas_Deva-mal_Mlym', + 'kas_Deva-mar_Deva', 'kas_Deva-npi_Deva', 'kas_Deva-ory_Orya', + 'kas_Deva-pan_Guru', 'kas_Deva-san_Deva', 'kas_Deva-sat_Beng', + 'kas_Deva-sin_Sinh', 'kas_Deva-snd_Arab', 'kas_Deva-tam_Taml', + 'kas_Deva-tel_Telu', 'kas_Deva-urd_Arab', 'kat_Geor-rus_Cyrl', + 'kea_Latn-por_Latn', 'kik_Latn-gaz_Latn', 'kik_Latn-kin_Latn', + 'kik_Latn-kmb_Latn', 'kik_Latn-kon_Latn', 'kik_Latn-lin_Latn', + 'kik_Latn-lua_Latn', 'kik_Latn-lug_Latn', 'kik_Latn-luo_Latn', + 'kik_Latn-nso_Latn', 'kik_Latn-nus_Latn', 'kik_Latn-nya_Latn', + 'kik_Latn-run_Latn', 'kik_Latn-sna_Latn', 'kik_Latn-som_Latn', + 'kik_Latn-sot_Latn', 'kik_Latn-ssw_Latn', 'kik_Latn-swh_Latn', + 'kik_Latn-tir_Ethi', 'kik_Latn-tsn_Latn', 
'kik_Latn-tso_Latn', + 'kik_Latn-tum_Latn', 'kik_Latn-twi_Latn', 'kik_Latn-umb_Latn', + 'kik_Latn-wol_Latn', 'kik_Latn-xho_Latn', 'kik_Latn-yor_Latn', + 'kik_Latn-zul_Latn', 'kin_Latn-gaz_Latn', 'kin_Latn-kmb_Latn', + 'kin_Latn-kon_Latn', 'kin_Latn-lin_Latn', 'kin_Latn-lua_Latn', + 'kin_Latn-lug_Latn', 'kin_Latn-luo_Latn', 'kin_Latn-nso_Latn', + 'kin_Latn-nus_Latn', 'kin_Latn-nya_Latn', 'kin_Latn-run_Latn', + 'kin_Latn-sna_Latn', 'kin_Latn-som_Latn', 'kin_Latn-sot_Latn', + 'kin_Latn-ssw_Latn', 'kin_Latn-swh_Latn', 'kin_Latn-tir_Ethi', + 'kin_Latn-tsn_Latn', 'kin_Latn-tso_Latn', 'kin_Latn-tum_Latn', + 'kin_Latn-twi_Latn', 'kin_Latn-umb_Latn', 'kin_Latn-wol_Latn', + 'kin_Latn-xho_Latn', 'kin_Latn-yor_Latn', 'kin_Latn-zul_Latn', + 'kir_Cyrl-rus_Cyrl', 'kir_Cyrl-tat_Cyrl', 'kir_Cyrl-tuk_Latn', + 'kir_Cyrl-uig_Arab', 'kir_Cyrl-uzn_Latn', 'kmb_Latn-gaz_Latn', + 'kmb_Latn-kon_Latn', 'kmb_Latn-lin_Latn', 'kmb_Latn-lua_Latn', + 'kmb_Latn-lug_Latn', 'kmb_Latn-luo_Latn', 'kmb_Latn-nso_Latn', + 'kmb_Latn-nus_Latn', 'kmb_Latn-nya_Latn', 'kmb_Latn-por_Latn', + 'kmb_Latn-run_Latn', 'kmb_Latn-sna_Latn', 'kmb_Latn-som_Latn', + 'kmb_Latn-sot_Latn', 'kmb_Latn-ssw_Latn', 'kmb_Latn-swh_Latn', + 'kmb_Latn-tir_Ethi', 'kmb_Latn-tsn_Latn', 'kmb_Latn-tso_Latn', + 'kmb_Latn-tum_Latn', 'kmb_Latn-twi_Latn', 'kmb_Latn-umb_Latn', + 'kmb_Latn-wol_Latn', 'kmb_Latn-xho_Latn', 'kmb_Latn-yor_Latn', + 'kmb_Latn-zul_Latn', 'kmr_Latn-pbt_Arab', 'kmr_Latn-prs_Arab', + 'kmr_Latn-tgk_Cyrl', 'knc_Arab-gaz_Latn', 'knc_Arab-kik_Latn', + 'knc_Arab-kin_Latn', 'knc_Arab-kmb_Latn', 'knc_Arab-knc_Latn', + 'knc_Arab-kon_Latn', 'knc_Arab-lin_Latn', 'knc_Arab-lua_Latn', + 'knc_Arab-lug_Latn', 'knc_Arab-luo_Latn', 'knc_Arab-nso_Latn', + 'knc_Arab-nus_Latn', 'knc_Arab-nya_Latn', 'knc_Arab-run_Latn', + 'knc_Arab-sna_Latn', 'knc_Arab-som_Latn', 'knc_Arab-sot_Latn', + 'knc_Arab-ssw_Latn', 'knc_Arab-swh_Latn', 'knc_Arab-tir_Ethi', + 'knc_Arab-tsn_Latn', 'knc_Arab-tso_Latn', 'knc_Arab-tum_Latn', + 'knc_Arab-twi_Latn', 'knc_Arab-umb_Latn', 'knc_Arab-wol_Latn', + 'knc_Arab-xho_Latn', 'knc_Arab-yor_Latn', 'knc_Arab-zul_Latn', + 'knc_Latn-gaz_Latn', 'knc_Latn-kik_Latn', 'knc_Latn-kin_Latn', + 'knc_Latn-kmb_Latn', 'knc_Latn-kon_Latn', 'knc_Latn-lin_Latn', + 'knc_Latn-lua_Latn', 'knc_Latn-lug_Latn', 'knc_Latn-luo_Latn', + 'knc_Latn-nso_Latn', 'knc_Latn-nus_Latn', 'knc_Latn-nya_Latn', + 'knc_Latn-run_Latn', 'knc_Latn-sna_Latn', 'knc_Latn-som_Latn', + 'knc_Latn-sot_Latn', 'knc_Latn-ssw_Latn', 'knc_Latn-swh_Latn', + 'knc_Latn-tir_Ethi', 'knc_Latn-tsn_Latn', 'knc_Latn-tso_Latn', + 'knc_Latn-tum_Latn', 'knc_Latn-twi_Latn', 'knc_Latn-umb_Latn', + 'knc_Latn-wol_Latn', 'knc_Latn-xho_Latn', 'knc_Latn-yor_Latn', + 'knc_Latn-zul_Latn', 'kon_Latn-gaz_Latn', 'kon_Latn-lin_Latn', + 'kon_Latn-lua_Latn', 'kon_Latn-lug_Latn', 'kon_Latn-luo_Latn', + 'kon_Latn-nso_Latn', 'kon_Latn-nus_Latn', 'kon_Latn-nya_Latn', + 'kon_Latn-run_Latn', 'kon_Latn-sna_Latn', 'kon_Latn-som_Latn', + 'kon_Latn-sot_Latn', 'kon_Latn-ssw_Latn', 'kon_Latn-swh_Latn', + 'kon_Latn-tir_Ethi', 'kon_Latn-tsn_Latn', 'kon_Latn-tso_Latn', + 'kon_Latn-tum_Latn', 'kon_Latn-twi_Latn', 'kon_Latn-umb_Latn', + 'kon_Latn-wol_Latn', 'kon_Latn-xho_Latn', 'kon_Latn-yor_Latn', + 'kon_Latn-zul_Latn', 'lao_Laoo-rus_Cyrl', 'lin_Latn-gaz_Latn', + 'lin_Latn-lua_Latn', 'lin_Latn-lug_Latn', 'lin_Latn-luo_Latn', + 'lin_Latn-nso_Latn', 'lin_Latn-nus_Latn', 'lin_Latn-nya_Latn', + 'lin_Latn-run_Latn', 'lin_Latn-sna_Latn', 'lin_Latn-som_Latn', + 'lin_Latn-sot_Latn', 'lin_Latn-ssw_Latn', 'lin_Latn-swh_Latn', + 'lin_Latn-tir_Ethi', 
'lin_Latn-tsn_Latn', 'lin_Latn-tso_Latn', + 'lin_Latn-tum_Latn', 'lin_Latn-twi_Latn', 'lin_Latn-umb_Latn', + 'lin_Latn-wol_Latn', 'lin_Latn-xho_Latn', 'lin_Latn-yor_Latn', + 'lin_Latn-zul_Latn', 'ltg_Latn-rus_Cyrl', 'lua_Latn-gaz_Latn', + 'lua_Latn-lug_Latn', 'lua_Latn-luo_Latn', 'lua_Latn-nso_Latn', + 'lua_Latn-nus_Latn', 'lua_Latn-nya_Latn', 'lua_Latn-run_Latn', + 'lua_Latn-sna_Latn', 'lua_Latn-som_Latn', 'lua_Latn-sot_Latn', + 'lua_Latn-ssw_Latn', 'lua_Latn-swh_Latn', 'lua_Latn-tir_Ethi', + 'lua_Latn-tsn_Latn', 'lua_Latn-tso_Latn', 'lua_Latn-tum_Latn', + 'lua_Latn-twi_Latn', 'lua_Latn-umb_Latn', 'lua_Latn-wol_Latn', + 'lua_Latn-xho_Latn', 'lua_Latn-yor_Latn', 'lua_Latn-zul_Latn', + 'lug_Latn-gaz_Latn', 'lug_Latn-luo_Latn', 'lug_Latn-nso_Latn', + 'lug_Latn-nus_Latn', 'lug_Latn-nya_Latn', 'lug_Latn-run_Latn', + 'lug_Latn-sna_Latn', 'lug_Latn-som_Latn', 'lug_Latn-sot_Latn', + 'lug_Latn-ssw_Latn', 'lug_Latn-swh_Latn', 'lug_Latn-tir_Ethi', + 'lug_Latn-tsn_Latn', 'lug_Latn-tso_Latn', 'lug_Latn-tum_Latn', + 'lug_Latn-twi_Latn', 'lug_Latn-umb_Latn', 'lug_Latn-wol_Latn', + 'lug_Latn-xho_Latn', 'lug_Latn-yor_Latn', 'lug_Latn-zul_Latn', + 'luo_Latn-gaz_Latn', 'luo_Latn-nso_Latn', 'luo_Latn-nus_Latn', + 'luo_Latn-nya_Latn', 'luo_Latn-run_Latn', 'luo_Latn-sna_Latn', + 'luo_Latn-som_Latn', 'luo_Latn-sot_Latn', 'luo_Latn-ssw_Latn', + 'luo_Latn-swh_Latn', 'luo_Latn-tir_Ethi', 'luo_Latn-tsn_Latn', + 'luo_Latn-tso_Latn', 'luo_Latn-tum_Latn', 'luo_Latn-twi_Latn', + 'luo_Latn-umb_Latn', 'luo_Latn-wol_Latn', 'luo_Latn-xho_Latn', + 'luo_Latn-yor_Latn', 'luo_Latn-zul_Latn', 'mag_Deva-mai_Deva', + 'mag_Deva-mal_Mlym', 'mag_Deva-mar_Deva', 'mag_Deva-npi_Deva', + 'mag_Deva-ory_Orya', 'mag_Deva-pan_Guru', 'mag_Deva-san_Deva', + 'mag_Deva-sat_Beng', 'mag_Deva-sin_Sinh', 'mag_Deva-snd_Arab', + 'mag_Deva-tam_Taml', 'mag_Deva-tel_Telu', 'mag_Deva-urd_Arab', + 'mai_Deva-mal_Mlym', 'mai_Deva-mar_Deva', 'mai_Deva-npi_Deva', + 'mai_Deva-ory_Orya', 'mai_Deva-pan_Guru', 'mai_Deva-san_Deva', + 'mai_Deva-sat_Beng', 'mai_Deva-sin_Sinh', 'mai_Deva-snd_Arab', + 'mai_Deva-tam_Taml', 'mai_Deva-tel_Telu', 'mai_Deva-urd_Arab', + 'mal_Mlym-mar_Deva', 'mal_Mlym-npi_Deva', 'mal_Mlym-ory_Orya', + 'mal_Mlym-pan_Guru', 'mal_Mlym-san_Deva', 'mal_Mlym-sat_Beng', + 'mal_Mlym-sin_Sinh', 'mal_Mlym-snd_Arab', 'mal_Mlym-tam_Taml', + 'mal_Mlym-tel_Telu', 'mal_Mlym-urd_Arab', 'mar_Deva-npi_Deva', + 'mar_Deva-ory_Orya', 'mar_Deva-pan_Guru', 'mar_Deva-san_Deva', + 'mar_Deva-sat_Beng', 'mar_Deva-sin_Sinh', 'mar_Deva-snd_Arab', + 'mar_Deva-tam_Taml', 'mar_Deva-tel_Telu', 'mar_Deva-urd_Arab', + 'min_Latn-mri_Latn', 'min_Latn-pag_Latn', 'min_Latn-plt_Latn', + 'min_Latn-smo_Latn', 'min_Latn-sun_Latn', 'min_Latn-war_Latn', + 'mri_Latn-pag_Latn', 'mri_Latn-smo_Latn', 'mri_Latn-sun_Latn', + 'mri_Latn-war_Latn', 'npi_Deva-ory_Orya', 'npi_Deva-pan_Guru', + 'npi_Deva-san_Deva', 'npi_Deva-sat_Beng', 'npi_Deva-sin_Sinh', + 'npi_Deva-snd_Arab', 'npi_Deva-tam_Taml', 'npi_Deva-tel_Telu', + 'npi_Deva-urd_Arab', 'nso_Latn-gaz_Latn', 'nso_Latn-nus_Latn', + 'nso_Latn-nya_Latn', 'nso_Latn-run_Latn', 'nso_Latn-sna_Latn', + 'nso_Latn-som_Latn', 'nso_Latn-sot_Latn', 'nso_Latn-ssw_Latn', + 'nso_Latn-swh_Latn', 'nso_Latn-tir_Ethi', 'nso_Latn-tsn_Latn', + 'nso_Latn-tso_Latn', 'nso_Latn-tum_Latn', 'nso_Latn-twi_Latn', + 'nso_Latn-umb_Latn', 'nso_Latn-wol_Latn', 'nso_Latn-xho_Latn', + 'nso_Latn-yor_Latn', 'nso_Latn-zul_Latn', 'nus_Latn-gaz_Latn', + 'nus_Latn-nya_Latn', 'nus_Latn-run_Latn', 'nus_Latn-sna_Latn', + 'nus_Latn-som_Latn', 'nus_Latn-sot_Latn', 'nus_Latn-ssw_Latn', + 
'nus_Latn-swh_Latn', 'nus_Latn-tir_Ethi', 'nus_Latn-tsn_Latn', + 'nus_Latn-tso_Latn', 'nus_Latn-tum_Latn', 'nus_Latn-twi_Latn', + 'nus_Latn-umb_Latn', 'nus_Latn-wol_Latn', 'nus_Latn-xho_Latn', + 'nus_Latn-yor_Latn', 'nus_Latn-zul_Latn', 'nya_Latn-gaz_Latn', + 'nya_Latn-run_Latn', 'nya_Latn-sna_Latn', 'nya_Latn-som_Latn', + 'nya_Latn-sot_Latn', 'nya_Latn-ssw_Latn', 'nya_Latn-swh_Latn', + 'nya_Latn-tir_Ethi', 'nya_Latn-tsn_Latn', 'nya_Latn-tso_Latn', + 'nya_Latn-tum_Latn', 'nya_Latn-twi_Latn', 'nya_Latn-umb_Latn', + 'nya_Latn-wol_Latn', 'nya_Latn-xho_Latn', 'nya_Latn-yor_Latn', + 'nya_Latn-zul_Latn', 'oci_Latn-por_Latn', 'ory_Orya-pan_Guru', + 'ory_Orya-san_Deva', 'ory_Orya-sat_Beng', 'ory_Orya-sin_Sinh', + 'ory_Orya-snd_Arab', 'ory_Orya-tam_Taml', 'ory_Orya-tel_Telu', + 'ory_Orya-urd_Arab', 'pag_Latn-smo_Latn', 'pag_Latn-sun_Latn', + 'pan_Guru-san_Deva', 'pan_Guru-sat_Beng', 'pan_Guru-sin_Sinh', + 'pan_Guru-snd_Arab', 'pan_Guru-tam_Taml', 'pan_Guru-tel_Telu', + 'pan_Guru-urd_Arab', 'pbt_Arab-tam_Taml', 'pbt_Arab-tgk_Cyrl', + 'plt_Latn-mri_Latn', 'plt_Latn-pag_Latn', 'plt_Latn-smo_Latn', + 'plt_Latn-sun_Latn', 'plt_Latn-war_Latn', 'por_Latn-ayr_Latn', + 'por_Latn-quy_Latn', 'prs_Arab-pbt_Arab', 'prs_Arab-tgk_Cyrl', + 'quy_Latn-spa_Latn', 'run_Latn-sna_Latn', 'run_Latn-som_Latn', + 'run_Latn-sot_Latn', 'run_Latn-ssw_Latn', 'run_Latn-swh_Latn', + 'run_Latn-tir_Ethi', 'run_Latn-tsn_Latn', 'run_Latn-tso_Latn', + 'run_Latn-tum_Latn', 'run_Latn-twi_Latn', 'run_Latn-umb_Latn', + 'run_Latn-wol_Latn', 'run_Latn-xho_Latn', 'run_Latn-yor_Latn', + 'run_Latn-zul_Latn', 'rus_Cyrl-tat_Cyrl', 'rus_Cyrl-tgk_Cyrl', + 'san_Deva-sat_Beng', 'san_Deva-sin_Sinh', 'san_Deva-snd_Arab', + 'san_Deva-tam_Taml', 'san_Deva-tel_Telu', 'san_Deva-urd_Arab', + 'sat_Beng-sin_Sinh', 'sat_Beng-snd_Arab', 'sat_Beng-tam_Taml', + 'sat_Beng-tel_Telu', 'sat_Beng-urd_Arab', 'sin_Sinh-snd_Arab', + 'sin_Sinh-tam_Taml', 'sin_Sinh-tel_Telu', 'sin_Sinh-urd_Arab', + 'smo_Latn-sun_Latn', 'smo_Latn-war_Latn', 'sna_Latn-som_Latn', + 'sna_Latn-sot_Latn', 'sna_Latn-ssw_Latn', 'sna_Latn-swh_Latn', + 'sna_Latn-tir_Ethi', 'sna_Latn-tsn_Latn', 'sna_Latn-tso_Latn', + 'sna_Latn-tum_Latn', 'sna_Latn-twi_Latn', 'sna_Latn-umb_Latn', + 'sna_Latn-wol_Latn', 'sna_Latn-xho_Latn', 'sna_Latn-yor_Latn', + 'sna_Latn-zul_Latn', 'snd_Arab-tam_Taml', 'snd_Arab-tel_Telu', + 'snd_Arab-urd_Arab', 'som_Latn-sot_Latn', 'som_Latn-ssw_Latn', + 'som_Latn-swh_Latn', 'som_Latn-tir_Ethi', 'som_Latn-tsn_Latn', + 'som_Latn-tso_Latn', 'som_Latn-tum_Latn', 'som_Latn-twi_Latn', + 'som_Latn-umb_Latn', 'som_Latn-wol_Latn', 'som_Latn-xho_Latn', + 'som_Latn-yor_Latn', 'som_Latn-zul_Latn', 'sot_Latn-ssw_Latn', + 'sot_Latn-swh_Latn', 'sot_Latn-tir_Ethi', 'sot_Latn-tsn_Latn', + 'sot_Latn-tso_Latn', 'sot_Latn-tum_Latn', 'sot_Latn-twi_Latn', + 'sot_Latn-umb_Latn', 'sot_Latn-wol_Latn', 'sot_Latn-xho_Latn', + 'sot_Latn-yor_Latn', 'sot_Latn-zul_Latn', 'ssw_Latn-swh_Latn', + 'ssw_Latn-tir_Ethi', 'ssw_Latn-tsn_Latn', 'ssw_Latn-tso_Latn', + 'ssw_Latn-tum_Latn', 'ssw_Latn-twi_Latn', 'ssw_Latn-umb_Latn', + 'ssw_Latn-wol_Latn', 'ssw_Latn-xho_Latn', 'ssw_Latn-yor_Latn', + 'ssw_Latn-zul_Latn', 'sun_Latn-war_Latn', 'swh_Latn-tir_Ethi', + 'swh_Latn-tsn_Latn', 'swh_Latn-tso_Latn', 'swh_Latn-tum_Latn', + 'swh_Latn-twi_Latn', 'swh_Latn-umb_Latn', 'swh_Latn-wol_Latn', + 'swh_Latn-xho_Latn', 'swh_Latn-yor_Latn', 'swh_Latn-zul_Latn', + 'tam_Taml-tel_Telu', 'tam_Taml-urd_Arab', 'tat_Cyrl-tuk_Latn', + 'tat_Cyrl-uig_Arab', 'tat_Cyrl-uzn_Latn', 'tel_Telu-urd_Arab', + 'tir_Ethi-tsn_Latn', 'tir_Ethi-tso_Latn', 
'tir_Ethi-tum_Latn', + 'tir_Ethi-twi_Latn', 'tir_Ethi-umb_Latn', 'tir_Ethi-wol_Latn', + 'tir_Ethi-xho_Latn', 'tir_Ethi-yor_Latn', 'tir_Ethi-zul_Latn', + 'tsn_Latn-tso_Latn', 'tsn_Latn-tum_Latn', 'tsn_Latn-twi_Latn', + 'tsn_Latn-umb_Latn', 'tsn_Latn-wol_Latn', 'tsn_Latn-xho_Latn', + 'tsn_Latn-yor_Latn', 'tsn_Latn-zul_Latn', 'tso_Latn-tum_Latn', + 'tso_Latn-twi_Latn', 'tso_Latn-umb_Latn', 'tso_Latn-wol_Latn', + 'tso_Latn-xho_Latn', 'tso_Latn-yor_Latn', 'tso_Latn-zul_Latn', + 'tuk_Latn-uig_Arab', 'tuk_Latn-uzn_Latn', 'tum_Latn-twi_Latn', + 'tum_Latn-umb_Latn', 'tum_Latn-wol_Latn', 'tum_Latn-xho_Latn', + 'tum_Latn-yor_Latn', 'tum_Latn-zul_Latn', 'twi_Latn-umb_Latn', + 'twi_Latn-wol_Latn', 'twi_Latn-xho_Latn', 'twi_Latn-yor_Latn', + 'twi_Latn-zul_Latn', 'uig_Arab-uzn_Latn', 'umb_Latn-wol_Latn', + 'umb_Latn-xho_Latn', 'umb_Latn-yor_Latn', 'umb_Latn-zul_Latn', + 'wol_Latn-xho_Latn', 'wol_Latn-yor_Latn', 'wol_Latn-zul_Latn', + 'xho_Latn-yor_Latn', 'xho_Latn-zul_Latn', 'yor_Latn-zul_Latn' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + self.download_subset('nllb', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_universal_dependencies(self): + subset = [ + 'af_afribooms', 'akk_pisandub', 'akk_riao', 'aqz_tudet', 'sq_tsa', + 'am_att', 'grc_perseus', 'grc_proiel', 'apu_ufpa', 'ar_nyuad', + 'ar_padt', 'ar_pud', 'hy_armtdp', 'aii_as', 'bm_crb', 'eu_bdt', + 'be_hse', 'bho_bhtb', 'br_keb', 'bg_btb', 'bxr_bdt', 'yue_hk', + 'ca_ancora', 'zh_cfl', 'zh_gsd', 'zh_gsdsimp', 'zh_hk', 'zh_pud', + 'ckt_hse', 'lzh_kyoto', 'cop_scriptorium', 'hr_set', 'cs_cac', + 'cs_cltt', 'cs_fictree', 'cs_pdt', 'cs_pud', 'da_ddt', 'nl_alpino', + 'nl_lassysmall', 'en_esl', 'en_ewt', 'en_gum', 'en_gumreddit', + 'en_lines', 'en_partut', 'en_pronouns', 'en_pud', 'myv_jr', + 'et_edt', 'et_ewt', 'fo_farpahc', 'fo_oft', 'fi_ftb', 'fi_ood', + 'fi_pud', 'fi_tdt', 'fr_fqb', 'fr_ftb', 'fr_gsd', 'fr_partut', + 'fr_pud', 'fr_sequoia', 'fr_spoken', 'gl_ctg', 'gl_treegal', + 'de_gsd', 'de_hdt', 'de_lit', 'de_pud', 'got_proiel', 'el_gdt', + 'he_htb', 'qhe_hiencs', 'hi_hdtb', 'hi_pud', 'hu_szeged', + 'is_icepahc', 'is_pud', 'id_csui', 'id_gsd', 'id_pud', 'ga_idt', + 'it_isdt', 'it_partut', 'it_postwita', 'it_pud', 'it_twittiro', + 'it_vit', 'ja_bccwj', 'ja_gsd', 'ja_modern', 'ja_pud', 'krl_kkpp', + 'kk_ktb', 'kfm_aha', 'koi_uh', 'kpv_ikdp', 'kpv_lattice', 'ko_gsd', + 'ko_kaist', 'ko_pud', 'kmr_mg', 'la_ittb', 'la_llct', 'la_perseus', + 'la_proiel', 'lv_lvtb', 'lt_alksnis', 'lt_hse', 'olo_kkpp', + 'mt_mudt', 'gv_cadhan', 'mr_ufal', 'gun_dooley', 'gun_thomas', + 'mdf_jr', 'myu_tudet', 'pcm_nsc', 'nyq_aha', 'sme_giella', + 'no_bokmaal', 'no_nynorsk', 'no_nynorsklia', 'cu_proiel', + 'fro_srcmf', 'orv_rnc', 'orv_torot', 'otk_tonqq', 'fa_perdt', + 'fa_seraji', 'pl_lfg', 'pl_pdb', 'pl_pud', 'pt_bosque', 'pt_gsd', + 'pt_pud', 'ro_nonstandard', 'ro_rrt', 'ro_simonero', 'ru_gsd', + 'ru_pud', 'ru_syntagrus', 'ru_taiga', 'sa_ufal', 'sa_vedic', + 'gd_arcosg', 'sr_set', 'sms_giellagas', 'sk_snk', 'sl_ssj', + 'sl_sst', 'soj_aha', 'ajp_madar', 'es_ancora', 'es_gsd', 'es_pud', + 'swl_sslc', 'sv_lines', 'sv_pud', 'sv_talbanken', 'gsw_uzh', + 'tl_trg', 'tl_ugnayan', 'ta_mwtt', 'ta_ttb', 'te_mtg', 'th_pud', + 'tpn_tudet', 'qtd_sagt', 'tr_boun', 'tr_gb', 'tr_imst', 'tr_pud', + 'uk_iu', 'hsb_ufal', 'ur_udtb', 'ug_udt', 'vi_vtb', 'wbp_ufal', + 'cy_ccg', 'wo_wtb', 'yo_ytb' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + 
self.download_subset('universal_dependencies', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_imdb(self): + dataset = MsDataset.load('imdb') + if isinstance(dataset, MsDataset): + lens = len(dataset) + print(f'dataset imdb len: {lens}') + self.assertTrue(lens > 0) + else: + assert isinstance(dataset, dict) + lens = {key: len(subset) for key, subset in dataset.items()} + print(f'dataset imdb len: {lens}') + self.assertTrue(all([_len > 0 for _len in lens.values()])) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_clue(self): + subset = [ + 'afqmc', 'tnews', 'iflytek', 'cmnli', 'cluewsc2020', 'csl', + 'cmrc2018', 'drcd', 'chid', 'c3', 'ocnli', 'diagnostics' + ] + for subset_name in subset: + self.download_subset('clue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_wikitext(self): + subset = [ + 'wikitext-103-v1', 'wikitext-2-v1', 'wikitext-103-raw-v1', + 'wikitext-2-raw-v1' + ] + for subset_name in subset: + self.download_subset('wikitext', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_xnli(self): + subset = [ + 'XNLI', 'tydiqa', 'SQuAD', 'PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', + 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el', 'PAN-X.en', 'PAN-X.es', + 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi', 'PAN-X.fr', + 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it', + 'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', + 'PAN-X.ml', 'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', + 'PAN-X.pt', 'PAN-X.ru', 'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', + 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr', 'PAN-X.ur', 'PAN-X.vi', + 'PAN-X.yo', 'PAN-X.zh', 'MLQA.ar.ar', 'MLQA.ar.de', 'MLQA.ar.vi', + 'MLQA.ar.zh', 'MLQA.ar.en', 'MLQA.ar.es', 'MLQA.ar.hi', + 'MLQA.de.ar', 'MLQA.de.de', 'MLQA.de.vi', 'MLQA.de.zh', + 'MLQA.de.en', 'MLQA.de.es', 'MLQA.de.hi', 'MLQA.vi.ar', + 'MLQA.vi.de', 'MLQA.vi.vi', 'MLQA.vi.zh', 'MLQA.vi.en', + 'MLQA.vi.es', 'MLQA.vi.hi', 'MLQA.zh.ar', 'MLQA.zh.de', + 'MLQA.zh.vi', 'MLQA.zh.zh', 'MLQA.zh.en', 'MLQA.zh.es', + 'MLQA.zh.hi', 'MLQA.en.ar', 'MLQA.en.de', 'MLQA.en.vi', + 'MLQA.en.zh', 'MLQA.en.en', 'MLQA.en.es', 'MLQA.en.hi', + 'MLQA.es.ar', 'MLQA.es.de', 'MLQA.es.vi', 'MLQA.es.zh', + 'MLQA.es.en', 'MLQA.es.es', 'MLQA.es.hi', 'MLQA.hi.ar', + 'MLQA.hi.de', 'MLQA.hi.vi', 'MLQA.hi.zh', 'MLQA.hi.en', + 'MLQA.hi.es', 'MLQA.hi.hi', 'XQuAD.ar', 'XQuAD.de', 'XQuAD.vi', + 'XQuAD.zh', 'XQuAD.en', 'XQuAD.es', 'XQuAD.hi', 'XQuAD.el', + 'XQuAD.ru', 'XQuAD.th', 'XQuAD.tr', 'bucc18.de', 'bucc18.fr', + 'bucc18.zh', 'bucc18.ru', 'PAWS-X.de', 'PAWS-X.en', 'PAWS-X.es', + 'PAWS-X.fr', 'PAWS-X.ja', 'PAWS-X.ko', 'PAWS-X.zh', 'tatoeba.afr', + 'tatoeba.ara', 'tatoeba.ben', 'tatoeba.bul', 'tatoeba.deu', + 'tatoeba.cmn', 'tatoeba.ell', 'tatoeba.est', 'tatoeba.eus', + 'tatoeba.fin', 'tatoeba.fra', 'tatoeba.heb', 'tatoeba.hin', + 'tatoeba.hun', 'tatoeba.ind', 'tatoeba.ita', 'tatoeba.jav', + 'tatoeba.jpn', 'tatoeba.kat', 'tatoeba.kaz', 'tatoeba.kor', + 'tatoeba.mal', 'tatoeba.mar', 'tatoeba.nld', 'tatoeba.pes', + 'tatoeba.por', 'tatoeba.rus', 'tatoeba.spa', 'tatoeba.swh', + 'tatoeba.tam', 'tatoeba.tel', 'tatoeba.tgl', 'tatoeba.tha', + 'tatoeba.tur', 'tatoeba.urd', 'tatoeba.vie', 'udpos.Afrikaans', + 'udpos.Arabic', 'udpos.Basque', 'udpos.Bulgarian', 'udpos.Dutch', + 'udpos.English', 'udpos.Estonian', 'udpos.Finnish', 'udpos.French', + 'udpos.German', 'udpos.Greek', 'udpos.Hebrew', 
'udpos.Hindi', + 'udpos.Hungarian', 'udpos.Indonesian', 'udpos.Italian', + 'udpos.Japanese', 'udpos.Kazakh', 'udpos.Korean', 'udpos.Chinese', + 'udpos.Marathi', 'udpos.Persian', 'udpos.Portuguese', + 'udpos.Russian', 'udpos.Spanish', 'udpos.Tagalog', 'udpos.Tamil', + 'udpos.Telugu', 'udpos.Thai', 'udpos.Turkish', 'udpos.Urdu', + 'udpos.Vietnamese', 'udpos.Yoruba' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + self.download_subset('xtreme', subset_name) diff --git a/tests/models/test_deberta_v2_backbone.py b/tests/models/test_deberta_v2_backbone.py new file mode 100644 index 00000000..706b18f8 --- /dev/null +++ b/tests/models/test_deberta_v2_backbone.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +from modelscope.models import Model +from modelscope.models.nlp.deberta_v2 import (DebertaV2ForMaskedLM, + DebertaV2Model) +from modelscope.utils.constant import Tasks + + +class DebertaV2BackboneTest(unittest.TestCase): + + def test_load_model(self): + model = Model.from_pretrained( + 'damo/nlp_debertav2_fill-mask_chinese-lite') + self.assertTrue(model.__class__ == DebertaV2ForMaskedLM) + model = Model.from_pretrained( + 'damo/nlp_debertav2_fill-mask_chinese-lite', task=Tasks.backbone) + self.assertTrue(model.__class__ == DebertaV2Model) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/outputs/__init__.py b/tests/outputs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/outputs/test_model_outputs.py b/tests/outputs/test_model_outputs.py new file mode 100644 index 00000000..31271869 --- /dev/null +++ b/tests/outputs/test_model_outputs.py @@ -0,0 +1,30 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import torch + +from modelscope.outputs import TextClassificationModelOutput +from modelscope.utils.test_utils import test_level + + +class TestModelOutput(unittest.TestCase): + + def setUp(self): + pass + + def tearDown(self): + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_model_outputs(self): + outputs = TextClassificationModelOutput(logits=torch.Tensor([1])) + self.assertEqual(outputs['logits'], torch.Tensor([1])) + self.assertEqual(outputs[0], torch.Tensor([1])) + self.assertEqual(outputs.logits, torch.Tensor([1])) + logits, loss = outputs + self.assertEqual(logits, torch.Tensor([1])) + self.assertTrue(loss is None) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/nlp/test_faq.py b/tests/pipelines/nlp/test_faq.py new file mode 100644 index 00000000..8bac55d4 --- /dev/null +++ b/tests/pipelines/nlp/test_faq.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForFaqRanking, SbertForFaqRetrieval +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FaqPipeline +from modelscope.preprocessors import FaqPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class FaqTest(unittest.TestCase): + model_id = '/Users/tanfan/Desktop/Workdir/Gitlab/maas/MaaS-lib/.faq_test_model' + param = { + 'query_set': ['明天星期几', '今天星期六', '今天星期六'], + 'support_set': [{ + 'text': '今天星期六', + 'label': 'label0' + }, { + 'text': '明天星期几', + 'label': 'label1' + }] + } + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + # def test_run_with_direct_file_download(self): + # cache_path = self.model_id # snapshot_download(self.model_id) + # preprocessor = FaqPreprocessor(cache_path) + # model = SbertForFaq(cache_path) + # pipeline_ins = FaqPipeline(model, preprocessor=preprocessor) + # + # result = pipeline_ins(self.param) + # print(result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = FaqPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.faq, model=model, preprocessor=preprocessor) + result = pipeline_ins(self.param) + print(result) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + # def test_run_with_model_name(self): + # pipeline_ins = pipeline(task=Tasks.faq, model=self.model_id) + # result = pipeline_ins(self.param) + # print(result) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + # def test_run_with_default_model(self): + # pipeline_ins = pipeline(task=Tasks.faq) + # print(pipeline_ins(self.param)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py index 21a4e0ce..17fffcaf 100644 --- a/tests/pipelines/test_conversational_text_to_sql.py +++ b/tests/pipelines/test_conversational_text_to_sql.py @@ -9,7 +9,8 @@ from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline from modelscope.preprocessors import ConversationalTextToSqlPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.nlp.nlp_utils import text2sql_tracking_and_print_results +from modelscope.utils.nlp.space_T_en.utils import \ + text2sql_tracking_and_print_results from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_dialog_intent_prediction.py b/tests/pipelines/test_dialog_intent_prediction.py index 5894297f..2ee46388 100644 --- a/tests/pipelines/test_dialog_intent_prediction.py +++ b/tests/pipelines/test_dialog_intent_prediction.py @@ -25,7 +25,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download(self.model_id, revision='update') + cache_path = snapshot_download(self.model_id) preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path) model = SpaceForDialogIntent( model_dir=cache_path, @@ -46,7 +46,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): 
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id, revision='update') + model = Model.from_pretrained(self.model_id) preprocessor = DialogIntentPredictionPreprocessor( model_dir=model.model_dir) @@ -64,10 +64,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, model=self.model_id, model_revision='update') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] for my_pipeline, item in list(zip(pipelines, self.test_case)): print(my_pipeline(item)) diff --git a/tests/pipelines/test_dialog_modeling.py b/tests/pipelines/test_dialog_modeling.py index 19d6ed2f..6b6259ce 100644 --- a/tests/pipelines/test_dialog_modeling.py +++ b/tests/pipelines/test_dialog_modeling.py @@ -115,8 +115,7 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download( - self.model_id, revision='task_oriented_conversation') + cache_path = snapshot_download(self.model_id) preprocessor = DialogModelingPreprocessor(model_dir=cache_path) model = SpaceForDialogModeling( @@ -130,8 +129,7 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained( - self.model_id, revision='task_oriented_conversation') + model = Model.from_pretrained(self.model_id) preprocessor = DialogModelingPreprocessor(model_dir=model.model_dir) pipelines = [ @@ -142,20 +140,12 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, - model=self.model_id, - model_revision='task_oriented_conversation') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] self.generate_and_print_dialog_response(pipelines) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipelines = [ - pipeline( - task=self.task, model_revision='task_oriented_conversation') - ] + pipelines = [pipeline(task=self.task)] self.generate_and_print_dialog_response(pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') diff --git a/tests/pipelines/test_dialog_state_tracking.py b/tests/pipelines/test_dialog_state_tracking.py index 81bdd9be..6cdd5ee7 100644 --- a/tests/pipelines/test_dialog_state_tracking.py +++ b/tests/pipelines/test_dialog_state_tracking.py @@ -3,13 +3,14 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SpaceForDialogStateTracking +from modelscope.models.nlp import SpaceForDST from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import DialogStateTrackingPipeline from modelscope.preprocessors import DialogStateTrackingPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.nlp.nlp_utils import tracking_and_print_dialog_states +from modelscope.utils.nlp.space.utils_dst 
import \ + tracking_and_print_dialog_states from modelscope.utils.test_utils import test_level @@ -85,9 +86,9 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download(self.model_id, revision='update') + cache_path = snapshot_download(self.model_id) - model = SpaceForDialogStateTracking(cache_path) + model = SpaceForDST.from_pretrained(cache_path) preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path) pipelines = [ DialogStateTrackingPipeline( @@ -101,7 +102,7 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id, revision='update') + model = Model.from_pretrained(self.model_id) preprocessor = DialogStateTrackingPreprocessor( model_dir=model.model_dir) @@ -115,10 +116,7 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, model=self.model_id, model_revision='update') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] tracking_and_print_dialog_states(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py index 7eea0ddf..2f66f516 100644 --- a/tests/pipelines/test_faq_question_answering.py +++ b/tests/pipelines/test_faq_question_answering.py @@ -47,9 +47,9 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - preprocessor = FaqQuestionAnsweringPreprocessor(cache_path) - model = SbertForFaqQuestionAnswering(cache_path) - model.load_checkpoint(cache_path) + preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained( + cache_path) + model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) pipeline_ins = FaqQuestionAnsweringPipeline( model, preprocessor=preprocessor) result = pipeline_ins(self.param) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 0e5e242b..568865c6 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -5,8 +5,7 @@ from regex import R from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM, - VecoForMaskedLM) +from modelscope.models.nlp import SbertForMaskedLM, VecoForMaskedLM from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline from modelscope.preprocessors import NLPPreprocessor @@ -55,7 +54,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): model_dir = snapshot_download(self.model_id_sbert[language]) preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = StructBertForMaskedLM.from_pretrained(model_dir) + model = SbertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, 
preprocessor=preprocessor) @@ -130,18 +129,6 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') - # bert - language = 'zh' - model = Model.from_pretrained(self.model_id_bert, revision='beta') - preprocessor = NLPPreprocessor( - model.model_dir, first_sequence='sentence', second_sequence=None) - pipeline_ins = pipeline( - Tasks.fill_mask, model=model, preprocessor=preprocessor) - pipeline_ins.model, f'fill_mask_bert_{language}' - print( - f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' - f'{pipeline_ins(self.test_inputs[language])}\n') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index db4b9912..5f2dcb25 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -27,9 +27,8 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) tokenizer = SequenceClassificationPreprocessor(cache_path) - model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = SequenceClassificationPipeline( - model, preprocessor=tokenizer) + model = Model.from_pretrained(cache_path) + pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py index 61cdfe73..038a90f0 100644 --- a/tests/pipelines/test_part_of_speech.py +++ b/tests/pipelines/test_part_of_speech.py @@ -23,7 +23,7 @@ class PartOfSpeechTest(unittest.TestCase): model = TokenClassificationModel.from_pretrained(cache_path) pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( - Tasks.token_classification, model=model, preprocessor=tokenizer) + Tasks.part_of_speech, model=model, preprocessor=tokenizer) print(f'sentence: {self.sentence}\n' f'pipeline1:{pipeline1(input=self.sentence)}') print() diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index 739dd7ab..e96724a8 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -4,7 +4,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SentenceEmbedding +from modelscope.models.nlp import BertForSentenceEmbedding from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import SentenceEmbeddingPipeline from modelscope.preprocessors import SentenceEmbeddingPreprocessor @@ -40,7 +40,7 @@ class 
SentenceEmbeddingTest(unittest.TestCase): def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) tokenizer = SentenceEmbeddingPreprocessor(cache_path) - model = SentenceEmbedding.from_pretrained(cache_path) + model = BertForSentenceEmbedding.from_pretrained(cache_path) pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_embedding, model=model, preprocessor=tokenizer) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 288d38c7..76db0a8f 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -28,8 +28,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): cache_path = snapshot_download(self.model_id) tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = SequenceClassificationPipeline( - model, preprocessor=tokenizer) + pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, model=model, preprocessor=tokenizer) print('test1') diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index d0b1b40f..b3d9b9d6 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp.task_models.sequence_classification import \ SequenceClassificationModel from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -28,8 +28,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, tokenizer = SequenceClassificationPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( self.model_id, num_labels=2, revision='beta') - pipeline1 = SequenceClassificationPipeline( - model, preprocessor=tokenizer) + pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.text_classification, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 44f1531b..eece7f57 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -13,7 +13,7 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor -from 
modelscope.preprocessors.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py new file mode 100644 index 00000000..5b38e116 --- /dev/null +++ b/tests/pipelines/test_text_classification.py @@ -0,0 +1,100 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.models import Model +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import TextClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + sentence1 = 'i like this wonderful place' + + def setUp(self) -> None: + self.model_id = 'damo/bert-base-sst2' + self.task = Tasks.text_classification + + def predict(self, pipeline_ins: TextClassificationPipeline): + from easynlp.appzoo import load_dataset + + set = load_dataset('glue', 'sst2') + data = set['test']['sentence'][:3] + + results = pipeline_ins(data[0]) + print(results) + results = pipeline_ins(data[1]) + print(results) + + print(data) + + def printDataset(self, dataset: MsDataset): + for i, r in enumerate(dataset): + if i > 10: + break + print(r) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = SequenceClassificationPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + task=Tasks.text_classification, + model=model, + preprocessor=preprocessor) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline_ins(input=self.sentence1)}') + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_model_name(self): + text_classification = pipeline( + task=Tasks.text_classification, model=self.model_id) + result = text_classification( + MsDataset.load( + 'xcopa', + subset_name='translation-et', + namespace='damotest', + split='test', + target='premise')) + self.printDataset(result) + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_default_model(self): + text_classification = pipeline(task=Tasks.text_classification) + result = text_classification( + MsDataset.load( + 'xcopa', + subset_name='translation-et', + namespace='damotest', + split='test', + target='premise')) + self.printDataset(result) + + # @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_modelscope_dataset(self): + text_classification = pipeline(task=Tasks.text_classification) + # loaded from modelscope dataset + dataset = MsDataset.load( + 'xcopa', + subset_name='translation-et', + namespace='damotest', + split='test', + target='premise') + result = 
text_classification(dataset) + self.printDataset(result) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py index 57fa809c..0b43e8b4 100644 --- a/tests/pipelines/test_text_ranking.py +++ b/tests/pipelines/test_text_ranking.py @@ -4,7 +4,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import TextRanking +from modelscope.models.nlp import BertForTextRanking from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextRankingPipeline from modelscope.preprocessors import TextRankingPreprocessor @@ -33,7 +33,7 @@ class TextRankingTest(unittest.TestCase): for model_id in self.models: cache_path = snapshot_download(model_id) tokenizer = TextRankingPreprocessor(cache_path) - model = TextRanking.from_pretrained(cache_path) + model = BertForTextRanking.from_pretrained(cache_path) pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.text_ranking, model=model, preprocessor=tokenizer) diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index aa8aba5c..ae780793 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -8,7 +8,7 @@ from modelscope.metainfo import Preprocessors, Trainers from modelscope.models import Model from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline -from modelscope.trainers import build_trainer +from modelscope.trainers import NlpTrainerArguments, build_trainer from modelscope.trainers.hooks import Hook from modelscope.trainers.nlp_trainer import (EpochBasedTrainer, NlpEpochBasedTrainer) @@ -38,6 +38,52 @@ class TestFinetuneSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_cfg_class(self): + dataset = MsDataset.load('clue', subset_name='tnews') + train_dataset = dataset['train'] + validation_dataset = dataset['validation'] + cfg_modify_fn = NlpTrainerArguments( + task=Tasks.text_classification, + preprocessor_type=Preprocessors.sen_cls_tokenizer, + train_first_sequence='sentence', + train_label='label', + labels=[ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', + '12', '13', '14' + ], + max_epochs=5, + optimizer_args={ + 'lr': 3e-5, + }, + lr_scheduler_args={ + 'total_iters': int(len(train_dataset) / 32) * 5, + }, + checkpoint_saving_type='BestCkptSaverHook', + metric_key='accuracy', + train_batch_size_per_gpu=32, + checkpoint_interval=1, + train_workers_per_gpu=0, + checkpoint_by_epoch=False, + evaluation_interval=1, + evaluation_by_epoch=False, + eval_workers_per_gpu=0, + metrics=['seq-cls-metric'], + ) + + kwargs = dict( + model='damo/nlp_structbert_backbone_base_std', + train_dataset=train_dataset, + eval_dataset=validation_dataset, + work_dir=self.tmp_dir, + seed=42, + cfg_modify_fn=cfg_modify_fn) + + os.environ['LOCAL_RANK'] = '0' + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + @unittest.skip( 'Skip testing trainer repeatable, because it\'s unstable in daily UT') def 
test_trainer_repeatable(self): @@ -330,7 +376,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 2, 'dataloader': { 'batch_size_per_gpu': 16, - 'workers_per_gpu': 1 + 'workers_per_gpu': 0 }, 'optimizer': { 'type': 'AdamW', @@ -351,7 +397,6 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'hooks': [{ 'type': 'CheckpointHook', 'interval': 1, - 'save_dir': '/root' }, { 'type': 'TextLoggerHook', 'interval': 1 @@ -366,7 +411,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): cfg['evaluation'] = { 'dataloader': { 'batch_size_per_gpu': 128, - 'workers_per_gpu': 1, + 'workers_per_gpu': 0, 'shuffle': False } } diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 5b0c9982..9380ad0f 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -7,8 +7,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Metrics from modelscope.models.base import Model -from modelscope.models.nlp.sequence_classification import \ - SbertForSequenceClassification +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline from modelscope.trainers import EpochBasedTrainer, build_trainer
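
A short usage sketch of the behaviour exercised by tests/models/test_deberta_v2_backbone.py and tests/outputs/test_model_outputs.py above: loading either the task head or the bare backbone from the same checkpoint, and reading the structured output object by key, index, attribute, or tuple unpacking. This is not part of the patch; it only restates what those two tests assert, using the same model id.

# Sketch only; mirrors the assertions in test_deberta_v2_backbone.py and
# test_model_outputs.py above.
import torch

from modelscope.models import Model
from modelscope.models.nlp.deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model
from modelscope.outputs import TextClassificationModelOutput
from modelscope.utils.constant import Tasks

# Default loading resolves the task head recorded in the model configuration.
fill_mask_model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
assert isinstance(fill_mask_model, DebertaV2ForMaskedLM)

# Passing task=Tasks.backbone loads the bare encoder from the same checkpoint.
backbone = Model.from_pretrained(
    'damo/nlp_debertav2_fill-mask_chinese-lite', task=Tasks.backbone)
assert isinstance(backbone, DebertaV2Model)

# Structured outputs can be read by key, index, attribute, or tuple unpacking.
output = TextClassificationModelOutput(logits=torch.tensor([0.1, 0.9]))
assert torch.equal(output['logits'], output.logits)
assert torch.equal(output[0], output.logits)
logits, loss = output  # unset fields unpack as None, per test_model_outputs.py
assert loss is None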
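
Several of the modified pipeline tests above (test_nli.py, test_sentence_similarity.py, test_sentiment_classification.py) construct TextClassificationPipeline directly instead of the old SequenceClassificationPipeline. A sketch of the two equivalent construction paths follows; the model id is a placeholder (the tests define theirs outside the hunks shown here), everything else follows test_nli.py.

# Sketch of the two construction paths used in test_nli.py above.
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import TextClassificationPipeline
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks

MODEL_ID = '<a text-classification model id>'  # placeholder, not a real id
cache_path = snapshot_download(MODEL_ID)
tokenizer = SequenceClassificationPreprocessor(cache_path)
model = Model.from_pretrained(cache_path)

# Direct construction of the pipeline class ...
pipe1 = TextClassificationPipeline(model, preprocessor=tokenizer)
# ... or through the generic pipeline() factory with an explicit task.
pipe2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer)

print(pipe1(input=('sentence one', 'sentence two')))
print(pipe2(input=('sentence one', 'sentence two')))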
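
Finally, test_trainer_cfg_class above shows NlpTrainerArguments being passed straight in as a cfg_modify_fn, so the training configuration can be adjusted through a dataclass instead of a hand-written modify function. A condensed sketch of that pattern, keeping only a subset of the arguments used in the test (depending on the defaults, more of the original arguments such as lr_scheduler_args or the checkpoint settings may be needed); dataset, model id, and argument names are taken verbatim from the test, the work_dir value is an arbitrary writable directory.

# Condensed from test_trainer_cfg_class above.
from modelscope.metainfo import Preprocessors, Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import NlpTrainerArguments, build_trainer
from modelscope.utils.constant import Tasks

dataset = MsDataset.load('clue', subset_name='tnews')

cfg_modify_fn = NlpTrainerArguments(
    task=Tasks.text_classification,
    preprocessor_type=Preprocessors.sen_cls_tokenizer,
    train_first_sequence='sentence',
    train_label='label',
    labels=[str(i) for i in range(15)],  # the 15 tnews label ids, as in the test
    max_epochs=1,
    optimizer_args={'lr': 3e-5},
    train_batch_size_per_gpu=32,
    metrics=['seq-cls-metric'],
)

trainer = build_trainer(
    name=Trainers.nlp_base_trainer,
    default_args=dict(
        model='damo/nlp_structbert_backbone_base_std',
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        work_dir='./tnews_work_dir',  # assumption: any writable directory
        cfg_modify_fn=cfg_modify_fn,
    ),
)
trainer.train()

Because the arguments object is callable as a cfg_modify_fn, it plugs into the existing trainer entry point unchanged, which is exactly how the test wires it up.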