
[to #42322933] Add: nli, sentiment_classification, dialog_intent, dialog_modeling

Added the nli, sentiment_classification, dialog_intent and dialog_modeling pipelines, and introduced some simple abstractions for sequence classification in NLP.
Removed zero_shot_classification.
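A minimal usage sketch of the newly registered pipelines (model IDs and input formats below are placeholders, not part of this change; the authoritative examples live in tests/pipelines/test_nli.py, test_sentiment_classification.py, test_dialog_intent_prediction.py and test_dialog_modeling.py):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# NLI over a sentence pair; '<nli-model-id>' is a placeholder model id.
nli_pipe = pipeline(task=Tasks.nli, model='<nli-model-id>')
print(nli_pipe(('句子一', '句子二')))  # assumed input: a pair of sentences

# Binary sentiment classification over a single sentence.
senti_pipe = pipeline(task=Tasks.sentiment_classification,
                      model='<sentiment-model-id>')
print(senti_pipe('这本书真的很不错'))

# The SPACE-based dialog pipelines are built the same way via
# Tasks.dialog_intent_prediction and Tasks.dialog_modeling.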
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9159089
master
zhangzhicheng.zzc huangjun.hj 3 years ago
parent
commit
5b98cc1513
68 changed files with 9263 additions and 130 deletions
  1. +10 -0 modelscope/metainfo.py
  2. +7 -3 modelscope/models/__init__.py
  3. +4 -0 modelscope/models/nlp/__init__.py
  4. +2 -2 modelscope/models/nlp/bert_for_sequence_classification.py
  5. +12 -6 modelscope/models/nlp/masked_language_model.py
  6. +8 -3 modelscope/models/nlp/palm_for_text_generation.py
  7. +23 -0 modelscope/models/nlp/sbert_for_nli.py
  8. +8 -71 modelscope/models/nlp/sbert_for_sentence_similarity.py
  9. +22 -0 modelscope/models/nlp/sbert_for_sentiment_classification.py
  10. +71 -0 modelscope/models/nlp/sbert_for_sequence_classification.py
  11. +18 -10 modelscope/models/nlp/sbert_for_token_classification.py
  12. +0 -0 modelscope/models/nlp/space/__init__.py
  13. +81 -0 modelscope/models/nlp/space/dialog_intent_prediction_model.py
  14. +82 -0 modelscope/models/nlp/space/dialog_modeling_model.py
  15. +3 -0 modelscope/models/nlp/space/model/__init__.py
  16. +283 -0 modelscope/models/nlp/space/model/gen_unified_transformer.py
  17. +287 -0 modelscope/models/nlp/space/model/generator.py
  18. +197 -0 modelscope/models/nlp/space/model/intent_unified_transformer.py
  19. +101 -0 modelscope/models/nlp/space/model/model_base.py
  20. +313 -0 modelscope/models/nlp/space/model/unified_transformer.py
  21. +0 -0 modelscope/models/nlp/space/modules/__init__.py
  22. +65 -0 modelscope/models/nlp/space/modules/embedder.py
  23. +41 -0 modelscope/models/nlp/space/modules/feedforward.py
  24. +62 -0 modelscope/models/nlp/space/modules/functions.py
  25. +105 -0 modelscope/models/nlp/space/modules/multihead_attention.py
  26. +70 -0 modelscope/models/nlp/space/modules/transformer_block.py
  27. +11 -0 modelscope/pipelines/builder.py
  28. +4 -0 modelscope/pipelines/nlp/__init__.py
  29. +53 -0 modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
  30. +49 -0 modelscope/pipelines/nlp/dialog_modeling_pipeline.py
  31. +15 -3 modelscope/pipelines/nlp/fill_mask_pipeline.py
  32. +73 -0 modelscope/pipelines/nlp/nli_pipeline.py
  33. +18 -8 modelscope/pipelines/nlp/sentence_similarity_pipeline.py
  34. +78 -0 modelscope/pipelines/nlp/sentiment_classification_pipeline.py
  35. +16 -7 modelscope/pipelines/nlp/text_generation_pipeline.py
  36. +18 -8 modelscope/pipelines/nlp/word_segmentation_pipeline.py
  37. +44 -0 modelscope/pipelines/outputs.py
  38. +2 -0 modelscope/preprocessors/__init__.py
  39. +140 -5 modelscope/preprocessors/nlp.py
  40. +0 -0 modelscope/preprocessors/space/__init__.py
  41. +57 -0 modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py
  42. +51 -0 modelscope/preprocessors/space/dialog_modeling_preprocessor.py
  43. +0 -0 modelscope/preprocessors/space/fields/__init__.py
  44. +1523 -0 modelscope/preprocessors/space/fields/dst_processors.py
  45. +675 -0 modelscope/preprocessors/space/fields/gen_field.py
  46. +1082 -0 modelscope/preprocessors/space/fields/intent_field.py
  47. +668 -0 modelscope/preprocessors/space/tokenizer.py
  48. +0 -0 modelscope/trainers/nlp/space/__init__.py
  49. +0 -0 modelscope/trainers/nlp/space/metrics/__init__.py
  50. +73 -0 modelscope/trainers/nlp/space/metrics/metrics_tracker.py
  51. +0 -0 modelscope/trainers/nlp/space/trainer/__init__.py
  52. +761 -0 modelscope/trainers/nlp/space/trainer/gen_trainer.py
  53. +821 -0 modelscope/trainers/nlp/space/trainer/intent_trainer.py
  54. +4 -0 modelscope/utils/constant.py
  55. +0 -0 modelscope/utils/nlp/__init__.py
  56. +0 -0 modelscope/utils/nlp/space/__init__.py
  57. +66 -0 modelscope/utils/nlp/space/args.py
  58. +52 -0 modelscope/utils/nlp/space/criterions.py
  59. +313 -0 modelscope/utils/nlp/space/db_ops.py
  60. +204 -0 modelscope/utils/nlp/space/ontology.py
  61. +6 -0 modelscope/utils/nlp/space/scores.py
  62. +188 -0 modelscope/utils/nlp/space/utils.py
  63. +3 -1 requirements/nlp.txt
  64. +61 -0 tests/pipelines/test_dialog_intent_prediction.py
  65. +147 -0 tests/pipelines/test_dialog_modeling.py
  66. +52 -0 tests/pipelines/test_nli.py
  67. +58 -0 tests/pipelines/test_sentiment_classification.py
  68. +2 -3 tests/pipelines/test_word_segmentation.py

+10 -0 modelscope/metainfo.py

@@ -16,6 +16,7 @@ class Models(object):
     palm = 'palm-v2'
     structbert = 'structbert'
     veco = 'veco'
+    space = 'space'
 
     # audio models
     sambert_hifi_16k = 'sambert-hifi-16k'
@@ -52,7 +53,11 @@ class Pipelines(object):
     word_segmentation = 'word-segmentation'
     text_generation = 'text-generation'
     sentiment_analysis = 'sentiment-analysis'
+    sentiment_classification = 'sentiment-classification'
     fill_mask = 'fill-mask'
+    nli = 'nli'
+    dialog_intent_prediction = 'dialog-intent-prediction'
+    dialog_modeling = 'dialog-modeling'
     zero_shot_classification = 'zero-shot-classification'
 
     # audio tasks
@@ -97,6 +102,11 @@ class Preprocessors(object):
     # nlp preprocessor
     bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
     palm_text_gen_tokenizer = 'palm-text-gen-tokenizer'
+    token_cls_tokenizer = 'token-cls-tokenizer'
+    nli_tokenizer = 'nli-tokenizer'
+    sen_cls_tokenizer = 'sen-cls-tokenizer'
+    dialog_intent_preprocessor = 'dialog-intent-preprocessor'
+    dialog_modeling_preprocessor = 'dialog-modeling-preprocessor'
     sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
     zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'




+7 -3 modelscope/models/__init__.py

@@ -15,9 +15,13 @@ except ModuleNotFoundError as e:
 try:
     from .audio.kws import GenericKeyWordSpotting
     from .multi_modal import OfaForImageCaptioning
-    from .nlp import (BertForSequenceClassification,
-                      SbertForSentenceSimilarity,
-                      SbertForZeroShotClassification)
+    from .nlp import (BertForMaskedLM, BertForSequenceClassification,
+                      SbertForNLI, SbertForSentenceSimilarity,
+                      SbertForSentimentClassification,
+                      SbertForTokenClassification,
+                      SbertForZeroShotClassification, SpaceForDialogIntent,
+                      SpaceForDialogModeling, StructBertForMaskedLM,
+                      VecoForMaskedLM)
     from .audio.ans.frcrn import FRCRNModel
 except ModuleNotFoundError as e:
     if str(e) == "No module named 'pytorch'":


+4 -0 modelscope/models/nlp/__init__.py

@@ -1,6 +1,10 @@
 from .bert_for_sequence_classification import *  # noqa F403
 from .masked_language_model import *  # noqa F403
 from .palm_for_text_generation import *  # noqa F403
+from .sbert_for_nli import *  # noqa F403
 from .sbert_for_sentence_similarity import *  # noqa F403
+from .sbert_for_sentiment_classification import *  # noqa F403
 from .sbert_for_token_classification import *  # noqa F403
 from .sbert_for_zero_shot_classification import *  # noqa F403
+from .space.dialog_intent_prediction_model import *  # noqa F403
+from .space.dialog_modeling_model import *  # noqa F403

+2 -2 modelscope/models/nlp/bert_for_sequence_classification.py

@@ -4,8 +4,8 @@ from typing import Any, Dict
 import json
 import numpy as np
 
-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..base import Model
 from ..builder import MODELS




+12 -6 modelscope/models/nlp/masked_language_model.py

@@ -16,16 +16,22 @@ class MaskedLanguageModelBase(Model):
         super().__init__(model_dir, *args, **kwargs)
         self.model = self.build_model()
 
-    def build_model():
+    def build_model(self):
         raise NotImplementedError()
 
+    def train(self):
+        return self.model.train()
+
+    def eval(self):
+        return self.model.eval()
+
     @property
     def config(self):
         if hasattr(self.model, 'config'):
             return self.model.config
         return None
 
-    def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
         """return the result by the model
 
         Args:
@@ -35,10 +41,10 @@ class MaskedLanguageModelBase(Model):
             Dict[str, np.ndarray]: results
         """
         rst = self.model(
-            input_ids=inputs['input_ids'],
-            attention_mask=inputs['attention_mask'],
-            token_type_ids=inputs['token_type_ids'])
-        return {'logits': rst['logits'], 'input_ids': inputs['input_ids']}
+            input_ids=input['input_ids'],
+            attention_mask=input['attention_mask'],
+            token_type_ids=input['token_type_ids'])
+        return {'logits': rst['logits'], 'input_ids': input['input_ids']}
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert)


+8 -3 modelscope/models/nlp/palm_for_text_generation.py

@@ -1,7 +1,7 @@
 from typing import Dict
 
-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
@@ -20,13 +20,18 @@ class PalmForTextGeneration(Model):
                 default loader to load model weights, by default None.
         """
         super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
 
         from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
         model = PalmForConditionalGeneration.from_pretrained(model_dir)
         self.tokenizer = model.tokenizer
         self.generator = Translator(model)
 
+    def train(self):
+        return self.generator.train()
+
+    def eval(self):
+        return self.generator.eval()
+
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """return the result by the model




+23 -0 modelscope/models/nlp/sbert_for_nli.py

@@ -0,0 +1,23 @@
from ...metainfo import Models
from ...utils.constant import Tasks
from ..builder import MODELS
from .sbert_for_sequence_classification import \
SbertForSequenceClassificationBase

__all__ = ['SbertForNLI']


@MODELS.register_module(Tasks.nli, module_name=Models.structbert)
class SbertForNLI(SbertForSequenceClassificationBase):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(
model_dir, *args, model_args={'num_labels': 3}, **kwargs)
assert self.model.config.num_labels == 3

+8 -71 modelscope/models/nlp/sbert_for_sentence_similarity.py

@@ -1,46 +1,15 @@
-import os
-from typing import Any, Dict
-
-import json
-import numpy as np
-import torch
-from sofa import SbertModel
-from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel
-from torch import nn
-
-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
-from ..base import Model, Tensor
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..builder import MODELS
+from .sbert_for_sequence_classification import \
+    SbertForSequenceClassificationBase
 
 __all__ = ['SbertForSentenceSimilarity']
 
 
-class SbertTextClassifier(SbertPreTrainedModel):
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.config = config
-        self.encoder = SbertModel(config, add_pooling_layer=True)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-    def forward(self, input_ids=None, token_type_ids=None):
-        outputs = self.encoder(
-            input_ids,
-            token_type_ids=token_type_ids,
-            return_dict=None,
-        )
-        pooled_output = outputs[1]
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        return logits
-
-
 @MODELS.register_module(
     Tasks.sentence_similarity, module_name=Models.structbert)
-class SbertForSentenceSimilarity(Model):
+class SbertForSentenceSimilarity(SbertForSequenceClassificationBase):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """initialize the sentence similarity model from the `model_dir` path.
@@ -50,39 +19,7 @@ class SbertForSentenceSimilarity(Model):
             model_cls (Optional[Any], optional): model loader, if None, use the
                 default loader to load model weights, by default None.
         """
-        super().__init__(model_dir, *args, **kwargs)
+        super().__init__(
+            model_dir, *args, model_args={'num_labels': 2}, **kwargs)
         self.model_dir = model_dir
-
-        self.model = SbertTextClassifier.from_pretrained(
-            model_dir, num_labels=2)
-        self.model.eval()
-        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
-        with open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.id2label = {idx: name for name, idx in self.label_mapping.items()}
-
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
-        """return the result by the model
-
-        Args:
-            input (Dict[str, Any]): the preprocessed data
-
-        Returns:
-            Dict[str, np.ndarray]: results
-            Example:
-                {
-                    'predictions': array([1]), # lable 0-negative 1-positive
-                    'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
-                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
-                }
-        """
-        input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
-        token_type_ids = torch.tensor(
-            input['token_type_ids'], dtype=torch.long)
-        with torch.no_grad():
-            logits = self.model(input_ids, token_type_ids)
-        probs = logits.softmax(-1).numpy()
-        pred = logits.argmax(-1).numpy()
-        logits = logits.numpy()
-        res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
-        return res
+        assert self.model.config.num_labels == 2

+22 -0 modelscope/models/nlp/sbert_for_sentiment_classification.py

@@ -0,0 +1,22 @@
from ...metainfo import Models
from ...utils.constant import Tasks
from ..builder import MODELS
from .sbert_for_sequence_classification import \
SbertForSequenceClassificationBase

__all__ = ['SbertForSentimentClassification']


@MODELS.register_module(
Tasks.sentiment_classification, module_name=Models.structbert)
class SbertForSentimentClassification(SbertForSequenceClassificationBase):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""
super().__init__(
model_dir, *args, model_args={'num_labels': 2}, **kwargs)
assert self.model.config.num_labels == 2

+71 -0 modelscope/models/nlp/sbert_for_sequence_classification.py

@@ -0,0 +1,71 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel
from torch import nn

from ..base import Model


class SbertTextClassfier(SbertPreTrainedModel):

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.encoder = SbertModel(config, add_pooling_layer=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, input_ids=None, token_type_ids=None):
outputs = self.encoder(
input_ids,
token_type_ids=token_type_ids,
return_dict=None,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return {'logits': logits}


class SbertForSequenceClassificationBase(Model):

def __init__(self, model_dir: str, model_args=None, *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
if model_args is None:
model_args = {}
self.model = SbertTextClassfier.from_pretrained(
model_dir, **model_args)
self.id2label = {}
self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
if os.path.exists(self.label_path):
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {
idx: name
for name, idx in self.label_mapping.items()
}

def train(self):
return self.model.train()

def eval(self):
return self.model.eval()

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
token_type_ids = torch.tensor(
input['token_type_ids'], dtype=torch.long)
return self.model.forward(input_ids, token_type_ids)

def postprocess(self, input, **kwargs):
logits = input['logits']
probs = logits.softmax(-1).numpy()
pred = logits.argmax(-1).numpy()
logits = logits.numpy()
res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
return res
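SbertForSequenceClassificationBase above centralizes weight loading, the optional label_mapping.json lookup and the logits post-processing, so the task classes in this change (SbertForNLI, SbertForSentimentClassification, SbertForSentenceSimilarity) only pin down the label count. A hedged sketch of what one more task-specific subclass would look like; the task key and the 4-way label setup are made up for illustration and are not part of this changeset:

from ..builder import MODELS
from .sbert_for_sequence_classification import \
    SbertForSequenceClassificationBase


@MODELS.register_module('my-new-task', module_name='structbert')  # hypothetical task key
class SbertForMyNewTask(SbertForSequenceClassificationBase):

    def __init__(self, model_dir: str, *args, **kwargs):
        # Only the label count differs between the sequence classification
        # subclasses in this diff.
        super().__init__(
            model_dir, *args, model_args={'num_labels': 4}, **kwargs)
        assert self.model.config.num_labels == 4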

+18 -10 modelscope/models/nlp/sbert_for_token_classification.py

@@ -2,18 +2,17 @@ from typing import Any, Dict, Union
 
 import numpy as np
 import torch
-from sofa import SbertConfig, SbertForTokenClassification
 
-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
-__all__ = ['StructBertForTokenClassification']
+__all__ = ['SbertForTokenClassification']
 
 
 @MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
-class StructBertForTokenClassification(Model):
+class SbertForTokenClassification(Model):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """initialize the word segmentation model from the `model_dir` path.
@@ -25,9 +24,16 @@ class StructBertForTokenClassification(Model):
         """
         super().__init__(model_dir, *args, **kwargs)
         self.model_dir = model_dir
-        self.model = SbertForTokenClassification.from_pretrained(
+        import sofa
+        self.model = sofa.SbertForTokenClassification.from_pretrained(
             self.model_dir)
-        self.config = SbertConfig.from_pretrained(self.model_dir)
+        self.config = sofa.SbertConfig.from_pretrained(self.model_dir)
+
+    def train(self):
+        return self.model.train()
+
+    def eval(self):
+        return self.model.eval()
 
     def forward(self, input: Dict[str,
                                   Any]) -> Dict[str, Union[str, np.ndarray]]:
@@ -46,10 +52,12 @@ class StructBertForTokenClassification(Model):
         }
         """
         input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
-        output = self.model(input_ids)
-        logits = output.logits
+        return {**self.model(input_ids), 'text': input['text']}
+
+    def postprocess(self, input: Dict[str, Tensor],
+                    **kwargs) -> Dict[str, Tensor]:
+        logits = input['logits']
         pred = torch.argmax(logits[0], dim=-1)
         pred = pred.numpy()
 
         rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
         return rst

+0 -0 modelscope/models/nlp/space/__init__.py


+81 -0 modelscope/models/nlp/space/dialog_intent_prediction_model.py

@@ -0,0 +1,81 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict

from ....metainfo import Models
from ....preprocessors.space.fields.intent_field import IntentBPETextField
from ....trainers.nlp.space.trainer.intent_trainer import IntentTrainer
from ....utils.config import Config
from ....utils.constant import ModelFile, Tasks
from ...base import Model, Tensor
from ...builder import MODELS
from .model.generator import Generator
from .model.model_base import SpaceModelBase

__all__ = ['SpaceForDialogIntent']


@MODELS.register_module(
Tasks.dialog_intent_prediction, module_name=Models.space)
class SpaceForDialogIntent(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the test generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""

super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
self.config = kwargs.pop(
'config',
Config.from_file(
os.path.join(self.model_dir, ModelFile.CONFIGURATION)))
self.text_field = kwargs.pop(
'text_field',
IntentBPETextField(self.model_dir, config=self.config))

self.generator = Generator.create(self.config, reader=self.text_field)
self.model = SpaceModelBase.create(
model_dir=model_dir,
config=self.config,
reader=self.text_field,
generator=self.generator)

def to_tensor(array):
"""
numpy array -> tensor
"""
import torch
array = torch.tensor(array)
return array.cuda() if self.config.use_gpu else array

self.trainer = IntentTrainer(
model=self.model,
to_tensor=to_tensor,
config=self.config,
reader=self.text_field)
self.trainer.load()

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'pred': array([...])  # prediction scores over the intent label set
}
"""
import numpy as np
pred = self.trainer.forward(input)
pred = np.squeeze(pred[0], 0)

return {'pred': pred}

+82 -0 modelscope/models/nlp/space/dialog_modeling_model.py

@@ -0,0 +1,82 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict, Optional

from ....metainfo import Models
from ....preprocessors.space.fields.gen_field import MultiWOZBPETextField
from ....trainers.nlp.space.trainer.gen_trainer import MultiWOZTrainer
from ....utils.config import Config
from ....utils.constant import ModelFile, Tasks
from ...base import Model, Tensor
from ...builder import MODELS
from .model.generator import Generator
from .model.model_base import SpaceModelBase

__all__ = ['SpaceForDialogModeling']


@MODELS.register_module(Tasks.dialog_modeling, module_name=Models.space)
class SpaceForDialogModeling(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the test generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""

super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
self.config = kwargs.pop(
'config',
Config.from_file(
os.path.join(self.model_dir, ModelFile.CONFIGURATION)))
self.text_field = kwargs.pop(
'text_field',
MultiWOZBPETextField(self.model_dir, config=self.config))
self.generator = Generator.create(self.config, reader=self.text_field)
self.model = SpaceModelBase.create(
model_dir=model_dir,
config=self.config,
reader=self.text_field,
generator=self.generator)

def to_tensor(array):
"""
numpy array -> tensor
"""
import torch
array = torch.tensor(array)
return array.cuda() if self.config.use_gpu else array

self.trainer = MultiWOZTrainer(
model=self.model,
to_tensor=to_tensor,
config=self.config,
reader=self.text_field,
evaluator=None)
self.trainer.load()

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, Any]: the updated dialog turn (`pv_turn`) returned by the
trainer, which is passed back in as `history` on the next turn
"""

turn = {'user': input['user']}
old_pv_turn = input['history']

pv_turn = self.trainer.forward(turn=turn, old_pv_turn=old_pv_turn)

return pv_turn

+3 -0 modelscope/models/nlp/space/model/__init__.py

@@ -0,0 +1,3 @@
from .gen_unified_transformer import GenUnifiedTransformer
from .intent_unified_transformer import IntentUnifiedTransformer
from .unified_transformer import UnifiedTransformer

+283 -0 modelscope/models/nlp/space/model/gen_unified_transformer.py

@@ -0,0 +1,283 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch

from .unified_transformer import UnifiedTransformer


class GenUnifiedTransformer(UnifiedTransformer):
"""
Implement generation unified transformer.
"""

def __init__(self, model_dir, config, reader, generator):
super(GenUnifiedTransformer, self).__init__(model_dir, config, reader,
generator)
self.understand = config.BPETextField.understand

if self.use_gpu:
self.cuda()
return

def _forward(self, inputs, is_training, with_label):
""" Real forward process of model in different mode(train/test). """

def cat(x, y, dim=1):
return torch.cat([x, y], dim=dim)

outputs = {}

if self.understand or self.policy:
if self.understand:
prompt_token = inputs['understand_token']
prompt_mask = inputs['understand_mask']
if self.policy:
prompt_token = cat(prompt_token, inputs['policy_token'])
prompt_mask = cat(prompt_mask, inputs['policy_mask'])
else:
prompt_token = inputs['policy_token']
prompt_mask = inputs['policy_mask']

enc_embed, dec_embed, prompt_embed = self._encoder_prompt_decoder_network(
src_token=inputs['src_token'],
src_mask=inputs['src_mask'],
tgt_token=inputs['tgt_token'][:, :-1],
tgt_mask=inputs['tgt_mask'][:, :-1],
prompt_token=prompt_token,
prompt_mask=prompt_mask,
src_pos=inputs['src_pos'],
src_type=inputs['src_type'],
src_turn=inputs['src_turn'],
tgt_pos=inputs['tgt_pos'][:, :-1],
tgt_type=inputs['tgt_type'][:, :-1],
tgt_turn=inputs['tgt_turn'][:, :-1])
else:
enc_embed, dec_embed = self._encoder_decoder_network(
src_token=inputs['src_token'],
src_mask=inputs['src_mask'],
tgt_token=inputs['tgt_token'][:, :-1],
tgt_mask=inputs['tgt_mask'][:, :-1],
src_pos=inputs['src_pos'],
src_type=inputs['src_type'],
src_turn=inputs['src_turn'],
tgt_pos=inputs['tgt_pos'][:, :-1],
tgt_type=inputs['tgt_type'][:, :-1],
tgt_turn=inputs['tgt_turn'][:, :-1])

outputs['dec_probs'] = self._dec_head(dec_embed=dec_embed)
return outputs

def _collect_metrics(self, inputs, outputs, with_label, data_file):

metrics = {}
loss = 0.

label = inputs['tgt_token'][:, 1:]
token_num = torch.sum(torch.sum(inputs['tgt_mask'], dim=1) - 1)
nll = self.nll_loss(
torch.log(outputs['dec_probs'] + 1e-12).permute(0, 2, 1), label)
nll = torch.sum(nll, dim=1)
token_nll = torch.sum(nll) / token_num
nll = torch.mean(nll)
metrics['nll'] = nll
metrics['token_nll'] = token_nll
metrics['token_num'] = token_num
loss = loss + (token_nll if self.token_loss else nll)

metrics['loss'] = loss
if self.gpu > 1:
return nll, token_nll, token_num
else:
return metrics

def _optimize(self, loss, do_update=False, optimizer=None):
""" Optimize loss function and update model. """
assert optimizer is not None

if self.gradient_accumulation_steps > 1:
loss = loss / self.gradient_accumulation_steps

loss.backward()

if self.grad_clip is not None and self.grad_clip > 0:
torch.nn.utils.clip_grad_norm_(
parameters=self.parameters(), max_norm=self.grad_clip)

if do_update:
optimizer.step()
optimizer.zero_grad()

return

def _init_state(self,
src_token,
src_mask,
src_pos=None,
src_type=None,
src_turn=None):
""" Initialize decode state. """
state = {}
batch_size = src_token.shape[0]

src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
src_embed = self.embed_layer_norm(src_embed)

mask = self._create_mask(src_mask, append_head=False)

enc_out = src_embed

cache = {}
for _l, layer in enumerate(self.layers):
cache[f'layer_{_l}'] = {}
enc_out = layer(enc_out, mask, cache[f'layer_{_l}'])

state['cache'] = cache
state['mask'] = mask[:, :1]
state['batch_size'] = batch_size
shape = [batch_size, 1, 1]
state['pred_mask'] = torch.ones(shape, dtype=torch.float32)
state['pred_pos'] = torch.zeros(shape, dtype=torch.int64)
state['pred_type'] = torch.zeros(shape, dtype=torch.int64)
state['pred_turn'] = torch.zeros(shape, dtype=torch.int64)
if self.use_gpu:
state['pred_mask'] = state['pred_mask'].cuda()
state['pred_pos'] = state['pred_pos'].cuda()
state['pred_type'] = state['pred_type'].cuda()
state['pred_turn'] = state['pred_turn'].cuda()

return state

def _init_prompt_state(self,
src_token,
src_mask,
prompt_token,
prompt_mask,
src_pos=None,
src_type=None,
src_turn=None,
prompt_pos=None,
prompt_type=None,
prompt_turn=None):
""" Initialize decode state. """
state = {}
batch_size = src_token.shape[0]

src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
prompt_embed = self.embedder(prompt_token, prompt_pos, prompt_type,
prompt_turn)
embed = torch.cat([src_embed, prompt_embed], dim=1)
embed = self.embed_layer_norm(embed)
enc_out = embed

enc_mask = self._create_mask(src_mask, auto_regressive=False)
dec_mask = self._create_mask(prompt_mask, auto_regressive=True)
mask = self._join_mask(enc_mask, dec_mask)

cache = {}
for _l, layer in enumerate(self.layers):
cache[f'layer_{_l}'] = {}
enc_out = layer(enc_out, mask, cache[f'layer_{_l}'])

state['cache'] = cache
state['mask'] = mask[:, -1:] # state["mask"] = mask[:, :1]
state['batch_size'] = batch_size
shape = [batch_size, 1, 1]
state['pred_mask'] = torch.ones(shape, dtype=torch.float32)
state['pred_pos'] = torch.zeros(shape, dtype=torch.int64)
state['pred_type'] = torch.zeros(shape, dtype=torch.int64)
state['pred_turn'] = torch.zeros(shape, dtype=torch.int64)
if self.use_gpu:
state['pred_mask'] = state['pred_mask'].cuda()
state['pred_pos'] = state['pred_pos'].cuda()
state['pred_type'] = state['pred_type'].cuda()
state['pred_turn'] = state['pred_turn'].cuda()

return state

def _decode(self, state):
""" Decoding one time stamp. """

# shape: [batch_size, 1, seq_len]
mask = state['mask']

# shape: [batch_size, 1, 1]
pred_token = state['pred_token']
pred_mask = state['pred_mask']
pred_pos = state['pred_pos']
pred_type = state['pred_type']
pred_turn = state['pred_turn']

# list of shape(len: num_layers): [batch_size, seq_len, hidden_dim]
cache = state['cache']

pred_embed = self.embedder(pred_token, pred_pos, pred_type,
pred_turn).squeeze(-2)
pred_embed = self.embed_layer_norm(pred_embed)

# shape: [batch_size, 1, seq_len + 1]
mask = torch.cat([mask, 1 - pred_mask], dim=2)

# shape: [batch_size, 1, hidden_dim]
for _l, layer in enumerate(self.layers):
pred_embed = layer(pred_embed, mask, cache[f'layer_{_l}'])

# shape: [batch_size, vocab_size]
pred_probs = self._dec_head(dec_embed=pred_embed[:, 0])
pred_logits = torch.log(pred_probs)

state['mask'] = mask
return pred_logits, state

def _infer(self,
inputs,
start_id=None,
eos_id=None,
max_gen_len=None,
prev_input=None):
""" Real inference process of model. """

def cat(x, y, dim=1):
return torch.cat([x, y], dim=dim)

# Initial decode state.
if self.understand or self.policy:
if self.understand:
prompt_token = inputs['understand_token']
prompt_mask = inputs['understand_mask']
if self.policy:
prompt_token = cat(prompt_token, inputs['policy_token'])
prompt_mask = cat(prompt_mask, inputs['policy_mask'])
else:
prompt_token = inputs['policy_token']
prompt_mask = inputs['policy_mask']

state = self._init_prompt_state(
src_token=inputs['src_token'],
src_mask=inputs['src_mask'],
prompt_token=prompt_token,
prompt_mask=prompt_mask,
src_pos=inputs['src_pos'],
src_type=inputs['src_type'],
src_turn=inputs['src_turn'])
else:
state = self._init_state(
src_token=inputs['src_token'],
src_mask=inputs['src_mask'],
src_pos=inputs['src_pos'],
src_type=inputs['src_type'],
src_turn=inputs['src_turn'])

# Generation process.
gen_results = self.generator(
step_fn=self._decode,
state=state,
start_id=start_id,
eos_id=eos_id,
max_gen_len=max_gen_len,
prev_input=prev_input)

outputs = gen_results['preds']
return outputs


GenUnifiedTransformer.register('GenUnifiedTransformer')

+287 -0 modelscope/models/nlp/space/model/generator.py

@@ -0,0 +1,287 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import math

import numpy as np
import torch


def repeat(var, times):
if isinstance(var, list):
return [repeat(x, times) for x in var]
elif isinstance(var, dict):
return {k: repeat(v, times) for k, v in var.items()}
elif isinstance(var, torch.Tensor):
var = var.unsqueeze(1)
expand_times = [1] * len(var.shape)
expand_times[1] = times
dtype = var.dtype
var = var.float()
var = var.repeat(*expand_times)
shape = [var.shape[0] * var.shape[1]] + list(var.shape[2:])
var = var.reshape(*shape)
var = torch.tensor(var, dtype=dtype)
return var
else:
return var


def gather(var, idx):
if isinstance(var, list):
return [gather(x, idx) for x in var]
elif isinstance(var, dict):
return {k: gather(v, idx) for k, v in var.items()}
elif isinstance(var, torch.Tensor):
out = var.index_select(dim=0, index=idx)
return out
else:
return var


class Generator(object):
""" Genrator class. """

_registry = dict()

@classmethod
def register(cls, name):
Generator._registry[name] = cls
return

@staticmethod
def by_name(name):
return Generator._registry[name]

@staticmethod
def create(config, *args, **kwargs):
""" Create generator. """
generator_cls = Generator.by_name(config.Generator.generator)
return generator_cls(config, *args, **kwargs)

def __init__(self, config, reader):
self.vocab_size = reader.vocab_size
self.bos_id = reader.bos_id
self.eos_id = reader.eos_id
self.unk_id = reader.unk_id
self.pad_id = reader.pad_id
self.min_gen_len = config.Generator.min_gen_len
self.max_gen_len = config.Generator.max_gen_len
self.use_gpu = config.use_gpu
assert 1 <= self.min_gen_len <= self.max_gen_len
return

def __call__(self, step_fn, state):
"""
Running generation.

@param : step_fn : decoding one step
@type : function

@param : state : initial state
@type : dict
"""
raise NotImplementedError


class BeamSearch(Generator):
""" BeamSearch generator. """

def __init__(self, config, reader):
super().__init__(config, reader)
self.beam_size = config.Generator.beam_size
self.length_average = config.Generator.length_average
self.length_penalty = config.Generator.length_penalty
self.ignore_unk = config.Generator.ignore_unk
return

def __call__(self,
step_fn,
state,
start_id=None,
eos_id=None,
max_gen_len=None,
prev_input=None):
"""
Running beam search.

@param : step_fn : decoding one step
@type : function

@param : state : initial state
@type : dict
"""
if prev_input is not None:

if isinstance(prev_input, list):
length = max(list(map(lambda x: len(x), prev_input)))
prev_input_numpy = np.full((len(prev_input), length),
self.pad_id)
for i, x in enumerate(prev_input):
prev_input_numpy[i, :len(x)] = x
prev_input_tensor = torch.from_numpy(prev_input_numpy)
if self.use_gpu:
prev_input_tensor = prev_input_tensor.cuda()

for i in range(length):
state['pred_token'] = prev_input_tensor[:, i].unsqueeze(
-1).unsqueeze(-1)
if i != 0:
state['pred_mask'] = torch.not_equal(
state['pred_token'], self.pad_id).float()
state['pred_pos'] = state['pred_pos'] + state[
'pred_mask'].int()
_, state = step_fn(state)
else:
assert isinstance(prev_input, torch.Tensor)
for i, input in enumerate(prev_input):
state['pred_token'] = input.expand(1, 1, 1)
if i != 0:
state['pred_mask'] = torch.not_equal(
state['pred_token'], self.pad_id).float()
state['pred_pos'] = state['pred_pos'] + 1
_, state = step_fn(state)

batch_size = state['batch_size']
beam_size = self.beam_size

# shape: [batch_size, 1]
pos_index = torch.arange(
0, batch_size, 1, dtype=torch.int64) * beam_size
pos_index = pos_index.unsqueeze(1)

# shape: [batch_size, beam_size, 1]
if start_id is None:
start_id = self.bos_id
if eos_id is None:
eos_id = self.eos_id
predictions = torch.ones([batch_size, beam_size, 1],
dtype=torch.int64) * start_id

if self.use_gpu:
pos_index = pos_index.cuda()
predictions = predictions.cuda()

# initial input (start_id)
state['pred_token'] = predictions[:, :1]
if prev_input is not None:
state['pred_mask'] = torch.not_equal(state['pred_token'],
self.pad_id).float()
state['pred_pos'] = state['pred_pos'] + 1

# shape: [batch_size, vocab_size]
scores, state = step_fn(state)

unk_penalty = np.zeros(self.vocab_size, dtype='float32')
unk_penalty[self.unk_id] = -1e10
unk_penalty = torch.from_numpy(unk_penalty)

eos_penalty = np.zeros(self.vocab_size, dtype='float32')
eos_penalty[eos_id] = -1e10
eos_penalty = torch.from_numpy(eos_penalty)

scores_after_end = np.full(self.vocab_size, -1e10, dtype='float32')
# we want <pad> to be generated after <eos>, so the maximum log(p(<pad>)) is 0
scores_after_end[self.pad_id] = 0
scores_after_end = torch.from_numpy(scores_after_end)

if self.use_gpu:
unk_penalty = unk_penalty.cuda()
eos_penalty = eos_penalty.cuda()
scores_after_end = scores_after_end.cuda()

if self.ignore_unk:
scores = scores + unk_penalty
scores = scores + eos_penalty

# shape: [batch_size, beam_size]
sequence_scores, preds = torch.topk(scores, self.beam_size)

predictions = torch.cat([predictions, preds.unsqueeze(2)], dim=2)
state = repeat(state, beam_size)

if max_gen_len is None:
max_gen_len = self.max_gen_len
for step in range(2, max_gen_len + 1):
pre_ids = predictions[:, :, -1:]
state['pred_token'] = pre_ids.reshape(batch_size * beam_size, 1, 1)
state['pred_mask'] = torch.not_equal(state['pred_token'],
self.pad_id).float()
state['pred_pos'] = state['pred_pos'] + 1
scores, state = step_fn(state)

# Generate next
# scores shape: [batch_size * beam_size, vocab_size]
if self.ignore_unk:
scores = scores + unk_penalty

if step <= self.min_gen_len:
scores = scores + eos_penalty

# scores shape: [batch_size, beam_size, vocab_size]
scores = scores.reshape(batch_size, beam_size, self.vocab_size)

# previous token is [PAD] or [EOS]
pre_eos_mask = (1 - torch.not_equal(pre_ids, eos_id).float()) + \
(1 - torch.not_equal(pre_ids, self.pad_id).float())

scores = scores * (1 - pre_eos_mask) + pre_eos_mask.repeat(
1, 1, self.vocab_size) * scores_after_end
if self.length_average:
scaled_value = \
pre_eos_mask + (1 - pre_eos_mask) * (1 - 1 / step)
sequence_scores = sequence_scores.unsqueeze(2) * scaled_value
scaled_value = pre_eos_mask + (1 - pre_eos_mask) * (1 / step)
scores = scores * scaled_value
elif self.length_penalty >= 0.0:
scaled_value = pre_eos_mask + (1 - pre_eos_mask) * \
(math.pow((4 + step) / (5 + step), self.length_penalty))
sequence_scores = scaled_value * sequence_scores
scaled_value = pre_eos_mask + (1 - pre_eos_mask) * \
(math.pow(1 / (5 + step), self.length_penalty))
scores = scores * scaled_value
scores = scores + sequence_scores.unsqueeze(-1)
scores = scores.reshape(batch_size, beam_size * self.vocab_size)

topk_scores, topk_indices = torch.topk(scores, beam_size)
# topk_indices: [batch_size, beam_size * self.vocab_size] (already reshaped)
parent_idx = topk_indices.floor_divide(self.vocab_size)
preds = topk_indices % self.vocab_size

# Gather state / sequence_scores
parent_idx = parent_idx + pos_index
parent_idx = parent_idx.reshape(batch_size * beam_size)
state = gather(state, parent_idx)
sequence_scores = topk_scores

predictions = predictions.reshape(batch_size * beam_size, step)
predictions = gather(predictions, parent_idx)
predictions = predictions.reshape(batch_size, beam_size, step)
predictions = torch.cat([predictions, preds.unsqueeze(2)], dim=2)

# The last token should be <eos> or <pad>
pre_ids = predictions[:, :, -1]
pre_eos_mask = (1 - torch.not_equal(pre_ids, eos_id).float()) + \
(1 - torch.not_equal(pre_ids, self.pad_id).float())
sequence_scores = sequence_scores * pre_eos_mask + (
1 - pre_eos_mask) * (-1e10)

# first get ascending ordered index,then sort "predictions" and "sequence_scores"
indices = torch.argsort(sequence_scores, dim=1)
indices = indices + pos_index
indices = indices.reshape(-1)
sequence_scores = sequence_scores.reshape(batch_size * beam_size)
predictions = predictions.reshape(batch_size * beam_size, -1)
sequence_scores = gather(sequence_scores, indices)
predictions = gather(predictions, indices)
sequence_scores = sequence_scores.reshape(batch_size, beam_size)
predictions = predictions.reshape(batch_size, beam_size, -1)

results = {
'preds': predictions[:, -1],
'scores': sequence_scores[:, -1]
}
return results


BeamSearch.register('BeamSearch')

+197 -0 modelscope/models/nlp/space/model/intent_unified_transformer.py

@@ -0,0 +1,197 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch
import torch.nn as nn
import torch.nn.functional as F

from .....utils.nlp.space.criterions import compute_kl_loss
from .unified_transformer import UnifiedTransformer


class IntentUnifiedTransformer(UnifiedTransformer):
"""
Implement intent unified transformer.
"""

def __init__(self, model_dir, config, reader, generator):
super(IntentUnifiedTransformer, self).__init__(model_dir, config,
reader, generator)
self.example = config.Model.example
self.num_intent = config.Model.num_intent
self.with_rdrop = config.Model.with_rdrop
self.kl_ratio = config.Model.kl_ratio
self.loss_fct = nn.CrossEntropyLoss()
if self.example:
self.loss_fct = nn.NLLLoss()
else:
self.intent_classifier = nn.Linear(self.hidden_dim,
self.num_intent)
self.loss_fct = nn.CrossEntropyLoss()

if self.use_gpu:
self.cuda()
return

def _forward(self, inputs, is_training, with_label):
""" Real forward process of model in different mode(train/test). """

def aug(v):
assert isinstance(v, torch.Tensor)
return torch.cat([v, v], dim=0)

outputs = {}

if self.with_mlm:
mlm_embed = self._encoder_network(
input_token=inputs['mlm_token'],
input_mask=inputs['src_mask'],
input_pos=inputs['src_pos'],
input_type=inputs['src_type'],
input_turn=inputs['src_turn'])
outputs['mlm_probs'] = self._mlm_head(mlm_embed=mlm_embed)

if self.with_rdrop or self.with_contrastive:
enc_embed, dec_embed = self._encoder_decoder_network(
src_token=aug(inputs['src_token']),
src_mask=aug(inputs['src_mask']),
tgt_token=aug(inputs['tgt_token']),
tgt_mask=aug(inputs['tgt_mask']),
src_pos=aug(inputs['src_pos']),
src_type=aug(inputs['src_type']),
src_turn=aug(inputs['src_turn']))
else:
enc_embed, dec_embed = self._encoder_decoder_network(
src_token=inputs['src_token'],
src_mask=inputs['src_mask'],
tgt_token=inputs['tgt_token'],
tgt_mask=inputs['tgt_mask'],
src_pos=inputs['src_pos'],
src_type=inputs['src_type'],
src_turn=inputs['src_turn'])
features = dec_embed[:, -1]
features = self.pooler(features) if self.with_pool else features

if self.example:
assert not self.with_rdrop
ex_enc_embed, ex_dec_embed = self._encoder_decoder_network(
src_token=inputs['example_src_token'],
src_mask=inputs['example_src_mask'],
tgt_token=inputs['example_tgt_token'],
tgt_mask=inputs['example_tgt_mask'],
src_pos=inputs['example_src_pos'],
src_type=inputs['example_src_type'],
src_turn=inputs['example_src_turn'])
ex_features = ex_dec_embed[:, -1]
ex_features = self.pooler(
ex_features) if self.with_pool else ex_features

probs = self.softmax(features.mm(ex_features.t()))
example_intent = inputs['example_intent'].unsqueeze(0)
intent_probs = torch.zeros(probs.size(0), self.num_intent)
intent_probs = intent_probs.cuda(
) if self.use_gpu else intent_probs
intent_probs = intent_probs.scatter_add(
-1, example_intent.repeat(probs.size(0), 1), probs)
outputs['intent_probs'] = intent_probs
else:
intent_logits = self.intent_classifier(features)
outputs['intent_logits'] = intent_logits

if self.with_contrastive:
features = features if self.with_pool else self.pooler(features)
batch_size = features.size(0) // 2
features = \
torch.cat(
[features[:batch_size].unsqueeze(1), features[batch_size:].unsqueeze(1)],
dim=1
)
features = F.normalize(features, dim=-1, p=2)
outputs['features'] = features

return outputs

def _collect_metrics(self, inputs, outputs, with_label, data_file):

metrics = {}
batch_size = inputs['src_token'].size(0)

intent_label = torch.cat([inputs['intent_label'], inputs['intent_label']], dim=0) \
if self.with_rdrop or self.with_contrastive else inputs['intent_label']

if self.example:
intent_loss = self.loss_fct(
torch.log(outputs['intent_probs'] + 1e-12).view(
-1, self.num_intent), intent_label.type(torch.long))
else:
intent_loss = self.loss_fct(
outputs['intent_logits'].view(-1, self.num_intent),
intent_label.type(torch.long))
metrics['intent_loss'] = intent_loss
loss = intent_loss

if self.with_mlm:
mlm_num = torch.sum(torch.sum(inputs['mlm_mask'], dim=1))
mlm = self.nll_loss(
torch.log(outputs['mlm_probs'] + 1e-12).permute(0, 2, 1),
inputs['mlm_label'])
mlm = torch.sum(mlm, dim=1)
token_mlm = torch.sum(mlm) / mlm_num
mlm = torch.mean(mlm)
metrics['mlm'] = mlm
metrics['token_mlm'] = token_mlm
metrics['mlm_num'] = mlm_num
loss = loss + (token_mlm
if self.token_loss else mlm) * self.mlm_ratio
else:
mlm, token_mlm, mlm_num = None, None, None

if self.with_rdrop:
kl = compute_kl_loss(
p=outputs['intent_logits'][:batch_size],
q=outputs['intent_logits'][batch_size:])
metrics['kl'] = kl
loss = loss + kl * self.kl_ratio
else:
kl = None

if self.with_contrastive:
pass
con = None
else:
con = None

metrics['loss'] = loss

if self.gpu > 1:
return intent_loss, mlm, token_mlm, mlm_num, kl, con
else:
return metrics

def _infer(self,
inputs,
start_id=None,
eos_id=None,
max_gen_len=None,
prev_input=None):
""" Real inference process of model. """
results = {}
enc_embed, dec_embed = self._encoder_decoder_network(
src_token=inputs['src_token'],
src_mask=inputs['src_mask'],
tgt_token=inputs['tgt_token'],
tgt_mask=inputs['tgt_mask'],
src_pos=inputs['src_pos'],
src_type=inputs['src_type'],
src_turn=inputs['src_turn'])
features = dec_embed[:, -1]
features = self.pooler(features) if self.with_pool else features
if self.example:
results['features'] = features
else:
intent_logits = self.intent_classifier(features)
intent_probs = self.softmax(intent_logits)
results['intent_probs'] = intent_probs
return results


IntentUnifiedTransformer.register('IntentUnifiedTransformer')

+101 -0 modelscope/models/nlp/space/model/model_base.py

@@ -0,0 +1,101 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

import torch.nn as nn

from .....utils.constant import ModelFile


class SpaceModelBase(nn.Module):
"""
Basic model wrapper for static graph and dygraph.
"""
_registry = dict()

@classmethod
def register(cls, name):
SpaceModelBase._registry[name] = cls
return

@staticmethod
def by_name(name):
return SpaceModelBase._registry[name]

@staticmethod
def create(model_dir, config, *args, **kwargs):
model_cls = SpaceModelBase.by_name(config.Model.model)
return model_cls(model_dir, config, *args, **kwargs)

def __init__(self, model_dir, config):
super(SpaceModelBase, self).__init__()
self.init_checkpoint = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
self.abandon_label = config.Dataset.abandon_label
self.use_gpu = config.use_gpu
self.gpu = config.Trainer.gpu
return

def _create_parameters(self):
""" Create model's paramters. """
raise NotImplementedError

def _forward(self, inputs, is_training, with_label):
""" NO LABEL: Real forward process of model in different mode(train/test). """
raise NotImplementedError

def _collect_metrics(self, inputs, outputs, with_label, data_file):
""" NO LABEL: Calculate loss function by using inputs and outputs. """
raise NotImplementedError

def _optimize(self, loss, optimizer, lr_scheduler):
""" Optimize loss function and update model. """
raise NotImplementedError

def _infer(self, inputs, start_id, eos_id, max_gen_len, prev_input):
""" Real inference process of model. """
raise NotImplementedError

def forward(self,
inputs,
is_training=False,
with_label=False,
data_file=None):
"""
Forward process: run the real forward pass, collect metrics and optionally optimize

@params : inputs : input data
@type : dict of numpy.ndarray/int/float/...
"""
if is_training:
self.train()
else:
self.eval()

with_label = False if self.abandon_label else with_label
outputs = self._forward(inputs, is_training, with_label=with_label)
metrics = self._collect_metrics(
inputs, outputs, with_label=with_label, data_file=data_file)

return metrics

def infer(self,
inputs,
start_id=None,
eos_id=None,
max_gen_len=None,
prev_input=None):
"""
Inference process.

@params : inputs : input data
@type : dict of numpy.ndarray/int/float/...
"""
self.eval()
results = self._infer(
inputs,
start_id=start_id,
eos_id=eos_id,
max_gen_len=max_gen_len,
prev_input=prev_input)
return results
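SpaceModelBase above and Generator (in generator.py) share the same lightweight string registry: each concrete class calls register(<name>) at import time, and SpaceModelBase.create / Generator.create look the class up by the name stored in the configuration (config.Model.model, config.Generator.generator). A self-contained toy version of that mechanism, with purely illustrative names:

class Registry:
    _registry = {}

    @classmethod
    def register(cls, name):
        # Remember which concrete class answers to this name.
        Registry._registry[name] = cls

    @staticmethod
    def create(name, *args, **kwargs):
        # Instantiate whichever class was registered under `name`.
        return Registry._registry[name](*args, **kwargs)


class ToyTransformer(Registry):

    def __init__(self, hidden_dim):
        self.hidden_dim = hidden_dim


ToyTransformer.register('ToyTransformer')
model = Registry.create('ToyTransformer', hidden_dim=8)
assert isinstance(model, ToyTransformer)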

+313 -0 modelscope/models/nlp/space/model/unified_transformer.py

@@ -0,0 +1,313 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..modules.embedder import Embedder
from ..modules.transformer_block import TransformerBlock
from .model_base import SpaceModelBase


class UnifiedTransformer(SpaceModelBase):
"""
Implement unified transformer.
"""

def __init__(self, model_dir, config, reader, generator, dtype='float32'):
super(UnifiedTransformer, self).__init__(model_dir, config)
self.reader = reader
self.generator = generator
self.policy = config.BPETextField.policy
self.generation = config.BPETextField.generation
self.num_token_embeddings = config.Model.num_token_embeddings
self.num_pos_embeddings = config.Model.num_pos_embeddings
self.num_type_embeddings = config.Model.num_type_embeddings
self.num_turn_embeddings = config.Model.num_turn_embeddings
self.temperature = config.Model.temperature
self.hidden_dim = config.Model.hidden_dim
self.num_heads = config.Model.num_heads
self.num_layers = config.Model.num_layers
self.padding_idx = config.Model.padding_idx
self.dropout = config.Model.dropout
self.embed_dropout = config.Model.embed_dropout
self.attn_dropout = config.Model.attn_dropout
self.ff_dropout = config.Model.ff_dropout
self.mlm_ratio = config.Model.mlm_ratio
self.mmd_ratio = config.Model.mmd_ratio
self.pos_trainable = config.Model.pos_trainable
self.label_smooth = config.Model.label_smooth
self.initializer_range = config.Model.initializer_range
self.gradient_accumulation_steps = config.Model.gradient_accumulation_steps
self.token_loss = config.Trainer.token_loss
self.learning_method = config.Dataset.learning_method
self.with_contrastive = config.Dataset.with_contrastive
self.with_query_bow = config.BPETextField.with_query_bow
self.with_resp_bow = config.BPETextField.with_resp_bow
self.with_pool = config.Model.with_pool
self.with_mlm = config.Dataset.with_mlm
self._dtype = dtype

self.embedder = Embedder(
self.hidden_dim,
self.num_token_embeddings,
self.num_pos_embeddings,
self.num_type_embeddings,
self.num_turn_embeddings,
padding_idx=self.padding_idx,
dropout=self.embed_dropout,
pos_trainable=self.pos_trainable)
self.embed_layer_norm = nn.LayerNorm(
normalized_shape=self.hidden_dim,
eps=1e-12,
elementwise_affine=True)

self.layers = nn.ModuleList([
TransformerBlock(self.hidden_dim, self.num_heads, self.dropout,
self.attn_dropout, self.ff_dropout)
for _ in range(config.Model.num_layers)
])

if self.with_mlm:
self.mlm_transform = nn.Sequential(
nn.Linear(self.hidden_dim, self.hidden_dim), nn.GELU(),
nn.LayerNorm(
normalized_shape=self.hidden_dim,
eps=1e-12,
elementwise_affine=True))
self.mlm_bias = nn.Parameter(
torch.zeros(self.num_token_embeddings))

self.pooler = nn.Sequential(
nn.Linear(self.hidden_dim, self.hidden_dim), nn.Tanh())

if self.with_query_bow or self.with_resp_bow:
self.bow_predictor = nn.Linear(
self.hidden_dim, self.num_token_embeddings, bias=False)

self.sigmoid = nn.Sigmoid()
self.softmax = nn.Softmax(dim=-1)
self.bce_loss = nn.BCELoss(reduction='none')
self.nll_loss = nn.NLLLoss(
ignore_index=self.padding_idx, reduction='none')
self._create_parameters()

self.max_grad_norm = config.Model.max_grad_norm
if self.max_grad_norm is not None:
self.grad_clip = self.max_grad_norm
else:
self.grad_clip = None
self.weight_decay = config.Model.weight_decay

if self.use_gpu:
self.cuda()

return

def _create_parameters(self):
""" Create model's paramters. """
sequence_mask = np.tri(
self.num_pos_embeddings,
self.num_pos_embeddings,
dtype=self._dtype)
self.sequence_mask = torch.tensor(sequence_mask)
return

def _create_mask(self,
input_mask,
append_head=False,
auto_regressive=False):
"""
Create attention mask.
from sequence to matrix:[batch_size, max_seq_len, 1] -> [batch_size, max_seq_len, max_seq_len]

@param : input_mask
@type : Variable(shape: [batch_size, max_seq_len])

@param : auto_regressive
@type : bool
"""
seq_len = input_mask.shape[1]

input_mask = input_mask.float()
mask1 = input_mask.unsqueeze(-1).repeat(1, 1, seq_len)
mask2 = mask1.permute(0, 2, 1)
mask = mask1 * mask2

if append_head:
mask = torch.cat([mask[:, :1, :], mask], dim=1)
mask = torch.cat([mask[:, :, :1], mask], dim=2)
seq_len += 1

if auto_regressive:
seq_mask = self.sequence_mask[:seq_len, :seq_len]
seq_mask = seq_mask.to(mask.device)
mask = mask * seq_mask

mask = 1 - mask
return mask

def _join_mask(self, mask1, mask2):
"""
Merge source attention mask and target attention mask.
There are four parts: left upper (lu) / right upper (ru) / left below (lb) / right below (rb)

@param : mask1 : source attention mask
@type : Variable(shape: [batch_size, max_src_len, max_src_len])

@param : mask2 : target attention mask
@type : Variable(shape: [batch_size, max_tgt_len, max_tgt_len])
"""
batch_size = mask1.shape[0]
seq_len1 = mask1.shape[1]
seq_len2 = mask2.shape[1]
# seq_len = seq_len1 + seq_len2

mask_lu = mask1
mask_ru = torch.ones(batch_size, seq_len1, seq_len2)
if self.use_gpu:
mask_ru = mask_ru.cuda()
mask3 = mask2[:, :, :1].repeat(1, 1, seq_len1)
mask4 = mask1[:, :1].repeat(1, seq_len2, 1)
mask_lb = mask3 + mask4 - mask3 * mask4
mask_rb = mask2
mask_u = torch.cat([mask_lu, mask_ru], dim=2)
mask_b = torch.cat([mask_lb, mask_rb], dim=2)
mask = torch.cat([mask_u, mask_b], dim=1)
return mask

def _mlm_head(self, mlm_embed):
mlm_embed = self.mlm_transform(mlm_embed)
mlm_logits = torch.matmul(
mlm_embed, self.embedder.token_embedding.weight.T) + self.mlm_bias
mlm_probs = self.softmax(mlm_logits)
return mlm_probs

def _dec_head(self, dec_embed):
dec_logits = torch.matmul(dec_embed,
self.embedder.token_embedding.weight.T)
dec_probs = self.softmax(dec_logits)
return dec_probs

def _refactor_feature(self, features):
features = self.pooler(features) if self.with_pool else features
batch_size = features.size(0) // 2
features = \
torch.cat(
[features[:batch_size].unsqueeze(1), features[batch_size:].unsqueeze(1)],
dim=1
)
features = F.normalize(features, dim=-1, p=2)
return features

def _encoder_network(self,
input_token,
input_mask,
input_pos=None,
input_type=None,
input_turn=None):
embed = self.embedder(input_token, input_pos, input_type, input_turn)
embed = self.embed_layer_norm(embed)
mask = self._create_mask(input_mask, auto_regressive=False)

for layer in self.layers:
embed = layer(embed, mask, None)

return embed

def _encoder_decoder_network(self,
src_token,
src_mask,
tgt_token,
tgt_mask,
src_pos=None,
src_type=None,
src_turn=None,
tgt_pos=None,
tgt_type=None,
tgt_turn=None):
src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
tgt_embed = self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn)
embed = torch.cat([src_embed, tgt_embed], dim=1)
embed = self.embed_layer_norm(embed)

enc_mask = self._create_mask(src_mask, auto_regressive=False)
dec_mask = self._create_mask(tgt_mask, auto_regressive=True)
mask = self._join_mask(enc_mask, dec_mask)

for layer in self.layers:
embed = layer(embed, mask, None)

tgt_len = tgt_token.shape[1]
enc_embed = embed[:, :-tgt_len]
dec_embed = embed[:, -tgt_len:]

return enc_embed, dec_embed

def _encoder_prompt_decoder_network(self,
src_token,
src_mask,
tgt_token,
tgt_mask,
prompt_token,
prompt_mask,
src_pos=None,
src_type=None,
src_turn=None,
tgt_pos=None,
tgt_type=None,
tgt_turn=None,
prompt_pos=None,
prompt_type=None,
prompt_turn=None):
src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
tgt_embed = self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn)
prompt_embed = self.embedder(prompt_token, prompt_pos, prompt_type,
prompt_turn)

embed = torch.cat([src_embed, prompt_embed, tgt_embed], dim=1)
embed = self.embed_layer_norm(embed)

enc_mask = self._create_mask(src_mask, auto_regressive=False)
dec_mask = self._create_mask(
torch.cat([prompt_mask, tgt_mask], dim=1), auto_regressive=True)
mask = self._join_mask(enc_mask, dec_mask)

for layer in self.layers:
embed = layer(embed, mask, None)

src_len = src_token.shape[1]
tgt_len = tgt_token.shape[1]
enc_embed = embed[:, :src_len]
dec_embed = embed[:, -tgt_len:]
prompt_embed = embed[:, src_len:-tgt_len]

return enc_embed, dec_embed, prompt_embed

def _optimize(self, loss, optimizer=None, lr_scheduler=None):
""" Optimize loss function and update model. """
assert optimizer is not None
optimizer.zero_grad()
loss.backward()

if self.grad_clip is not None and self.grad_clip > 0:
torch.nn.utils.clip_grad_norm_(
parameters=self.parameters(), max_norm=self.grad_clip)
optimizer.step()
if lr_scheduler is not None:
lr_scheduler.step()
return

def _infer(self,
inputs,
start_id=None,
eos_id=None,
max_gen_len=None,
prev_input=None):
""" Real inference process of model. """
results = {}
return results


UnifiedTransformer.register('UnifiedTransformer')

+ 0
- 0
modelscope/models/nlp/space/modules/__init__.py View File


+ 65
- 0
modelscope/models/nlp/space/modules/embedder.py View File

@@ -0,0 +1,65 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch
import torch.nn as nn


class Embedder(nn.Module):
"""
Composite embedding layer.
"""

def __init__(self,
hidden_dim,
num_token_embeddings,
num_pos_embeddings,
num_type_embeddings,
num_turn_embeddings,
padding_idx=None,
dropout=0.1,
pos_trainable=False):
super(Embedder, self).__init__()

self.token_embedding = nn.Embedding(num_token_embeddings, hidden_dim)
self.pos_embedding = nn.Embedding(num_pos_embeddings, hidden_dim)
self.pos_embedding.weight.requires_grad = pos_trainable
self.type_embedding = nn.Embedding(num_type_embeddings, hidden_dim)
self.turn_embedding = nn.Embedding(num_turn_embeddings, hidden_dim)
self.dropout_layer = nn.Dropout(p=dropout)

# follow the default xavier_uniform initializer of the paddle version;
# otherwise dec_probs computation breaks under the weight-tying setting,
# because nn.Embedding in pytorch defaults to a normal initializer, which samples larger values
nn.init.xavier_uniform_(self.token_embedding.weight)
nn.init.xavier_uniform_(self.pos_embedding.weight)
nn.init.xavier_uniform_(self.type_embedding.weight)
nn.init.xavier_uniform_(self.turn_embedding.weight)
return

def forward(self, token_inp, pos_inp=None, type_inp=None, turn_inp=None):
embed = self.token_embedding(token_inp)
if pos_inp is not None:
embed += self.pos_embedding(pos_inp)
if type_inp is not None:
embed += self.type_embedding(type_inp)
if turn_inp is not None:
embed += self.turn_embedding(turn_inp)
embed = self.dropout_layer(embed)
return embed


def main():
import numpy as np

model = Embedder(10, 20, 20, 20, 20)
token_inp = torch.tensor(
np.random.randint(0, 19, [10, 10]).astype('int64'))
pos_inp = torch.tensor(np.random.randint(0, 19, [10, 10]).astype('int64'))
type_inp = torch.tensor(np.random.randint(0, 19, [10, 10]).astype('int64'))
turn_inp = torch.tensor(np.random.randint(0, 19, [10, 10]).astype('int64'))
out = model(token_inp, pos_inp, type_inp, turn_inp)
print(out)


if __name__ == '__main__':
main()
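
To make the initializer comment above concrete, a small standalone check (not part of the diff): with tied output weights, logits computed against PyTorch's default normal-initialized nn.Embedding come out much larger than against a xavier_uniform one, which appears to be the failure mode the comment refers to:

import torch
import torch.nn as nn

vocab, dim = 1000, 64
hidden = torch.randn(4, dim)

default_emb = nn.Embedding(vocab, dim)        # default N(0, 1) initialization
xavier_emb = nn.Embedding(vocab, dim)
nn.init.xavier_uniform_(xavier_emb.weight)

# with weight tying, the output projection reuses the embedding matrix
print(torch.matmul(hidden, default_emb.weight.T).std())   # roughly ~8 for dim=64
print(torch.matmul(hidden, xavier_emb.weight.T).std())    # well below 1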

+ 41
- 0
modelscope/models/nlp/space/modules/feedforward.py View File

@@ -0,0 +1,41 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch
import torch.nn as nn


class FeedForward(nn.Module):
"""
Position-wise feed-forward layer.
"""

def __init__(self, hidden_dim, inner_dim, dropout):
super(FeedForward, self).__init__()

self.hidden_dim = hidden_dim
self.inner_dim = inner_dim
self.linear_hidden = nn.Sequential(
nn.Linear(hidden_dim, inner_dim), nn.GELU())
self.linear_out = nn.Linear(inner_dim, hidden_dim)
self.dropout_layer = nn.Dropout(p=dropout)
return

def forward(self, x):
out = self.linear_hidden(x)
out = self.dropout_layer(out)
out = self.linear_out(out)
return out


def main():
import numpy as np

model = FeedForward(10, 20, 0.5)
inp = np.random.rand(2, 3, 10).astype('float32')
inp = torch.tensor(inp)
out = model(inp)
print(out)


if __name__ == '__main__':
main()

+ 62
- 0
modelscope/models/nlp/space/modules/functions.py View File

@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import numpy as np
import torch
import torch.nn.functional as F


def unsqueeze(input, dims):
""" Implement multi-dimension unsqueeze function. """
if isinstance(dims, (list, tuple)):
dims = [
dim if dim >= 0 else dim + len(input.shape) + 1 for dim in dims
]
dims = sorted(dims, reverse=True)
shape = list(input.shape)
for dim in dims:
shape.insert(dim, 1)
return torch.reshape(input, shape)
elif isinstance(dims, int):
return input.unsqueeze(dims)
else:
raise ValueError('type(dims) must be one of (list, tuple, int)!')


def gumbel_softmax(input, tau=1, eps=1e-10):
""" Basic implement of gumbel_softmax. """
U = torch.tensor(np.random.rand(*input.shape))
gumbel = 0.0 - torch.log(eps - torch.log(U + eps))
y = input + gumbel
return F.softmax(y / tau, dim=-1)


def equal(x, y, dtype=None):
""" Implement equal in dygraph mode. (paddle) """
if dtype is None:
dtype = 'float32'
if isinstance(x, torch.Tensor):
x = x.numpy()
if isinstance(y, torch.Tensor):
y = y.numpy()
out = np.equal(x, y).astype(dtype)
return torch.tensor(out)


def not_equal(x, y, dtype=None):
""" Implement not_equal in dygraph mode. (paddle) """
return 1 - equal(x, y, dtype)


if __name__ == '__main__':
a = torch.tensor([[1, 1], [3, 4]])
b = torch.tensor([[1, 1], [3, 4]])
c = torch.equal(a, a)
c1 = equal(a, 3)
d = 1 - torch.not_equal(a, 3).float()
print(c)
print(c1)
print(d)
e = F.gumbel_softmax(a.float())
f = a.unsqueeze(0)
g = unsqueeze(a, dims=[0, 0, 1])
print(g, g.shape)

+ 105
- 0
modelscope/models/nlp/space/modules/multihead_attention.py View File

@@ -0,0 +1,105 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch
import torch.nn as nn


class MultiheadAttention(nn.Module):
"""
Multi head attention layer.
"""

def __init__(self, hidden_dim, num_heads, dropout):
assert hidden_dim % num_heads == 0
super(MultiheadAttention, self).__init__()

self.hidden_dim = hidden_dim
self.num_heads = num_heads
self.head_dim = hidden_dim // num_heads
self.scale = self.head_dim**-0.5
self.linear_qkv = nn.Linear(hidden_dim, hidden_dim * 3)
self.linear_out = nn.Linear(hidden_dim, hidden_dim)
self.dropout_layer = nn.Dropout(p=dropout)
self.softmax = nn.Softmax(dim=-1)
return

def _split_heads(self, x, is_key=False):
x = x.reshape(x.size(0), x.size(1), self.num_heads, self.head_dim)
x = x.permute(0, 2, 3, 1) if is_key else x.permute(0, 2, 1, 3)
return x

def _merge_heads(self, x):
x = x.permute(0, 2, 1, 3)
x = x.reshape(x.size(0), x.size(1), self.hidden_dim)
return x

def _attn(self, query, key, value, mask):
# shape: [batch_size, num_head, seq_len, seq_len]
scores = torch.matmul(query, key)
scores = scores * self.scale

if mask is not None:
mask = mask.unsqueeze(1)
mask = mask.repeat(1, self.num_heads, 1, 1)
scores.masked_fill_(
mask.bool(),
float('-inf')) # scores = (1 - mask) * scores + mask * (-1e10)

attn = self.softmax(scores)
attn = self.dropout_layer(attn)

if mask is not None:
'''
mask: [batch size, num_heads, seq_len, seq_len]

>>> F.softmax([-1e10, -100, -100])
>>> [0.00, 0.50, 0.50]
>>> F.softmax([-1e10, -1e10, -1e10])
>>> [0.33, 0.33, 0.33]
==> [0.00, 0.00, 0.00]
'''
attn.masked_fill_(mask.bool(), 0.) # attn = (1 - mask) * attn

out = torch.matmul(attn, value)
return out

def forward(self, inp, mask=None, cache=None):
""" Forward process of self attention. """
# shape: [batch_size, seq_len, 3 * hidden_dim]
qkv = self.linear_qkv(inp)
query, key, value = torch.split(qkv, self.hidden_dim, dim=2)

# shape: [batch_size, num_head, seq_len, head_dim]
query = self._split_heads(query)
# shape: [batch_size, num_head, head_dim, seq_len]
key = self._split_heads(key, is_key=True)
# shape: [batch_size, num_head, seq_len, head_dim]
value = self._split_heads(value)

if cache is not None:
if 'key' in cache and 'value' in cache:
key = torch.cat([cache['key'], key], dim=3)
value = torch.cat([cache['value'], value], dim=2)
cache['key'] = key
cache['value'] = value

out = self._attn(query, key, value, mask)
out = self._merge_heads(out)
out = self.linear_out(out)
return out


def main():
import numpy as np

model = MultiheadAttention(10, 2, 0.5)
inp = np.random.rand(2, 3, 10).astype('float32')
inp = torch.tensor(inp)
mask = (np.random.rand(2, 3, 3) > 0.5).astype('float32')
mask = torch.tensor(mask)
out = model(inp, mask=mask, cache=None)
print(out)


if __name__ == '__main__':
main()
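
As a usage illustration (mine, not part of the diff), the cache dict can be threaded through successive calls so that keys/values of earlier steps are reused; note that key is cached as [batch, heads, head_dim, seen_len] and value as [batch, heads, seen_len, head_dim], matching the concat dims above:

import numpy as np
import torch

from modelscope.models.nlp.space.modules.multihead_attention import MultiheadAttention

model = MultiheadAttention(hidden_dim=10, num_heads=2, dropout=0.0)
model.eval()

cache = {}
full = torch.tensor(np.random.rand(1, 4, 10).astype('float32'))
step_outputs = []
with torch.no_grad():
    for t in range(full.size(1)):
        step = full[:, t:t + 1, :]                   # feed one new position per call
        step_outputs.append(model(step, mask=None, cache=cache))
print(cache['key'].shape, cache['value'].shape)      # torch.Size([1, 2, 5, 4]) torch.Size([1, 2, 4, 5])
print(torch.cat(step_outputs, dim=1).shape)          # torch.Size([1, 4, 10])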

+ 70
- 0
modelscope/models/nlp/space/modules/transformer_block.py View File

@@ -0,0 +1,70 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch
import torch.nn as nn

from .feedforward import FeedForward
from .multihead_attention import MultiheadAttention


class TransformerBlock(nn.Module):
"""
Transformer block module.
"""

def __init__(self, hidden_dim, num_heads, dropout, attn_dropout,
ff_dropout):
super(TransformerBlock, self).__init__()

self.attn = MultiheadAttention(
hidden_dim=hidden_dim, num_heads=num_heads, dropout=attn_dropout)
self.attn_norm = nn.LayerNorm(
normalized_shape=hidden_dim, eps=1e-12, elementwise_affine=True)
self.ff = FeedForward(
hidden_dim=hidden_dim,
inner_dim=4 * hidden_dim,
dropout=ff_dropout)
self.ff_norm = nn.LayerNorm(
normalized_shape=hidden_dim, eps=1e-12, elementwise_affine=True)
self.dropout_layer = nn.Dropout(p=dropout)
return

def forward(self, inp, mask=None, cache=None):
"""
Forward process on one transformer layer.

@param : inp
@type : Variable(shape: [batch_size, seq_len, hidden_size])

@param : mask

@param : cache
"""
attn_out = self.attn(inp, mask, cache)
attn_out = self.dropout_layer(attn_out)
attn_out = self.attn_norm(attn_out + inp)

ff_out = self.ff(attn_out)
ff_out = self.dropout_layer(ff_out)
ff_out = self.ff_norm(ff_out + attn_out)

return ff_out


def main():
import numpy as np

model = TransformerBlock(10, 2, 0.5, 0.5, 0.5)
inp = np.random.rand(2, 3, 10).astype('float32')
inp = torch.tensor(inp)
mask = (np.random.rand(2, 3, 3) > 0.5).astype('float32')
mask = torch.tensor(mask)
out = model(inp, mask=mask, cache=None)
print(out)


if __name__ == '__main__':
main()

+ 11
- 0
modelscope/pipelines/builder.py View File

@@ -21,6 +21,12 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.sentence_similarity:
(Pipelines.sentence_similarity,
'damo/nlp_structbert_sentence-similarity_chinese-base'),
Tasks.nli: (Pipelines.nli, 'damo/nlp_structbert_nli_chinese-base'),
Tasks.sentiment_classification:
(Pipelines.sentiment_classification,
'damo/nlp_structbert_sentiment-classification_chinese-base'),
Tasks.text_classification: ('bert-sentiment-analysis',
'damo/bert-base-sst2'),
Tasks.image_matting: (Pipelines.image_matting,
'damo/cv_unet_image-matting'),
Tasks.text_classification: (Pipelines.sentiment_analysis,
@@ -30,6 +36,11 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.zero_shot_classification:
(Pipelines.zero_shot_classification,
'damo/nlp_structbert_zero-shot-classification_chinese-base'),
Tasks.dialog_intent_prediction:
(Pipelines.dialog_intent_prediction,
'damo/nlp_space_dialog-intent-prediction'),
Tasks.dialog_modeling: (Pipelines.dialog_modeling,
'damo/nlp_space_dialog-modeling'),
Tasks.image_captioning: (Pipelines.image_caption,
'damo/ofa_image-caption_coco_large_en'),
Tasks.image_generation:

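For orientation (not part of the diff): with these default-model entries registered, the new tasks can be built from a task name alone. A rough usage sketch, assuming the standard pipeline() factory and that the listed checkpoints are available on the model hub:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# each task resolves to the default model registered above
nli = pipeline(task=Tasks.nli)                         # damo/nlp_structbert_nli_chinese-base
senti = pipeline(task=Tasks.sentiment_classification)

print(nli(('一个女孩在弹钢琴。', '一个女孩在演奏乐器。')))    # {'scores': [...], 'labels': [...]}
print(senti('这家餐厅的服务太棒了!'))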

+ 4
- 0
modelscope/pipelines/nlp/__init__.py View File

@@ -1,6 +1,10 @@
try:
from .dialog_intent_prediction_pipeline import * # noqa F403
from .dialog_modeling_pipeline import * # noqa F403
from .fill_mask_pipeline import * # noqa F403
from .nli_pipeline import * # noqa F403
from .sentence_similarity_pipeline import * # noqa F403
from .sentiment_classification_pipeline import * # noqa F403
from .sequence_classification_pipeline import * # noqa F403
from .text_generation_pipeline import * # noqa F403
from .word_segmentation_pipeline import * # noqa F403


+ 53
- 0
modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py View File

@@ -0,0 +1,53 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from typing import Any, Dict

import numpy as np

from ...metainfo import Pipelines
from ...models.nlp import SpaceForDialogIntent
from ...preprocessors import DialogIntentPredictionPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline
from ..builder import PIPELINES
from ..outputs import OutputKeys

__all__ = ['DialogIntentPredictionPipeline']


@PIPELINES.register_module(
Tasks.dialog_intent_prediction,
module_name=Pipelines.dialog_intent_prediction)
class DialogIntentPredictionPipeline(Pipeline):

def __init__(self, model: SpaceForDialogIntent,
preprocessor: DialogIntentPredictionPreprocessor, **kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SequenceClassificationModel): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""

super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.model = model
self.categories = preprocessor.categories

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): model outputs containing the intent probability vector under the key 'pred'

Returns:
Dict[str, str]: the prediction results
"""
pred = inputs['pred']
pos = np.where(pred == np.max(pred))

result = {
OutputKeys.PREDICTION: pred,
OutputKeys.LABEL_POS: pos[0],
OutputKeys.LABEL: self.categories[pos[0][0]]
}

return result

+ 49
- 0
modelscope/pipelines/nlp/dialog_modeling_pipeline.py View File

@@ -0,0 +1,49 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from typing import Any, Dict, Optional

from ...metainfo import Pipelines
from ...models.nlp import SpaceForDialogModeling
from ...preprocessors import DialogModelingPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES
from ..outputs import OutputKeys

__all__ = ['DialogModelingPipeline']


@PIPELINES.register_module(
Tasks.dialog_modeling, module_name=Pipelines.dialog_modeling)
class DialogModelingPipeline(Pipeline):

def __init__(self, model: SpaceForDialogModeling,
preprocessor: DialogModelingPreprocessor, **kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SequenceClassificationModel): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""

super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.model = model
self.preprocessor = preprocessor

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Tensor]): model outputs containing the generated response token ids under the key 'resp'

Returns:
Dict[str, str]: the prediction results
"""
sys_rsp = self.preprocessor.text_field.tokenizer.convert_ids_to_tokens(
inputs['resp'])
assert len(sys_rsp) > 2
sys_rsp = sys_rsp[1:len(sys_rsp) - 1]

inputs[OutputKeys.RESPONSE] = sys_rsp

return inputs

+ 15
- 3
modelscope/pipelines/nlp/fill_mask_pipeline.py View File

@@ -1,5 +1,7 @@
import os
from typing import Dict, Optional, Union
from typing import Any, Dict, Optional, Union

import torch


from ...metainfo import Pipelines
from ...models import Model
@@ -21,6 +23,7 @@ class FillMaskPipeline(Pipeline):
def __init__(self,
model: Union[MaskedLanguageModelBase, str],
preprocessor: Optional[FillMaskPreprocessor] = None,
first_sequence='sentense',
**kwargs):
"""use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction


@@ -30,12 +33,16 @@ class FillMaskPipeline(Pipeline):
"""
fill_mask_model = model if isinstance(
model, MaskedLanguageModelBase) else Model.from_pretrained(model)

if preprocessor is None:
preprocessor = FillMaskPreprocessor(
fill_mask_model.model_dir,
first_sequence='sentence',
first_sequence=first_sequence,
second_sequence=None)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
fill_mask_model.eval()
super().__init__(
model=fill_mask_model, preprocessor=preprocessor, **kwargs)

self.preprocessor = preprocessor
self.config = Config.from_file(
os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
@@ -63,6 +70,11 @@ class FillMaskPipeline(Pipeline):
}
}


def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""process the prediction results




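A pattern worth noting across this commit: pipelines now call model.eval() before super().__init__ and wrap forward in torch.no_grad(). A minimal, self-contained sketch of the same inference-mode pattern (illustrative only, not ModelScope code):

import torch
import torch.nn as nn

class TinyPipeline:
    def __init__(self, model: nn.Module):
        model.eval()                   # disable dropout / use running batch-norm statistics
        self.model = model

    def forward(self, inputs):
        with torch.no_grad():          # skip autograd bookkeeping at inference time
            return self.model(inputs)

p = TinyPipeline(nn.Sequential(nn.Linear(4, 4), nn.Dropout(0.5)))
print(p.forward(torch.ones(1, 4)))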
+ 73
- 0
modelscope/pipelines/nlp/nli_pipeline.py View File

@@ -0,0 +1,73 @@
from typing import Any, Dict, Union

import numpy as np
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForNLI
from ...preprocessors import NLIPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline
from ..builder import PIPELINES
from ..outputs import OutputKeys

__all__ = ['NLIPipeline']


@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli)
class NLIPipeline(Pipeline):

def __init__(self,
model: Union[SbertForNLI, str],
preprocessor: NLIPreprocessor = None,
first_sequence='first_sequence',
second_sequence='second_sequence',
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SbertForNLI): a model instance
preprocessor (NLIPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForNLI), \
'model must be a single str or SbertForNLI'
model = model if isinstance(
model, SbertForNLI) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = NLIPreprocessor(
model.model_dir,
first_sequence=first_sequence,
second_sequence=second_sequence)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
assert len(model.id2label) > 0

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self,
inputs: Dict[str, Any],
topk: int = 5) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): model outputs containing the class probabilities under 'probabilities'
topk (int): number of top-scoring labels to return

Returns:
Dict[str, str]: the prediction results
"""

probs = inputs['probabilities'][0]
num_classes = probs.shape[0]
topk = min(topk, num_classes)
top_indices = np.argpartition(probs, -topk)[-topk:]
cls_ids = top_indices[np.argsort(probs[top_indices])]
probs = probs[cls_ids].tolist()

cls_names = [self.model.id2label[cid] for cid in cls_ids]

return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names}
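
A usage sketch for the class above (mine, not from the diff): the preprocessor expects a (premise, hypothesis) tuple, and postprocess returns the top-k labels with their scores, highest score last:

from modelscope.pipelines.nlp import NLIPipeline

# model id as registered in builder.py; an already-loaded SbertForNLI instance also works
nli = NLIPipeline('damo/nlp_structbert_nli_chinese-base')
result = nli(('四个孩子在草地上玩耍。', '孩子们在户外。'))   # (premise, hypothesis)
print(result['labels'], result['scores'])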

+ 18
- 8
modelscope/pipelines/nlp/sentence_similarity_pipeline.py View File

@@ -1,12 +1,13 @@
from typing import Any, Dict, Union


import numpy as np
import torch


from modelscope.metainfo import Pipelines
from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForSentenceSimilarity
from ...preprocessors import SequenceClassificationPreprocessor
from ...utils.constant import Tasks
from ..base import Input, Pipeline
from ..builder import PIPELINES
from ..outputs import OutputKeys
@@ -19,8 +20,10 @@ __all__ = ['SentenceSimilarityPipeline']
class SentenceSimilarityPipeline(Pipeline):


def __init__(self,
model: Union[SbertForSentenceSimilarity, str],
model: Union[Model, str],
preprocessor: SequenceClassificationPreprocessor = None,
first_sequence='first_sequence',
second_sequence='second_sequence',
**kwargs):
"""use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction


@@ -36,14 +39,21 @@ class SentenceSimilarityPipeline(Pipeline):
if preprocessor is None:
preprocessor = SequenceClassificationPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
first_sequence=first_sequence,
second_sequence=second_sequence)
sc_model.eval()
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)


assert hasattr(self.model, 'id2label'), \
'id2label map should be initalizaed in init function.'


def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Any],
**postprocess_params) -> Dict[str, str]:
"""process the prediction results


Args:


+ 78
- 0
modelscope/pipelines/nlp/sentiment_classification_pipeline.py View File

@@ -0,0 +1,78 @@
from typing import Any, Dict, Union

import numpy as np
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForSentimentClassification
from ...preprocessors import SentimentClassificationPreprocessor
from ...utils.constant import Tasks
from ..base import Input, Pipeline
from ..builder import PIPELINES
from ..outputs import OutputKeys

__all__ = ['SentimentClassificationPipeline']


@PIPELINES.register_module(
Tasks.sentiment_classification,
module_name=Pipelines.sentiment_classification)
class SentimentClassificationPipeline(Pipeline):

def __init__(self,
model: Union[SbertForSentimentClassification, str],
preprocessor: SentimentClassificationPreprocessor = None,
first_sequence='first_sequence',
second_sequence='second_sequence',
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SbertForSentimentClassification): a model instance
preprocessor (SentimentClassificationPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForSentimentClassification), \
'model must be a single str or SbertForSentimentClassification'
model = model if isinstance(
model,
SbertForSentimentClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = SentimentClassificationPreprocessor(
model.model_dir,
first_sequence=first_sequence,
second_sequence=second_sequence)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
assert len(model.id2label) > 0

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self,
inputs: Dict[str, Any],
topk: int = 5) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): model outputs containing the class probabilities under 'probabilities'
topk (int): number of top-scoring labels to return

Returns:
Dict[str, str]: the prediction results
"""

probs = inputs['probabilities'][0]
num_classes = probs.shape[0]
topk = min(topk, num_classes)
top_indices = np.argpartition(probs, -topk)[-topk:]
cls_ids = top_indices[np.argsort(probs[top_indices])]
probs = probs[cls_ids].tolist()

cls_names = [self.model.id2label[cid] for cid in cls_ids]

return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names}

+ 16
- 7
modelscope/pipelines/nlp/text_generation_pipeline.py View File

@@ -1,10 +1,12 @@
from typing import Dict, Optional, Union
from typing import Any, Dict, Optional, Union


from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import PalmForTextGeneration
from ...preprocessors import TextGenerationPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES
from ..outputs import OutputKeys
@@ -34,10 +36,17 @@ class TextGenerationPipeline(Pipeline):
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = model.tokenizer


def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Tensor],
**postprocess_params) -> Dict[str, str]:
"""process the prediction results


Args:


+ 18
- 8
modelscope/pipelines/nlp/word_segmentation_pipeline.py View File

@@ -1,10 +1,12 @@
from typing import Any, Dict, Optional, Union


from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForTokenClassification
from ...preprocessors import TokenClassifcationPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES
from ..outputs import OutputKeys
@@ -17,7 +19,7 @@ __all__ = ['WordSegmentationPipeline']
class WordSegmentationPipeline(Pipeline):


def __init__(self,
model: Union[StructBertForTokenClassification, str],
model: Union[SbertForTokenClassification, str],
preprocessor: Optional[TokenClassifcationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction
@@ -28,15 +30,23 @@ class WordSegmentationPipeline(Pipeline):
"""
model = model if isinstance(
model,
StructBertForTokenClassification) else Model.from_pretrained(model)
SbertForTokenClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TokenClassifcationPreprocessor(model.model_dir)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
self.config = model.config
assert len(self.config.id2label) > 0
self.id2label = self.config.id2label


def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Any],
**postprocess_params) -> Dict[str, str]:
"""process the prediction results


Args:


+ 44
- 0
modelscope/pipelines/outputs.py View File

@@ -5,7 +5,9 @@ from modelscope.utils.constant import Tasks


class OutputKeys(object):
SCORES = 'scores'
LABEL = 'label'
LABELS = 'labels'
LABEL_POS = 'label_pos'
POSES = 'poses'
CAPTION = 'caption'
BOXES = 'boxes'
@@ -16,6 +18,8 @@ class OutputKeys(object):
OUTPUT_PCM = 'output_pcm'
IMG_EMBEDDING = 'img_embedding'
TEXT_EMBEDDING = 'text_embedding'
RESPONSE = 'response'
PREDICTION = 'prediction'




TASK_OUTPUTS = {
@@ -119,6 +123,13 @@ TASK_OUTPUTS = {
# }
Tasks.sentence_similarity: [OutputKeys.SCORES, OutputKeys.LABELS],


# sentiment classification result for single sample
# {
# "labels": ["happy", "sad", "calm", "angry"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.sentiment_classification: [OutputKeys.SCORES, OutputKeys.LABELS],

# zero-shot classification result for single sample
# {
# "scores": [0.9, 0.1, 0.05, 0.05]
@@ -126,6 +137,39 @@ TASK_OUTPUTS = {
# }
Tasks.zero_shot_classification: [OutputKeys.SCORES, OutputKeys.LABELS],


# nli result for single sample
# {
# "labels": ["happy", "sad", "calm", "angry"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.nli: [OutputKeys.SCORES, OutputKeys.LABELS],

# {'pred': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05,
# 1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04,
# 6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01,
# 2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05,
# 2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05,
# 7.34537680e-05, 6.61411541e-05, 3.62534920e-05, 8.58885178e-05,
# 8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05,
# 5.97518992e-05, 3.92273068e-05, 3.44069012e-05, 9.92335918e-05,
# 9.25978165e-05, 6.26462061e-05, 3.32317031e-05, 1.32061413e-03,
# 2.01607945e-05, 3.36636294e-05, 3.99156743e-05, 5.84108493e-05,
# 2.53432900e-05, 4.95731190e-04, 2.64443643e-05, 4.46992999e-05,
# 2.42672231e-05, 4.75615161e-05, 2.66230145e-05, 4.00083954e-05,
# 2.90536875e-04, 4.23891543e-05, 8.63691166e-05, 4.98188965e-05,
# 3.47019341e-05, 4.52718523e-05, 4.20905781e-05, 5.50173208e-05,
# 4.92360487e-05, 3.56021264e-05, 2.13957210e-05, 6.17428886e-05,
# 1.43893281e-04, 7.32152112e-05, 2.91354867e-04, 2.46623786e-05,
# 3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05,
# 4.31488479e-05, 4.94503947e-05, 4.30105974e-05, 1.00963116e-04,
# 2.82062047e-05, 1.15582036e-04, 4.48261271e-05, 3.99339879e-05,
# 7.27692823e-05], dtype=float32), 'label_pos': array([11]), 'label': 'lost_or_stolen_card'}
Tasks.dialog_intent_prediction:
[OutputKeys.PREDICTION, OutputKeys.LABEL_POS, OutputKeys.LABEL],

# sys : ['you', 'are', 'welcome', '.', 'have', 'a', 'great', 'day', '!']
Tasks.dialog_modeling: [OutputKeys.RESPONSE],

# ============ audio tasks ===================


# audio processed for single file in PCM format


+ 2
- 0
modelscope/preprocessors/__init__.py View File

@@ -11,6 +11,8 @@ try:
from .audio import LinearAECAndFbank
from .multi_modal import * # noqa F403
from .nlp import * # noqa F403
from .space.dialog_intent_prediction_preprocessor import * # noqa F403
from .space.dialog_modeling_preprocessor import * # noqa F403
except ModuleNotFoundError as e:
if str(e) == "No module named 'tensorflow'":
pass


+ 140
- 5
modelscope/preprocessors/nlp.py View File

@@ -5,15 +5,16 @@ from typing import Any, Dict, Union


from transformers import AutoTokenizer


from modelscope.metainfo import Preprocessors
from modelscope.utils.constant import Fields, InputFields
from modelscope.utils.type_assert import type_assert
from ..metainfo import Models, Preprocessors
from ..utils.constant import Fields, InputFields
from ..utils.type_assert import type_assert
from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = [
'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor',
'NLIPreprocessor', 'SentimentClassificationPreprocessor',
'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor'
]


@@ -32,6 +33,140 @@ class Tokenize(Preprocessor):
return data return data




@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.nli_tokenizer)
class NLIPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""

super().__init__(*args, **kwargs)

from sofa import SbertTokenizer
self.model_dir: str = model_dir
self.first_sequence: str = kwargs.pop('first_sequence',
'first_sequence')
self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
self.sequence_length = kwargs.pop('sequence_length', 128)

self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

@type_assert(object, tuple)
def __call__(self, data: tuple) -> Dict[str, Any]:
"""process the raw input data

Args:
data (tuple): [sentence1, sentence2]
sentence1 (str): a sentence
Example:
'you are so handsome.'
sentence2 (str): a sentence
Example:
'you are so beautiful.'
Returns:
Dict[str, Any]: the preprocessed data
"""
sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}
# preprocess the data for the model input

rst = {
'id': [],
'input_ids': [],
'attention_mask': [],
'token_type_ids': []
}

max_seq_length = self.sequence_length

text_a = new_data[self.first_sequence]
text_b = new_data[self.second_sequence]
feature = self.tokenizer(
text_a,
text_b,
padding=False,
truncation=True,
max_length=max_seq_length)

rst['id'].append(new_data.get('id', str(uuid.uuid4())))
rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return rst


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
class SentimentClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""

super().__init__(*args, **kwargs)

from sofa import SbertTokenizer
self.model_dir: str = model_dir
self.first_sequence: str = kwargs.pop('first_sequence',
'first_sequence')
self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
self.sequence_length = kwargs.pop('sequence_length', 128)

self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'
Returns:
Dict[str, Any]: the preprocessed data
"""

new_data = {self.first_sequence: data}
# preprocess the data for the model input

rst = {
'id': [],
'input_ids': [],
'attention_mask': [],
'token_type_ids': []
}

max_seq_length = self.sequence_length

text_a = new_data[self.first_sequence]

text_b = new_data.get(self.second_sequence, None)
feature = self.tokenizer(
text_a,
text_b,
padding='max_length',
truncation=True,
max_length=max_seq_length)

rst['id'].append(new_data.get('id', str(uuid.uuid4())))
rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return rst


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer)
class SequenceClassificationPreprocessor(Preprocessor):
@@ -178,7 +313,6 @@ class TextGenerationPreprocessor(Preprocessor):


rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])

return {k: torch.tensor(v) for k, v in rst.items()}




@@ -241,7 +375,7 @@ class FillMaskPreprocessor(Preprocessor):




@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.sbert_token_cls_tokenizer)
Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
class TokenClassifcationPreprocessor(Preprocessor):


def __init__(self, model_dir: str, *args, **kwargs):
@@ -269,6 +403,7 @@ class TokenClassifcationPreprocessor(Preprocessor):
Returns:
Dict[str, Any]: the preprocessed data
"""

# preprocess the data for the model input


text = data.replace(' ', '').strip()


+ 0
- 0
modelscope/preprocessors/space/__init__.py View File


+ 57
- 0
modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py View File

@@ -0,0 +1,57 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict

import json

from ...metainfo import Preprocessors
from ...utils.config import Config
from ...utils.constant import Fields, ModelFile
from ...utils.type_assert import type_assert
from ..base import Preprocessor
from ..builder import PREPROCESSORS
from .fields.intent_field import IntentBPETextField

__all__ = ['DialogIntentPredictionPreprocessor']


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.dialog_intent_preprocessor)
class DialogIntentPredictionPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""
super().__init__(*args, **kwargs)

self.model_dir: str = model_dir
self.config = Config.from_file(
os.path.join(self.model_dir, ModelFile.CONFIGURATION))
self.text_field = IntentBPETextField(
self.model_dir, config=self.config)

self.categories = None
with open(os.path.join(self.model_dir, 'categories.json'), 'r') as f:
self.categories = json.load(f)
assert len(self.categories) == 77

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'

Returns:
Dict[str, Any]: the preprocessed data
"""
samples = self.text_field.preprocessor([data])
samples, _ = self.text_field.collate_fn_multi_turn(samples)

return samples

+ 51
- 0
modelscope/preprocessors/space/dialog_modeling_preprocessor.py View File

@@ -0,0 +1,51 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict

from ...metainfo import Preprocessors
from ...utils.config import Config
from ...utils.constant import Fields, ModelFile
from ...utils.type_assert import type_assert
from ..base import Preprocessor
from ..builder import PREPROCESSORS
from .fields.gen_field import MultiWOZBPETextField

__all__ = ['DialogModelingPreprocessor']


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.dialog_modeling_preprocessor)
class DialogModelingPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""
super().__init__(*args, **kwargs)

self.model_dir: str = model_dir
self.config = Config.from_file(
os.path.join(self.model_dir, ModelFile.CONFIGURATION))
self.text_field = MultiWOZBPETextField(
self.model_dir, config=self.config)

@type_assert(object, Dict)
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""process the raw input data

Args:
data (Dict[str, Any]): the dialog inputs, including the latest user utterance under the key 'user_input'

Returns:
Dict[str, Any]: the preprocessed data
"""

user_ids = self.text_field.get_ids(data['user_input'])
data['user'] = user_ids

return data

+ 0
- 0
modelscope/preprocessors/space/fields/__init__.py View File


+ 1523
- 0
modelscope/preprocessors/space/fields/dst_processors.py
File diff suppressed because it is too large
View File


+ 675
- 0
modelscope/preprocessors/space/fields/gen_field.py View File

@@ -0,0 +1,675 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import random
from collections import OrderedDict
from itertools import chain

import numpy as np

from ....utils.nlp.space import ontology, utils
from ....utils.nlp.space.db_ops import MultiWozDB
from ....utils.nlp.space.utils import list2np
from ..tokenizer import Tokenizer


class BPETextField(object):

pad_token = '[PAD]'
bos_token = '[BOS]'
eos_token = '[EOS]'
unk_token = '[UNK]'
sos_u_token = '<sos_u>'
eos_u_token = '<eos_u>'
sos_b_token = '<sos_b>'
eos_b_token = '<eos_b>'
sos_d_token = '<sos_d>'
eos_d_token = '<eos_d>'
sos_a_token = '<sos_a>'
eos_a_token = '<eos_a>'
sos_db_token = '<sos_db>'
eos_db_token = '<eos_db>'
sos_r_token = '<sos_r>'
eos_r_token = '<eos_r>'

@property
def bot_id(self):
return 0

@property
def user_id(self):
return 1

@property
def vocab_size(self):
return self.tokenizer.vocab_size

@property
def num_specials(self):
return len(self.tokenizer.special_tokens)

@property
def pad_id(self):
return self.tokenizer.convert_tokens_to_ids([self.pad_token])[0]

@property
def bos_id(self):
return self.tokenizer.convert_tokens_to_ids([self.bos_token])[0]

@property
def eos_id(self):
return self.tokenizer.convert_tokens_to_ids([self.eos_token])[0]

@property
def unk_id(self):
return self.tokenizer.convert_tokens_to_ids([self.unk_token])[0]

@property
def sos_u_id(self):
return self.tokenizer.convert_tokens_to_ids([self.sos_u_token])[0]

@property
def eos_u_id(self):
return self.tokenizer.convert_tokens_to_ids([self.eos_u_token])[0]

@property
def sos_b_id(self):
return self.tokenizer.convert_tokens_to_ids([self.sos_b_token])[0]

@property
def eos_b_id(self):
return self.tokenizer.convert_tokens_to_ids([self.eos_b_token])[0]

@property
def sos_db_id(self):
return self.tokenizer.convert_tokens_to_ids([self.sos_db_token])[0]

@property
def eos_db_id(self):
return self.tokenizer.convert_tokens_to_ids([self.eos_db_token])[0]

@property
def sos_a_id(self):
return self.tokenizer.convert_tokens_to_ids([self.sos_a_token])[0]

@property
def eos_a_id(self):
return self.tokenizer.convert_tokens_to_ids([self.eos_a_token])[0]

@property
def sos_r_id(self):
return self.tokenizer.convert_tokens_to_ids([self.sos_r_token])[0]

@property
def eos_r_id(self):
return self.tokenizer.convert_tokens_to_ids([self.eos_r_token])[0]

@property
def sos_d_id(self):
return self.tokenizer.convert_tokens_to_ids([self.sos_d_token])[0]

@property
def eos_d_id(self):
return self.tokenizer.convert_tokens_to_ids([self.eos_d_token])[0]

def __init__(self, config):
self.gpu = 0
self.tokenizer = None
self.vocab = None
self.db = None
self.set_stats = {}

self.prompt_num_for_understand = config.BPETextField.prompt_num_for_understand
self.prompt_num_for_policy = config.BPETextField.prompt_num_for_policy
self.understand_tokens = ontology.get_understand_tokens(
self.prompt_num_for_understand)
self.policy_tokens = ontology.get_policy_tokens(
self.prompt_num_for_policy)

self.with_query_bow = config.BPETextField.with_query_bow
self.understand = config.BPETextField.understand
self.policy = config.BPETextField.policy

self.batch_size = config.Trainer.batch_size
self.filtered = config.BPETextField.filtered
self.max_len = config.BPETextField.max_len
self.min_utt_len = config.BPETextField.min_utt_len
self.max_utt_len = config.BPETextField.max_utt_len
self.min_ctx_turn = config.BPETextField.min_ctx_turn
self.max_ctx_turn = config.BPETextField.max_ctx_turn - 1 # subtract reply turn

self.use_true_prev_bspn = config.Generator.use_true_prev_bspn
self.use_true_prev_aspn = config.Generator.use_true_prev_aspn
self.use_true_db_pointer = config.Generator.use_true_db_pointer
self.use_true_prev_resp = config.Generator.use_true_prev_resp
self.use_true_curr_bspn = config.Generator.use_true_curr_bspn
self.use_true_curr_aspn = config.Generator.use_true_curr_aspn
self.use_all_previous_context = config.Generator.use_all_previous_context
self.use_true_bspn_for_ctr_eval = config.Generator.use_true_bspn_for_ctr_eval
self.use_true_domain_for_ctr_eval = config.Generator.use_true_domain_for_ctr_eval

def collate_fn_multi_turn(self, samples):
batch_size = len(samples)
batch = {}

src = [sp['src'][-self.max_ctx_turn:] for sp in samples]
query_token, src_token, src_pos, src_turn, src_role = [], [], [], [], []
for utts in src:
query_token.append(utts[-1])
utt_lens = [len(utt) for utt in utts]

# Token ids
src_token.append(list(chain(*utts))[-self.max_len:])

# Position ids
pos = [list(range(utt_len)) for utt_len in utt_lens]
src_pos.append(list(chain(*pos))[-self.max_len:])

# Turn ids
turn = [[len(utts) - i] * l for i, l in enumerate(utt_lens)]
src_turn.append(list(chain(*turn))[-self.max_len:])

# Role ids
role = [
[self.bot_id if (len(utts) - i) % 2 == 0 else self.user_id] * l
for i, l in enumerate(utt_lens)
]
src_role.append(list(chain(*role))[-self.max_len:])

# src sequence and tgt sequence should be padded separately, so that the first token of each sequence stays aligned
src_token = list2np(src_token, padding=self.pad_id)
src_pos = list2np(src_pos, padding=self.pad_id)
src_turn = list2np(src_turn, padding=self.pad_id)
src_role = list2np(src_role, padding=self.pad_id)
batch['src_token'] = src_token
batch['src_pos'] = src_pos
batch['src_type'] = src_role
batch['src_turn'] = src_turn
batch['src_mask'] = (src_token != self.pad_id).astype('int64')

if self.with_query_bow:
query_token = list2np(query_token, padding=self.pad_id)
batch['query_token'] = query_token
batch['query_mask'] = (query_token != self.pad_id).astype('int64')

if self.understand_ids and self.understand:
understand = [self.understand_ids for _ in samples]
understand_token = np.array(understand).astype('int64')
batch['understand_token'] = understand_token
batch['understand_mask'] = \
(understand_token != self.pad_id).astype('int64')

if self.policy_ids and self.policy:
policy = [self.policy_ids for _ in samples]
policy_token = np.array(policy).astype('int64')
batch['policy_token'] = policy_token
batch['policy_mask'] = \
(policy_token != self.pad_id).astype('int64')

if 'tgt' in samples[0]:
tgt = [sp['tgt'] for sp in samples]

# Token ids & Label ids
tgt_token = list2np(tgt, padding=self.pad_id)

# Position ids
tgt_pos = np.zeros_like(tgt_token)
tgt_pos[:] = np.arange(tgt_token.shape[1], dtype=tgt_token.dtype)

# Turn ids
tgt_turn = np.zeros_like(tgt_token)

# Role ids
tgt_role = np.full_like(tgt_token, self.bot_id)

batch['tgt_token'] = tgt_token
batch['tgt_pos'] = tgt_pos
batch['tgt_type'] = tgt_role
batch['tgt_turn'] = tgt_turn
batch['tgt_mask'] = (tgt_token != self.pad_id).astype('int64')

return batch, batch_size
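
To make the id layout concrete, a toy version (mine) of the context flattening above, using plain lists; right-padding to the batch maximum is assumed to be what list2np does afterwards:

from itertools import chain

bot_id, user_id, max_len = 0, 1, 64
# one sample: three turns of token ids, oldest first; the last turn is the current user query
utts = [[11, 12], [21, 22, 23], [31]]
utt_lens = [len(u) for u in utts]

token = list(chain(*utts))[-max_len:]
pos = list(chain(*[list(range(n)) for n in utt_lens]))[-max_len:]                      # positions restart per utterance
turn = list(chain(*[[len(utts) - i] * n for i, n in enumerate(utt_lens)]))[-max_len:]  # distance from the current turn
role = list(chain(*[[bot_id if (len(utts) - i) % 2 == 0 else user_id] * n
                    for i, n in enumerate(utt_lens)]))[-max_len:]

print(token)  # [11, 12, 21, 22, 23, 31]
print(pos)    # [0, 1, 0, 1, 2, 0]
print(turn)   # [3, 3, 2, 2, 2, 1]
print(role)   # [1, 1, 0, 0, 0, 1]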

def _bucket_by_turn(self, encoded_data):
turn_bucket = {}
for dial in encoded_data:
turn_len = len(dial)
if turn_len not in turn_bucket:
turn_bucket[turn_len] = []
turn_bucket[turn_len].append(dial)
return OrderedDict(sorted(turn_bucket.items(), key=lambda i: i[0]))

def _construct_mini_batch(self, data):
all_batches = []
batch = []
for dial in data:
batch.append(dial)
if len(batch) == self.batch_size:
# print('batch size: %d, batch num +1'%(len(batch)))
all_batches.append(batch)
batch = []
# if the remainder is larger than half a batch, keep it as its own batch; otherwise merge it into the previous batch
# print('last batch size: %d, batch num +1'%(len(batch)))
# if (len(batch) % len(cfg.cuda_device)) != 0:
# batch = batch[:-(len(batch) % len(cfg.cuda_device))]
# TODO deal with deleted data
if self.gpu <= 1:
if len(batch) > 0.5 * self.batch_size:
all_batches.append(batch)
elif len(all_batches):
all_batches[-1].extend(batch)
else:
all_batches.append(batch)

return all_batches

def transpose_batch(self, batch):
dial_batch = []
turn_num = len(batch[0])
for turn in range(turn_num):
turn_l = {}
for dial in batch:
this_turn = dial[turn]
for k in this_turn:
if k not in turn_l:
turn_l[k] = []
turn_l[k].append(this_turn[k])
dial_batch.append(turn_l)
return dial_batch
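
A small concrete example (mine) of what transpose_batch does: a batch of dialogs, each a list of per-turn dicts, becomes a list of per-turn dicts holding batched values:

def transpose_batch(batch):
    # same logic as the method above, copied here as a standalone illustration
    dial_batch = []
    for turn in range(len(batch[0])):
        turn_l = {}
        for dial in batch:
            for k, v in dial[turn].items():
                turn_l.setdefault(k, []).append(v)
        dial_batch.append(turn_l)
    return dial_batch

batch = [
    [{'user': 'hi', 'resp': 'hello'}, {'user': 'bye', 'resp': 'goodbye'}],      # dialog 1
    [{'user': 'help', 'resp': 'sure'}, {'user': 'thanks', 'resp': 'welcome'}],  # dialog 2
]
print(transpose_batch(batch))
# [{'user': ['hi', 'help'], 'resp': ['hello', 'sure']},
#  {'user': ['bye', 'thanks'], 'resp': ['goodbye', 'welcome']}]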

def get_eval_data(self, set_name='dev'):
name_to_set = {'train': self.train, 'test': self.test, 'dev': self.dev}
dial = name_to_set[set_name]

if set_name not in self.set_stats:
self.set_stats[set_name] = {}
num_turns = 0
num_dials = len(dial)
for d in dial:
num_turns += len(d)

self.set_stats[set_name]['num_turns'] = num_turns
self.set_stats[set_name]['num_dials'] = num_dials

return dial

def get_nontranspose_data_iterator(self, all_batches):
for i, batch in enumerate(all_batches):
yield batch

def get_data_iterator(self, all_batches):
for i, batch in enumerate(all_batches):
yield self.transpose_batch(batch)


class MultiWOZBPETextField(BPETextField):

def __init__(self, model_dir, config):
super(MultiWOZBPETextField, self).__init__(config)
import spacy
self.nlp = spacy.load('en_core_web_sm')

self.db = MultiWozDB(
model_dir, {
'attraction': 'db/attraction_db_processed.json',
'hospital': 'db/hospital_db_processed.json',
'hotel': 'db/hotel_db_processed.json',
'police': 'db/police_db_processed.json',
'restaurant': 'db/restaurant_db_processed.json',
'taxi': 'db/taxi_db_processed.json',
'train': 'db/train_db_processed.json',
})
self._build_vocab(model_dir)

special_tokens = [
self.pad_token, self.bos_token, self.eos_token, self.unk_token
]
special_tokens.extend(self.add_sepcial_tokens())
self.tokenizer = Tokenizer(
vocab_path=os.path.join(model_dir, 'vocab.txt'),
special_tokens=special_tokens,
tokenizer_type=config.BPETextField.tokenizer_type)
self.understand_ids = self.tokenizer.convert_tokens_to_ids(
self.understand_tokens)
self.policy_ids = self.tokenizer.convert_tokens_to_ids(
self.policy_tokens)

return

def get_ids(self, data: str):
result = [self.sos_u_id] + self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(
self._get_convert_str(data))) + [self.eos_u_id]
return result

def inverse_transpose_turn(self, turn_list):
"""
eval, one dialog at a time
"""
dialogs = {}
turn_num = len(turn_list)
dial_id = turn_list[0]['dial_id']
dialogs[dial_id] = []
for turn_idx in range(turn_num):
dial_turn = {}
turn = turn_list[turn_idx]
for key, value in turn.items():
if key == 'dial_id':
continue
if key == 'pointer' and self.db is not None:
turn_domain = turn['turn_domain'][-1]
value = self.db.pointerBack(value, turn_domain)
dial_turn[key] = value
dialogs[dial_id].append(dial_turn)
return dialogs

def inverse_transpose_batch(self, turn_batch_list):
"""
:param turn_batch_list: list of transposed dialog turn batches
"""
dialogs = {}
total_turn_num = len(turn_batch_list)
# initialize
for idx_in_batch, dial_id in enumerate(turn_batch_list[0]['dial_id']):
dialogs[dial_id] = []
for turn_n in range(total_turn_num):
dial_turn = {}
turn_batch = turn_batch_list[turn_n]
for key, v_list in turn_batch.items():
if key == 'dial_id':
continue
value = v_list[idx_in_batch]
if key == 'pointer' and self.db is not None:
turn_domain = turn_batch['turn_domain'][idx_in_batch][
-1]
value = self.db.pointerBack(value, turn_domain)
dial_turn[key] = value
dialogs[dial_id].append(dial_turn)
return dialogs

def get_batches(self, set_name):
"""
compute dataset stats.
"""
global dia_count
log_str = ''
name_to_set = {'train': self.train, 'test': self.test, 'dev': self.dev}
dial = name_to_set[set_name]
turn_bucket = self._bucket_by_turn(dial)
# self._shuffle_turn_bucket(turn_bucket)
all_batches = []

if set_name not in self.set_stats:
self.set_stats[set_name] = {}
num_training_steps = 0
num_turns = 0
num_dials = 0

for k in turn_bucket:
if set_name != 'test' and k == 1 or k >= 17:
continue
batches = self._construct_mini_batch(turn_bucket[k])
try:
log_str += 'turn num:%d, dial num: %d, batch num: %d last batch len: %d\n' % (
k, len(turn_bucket[k]), len(batches), len(batches[-1]))
except Exception:
log_str += 'turn num:%d, dial num: %d, batch num: %d last batch len: %d\n' % (
k, len(turn_bucket[k]), len(batches), 0.0)
# print("turn num:%d, dial num:v%d, batch num: %d, "%(k, len(turn_bucket[k]), len(batches)))
num_training_steps += k * len(batches)
num_turns += k * len(turn_bucket[k])
num_dials += len(turn_bucket[k])
all_batches += batches
log_str += 'total batch num: %d\n' % len(all_batches)
# print('total batch num: %d'%len(all_batches))
# print('dialog count: %d'%dia_count)
# return all_batches

# log stats
# logging.info(log_str)
# cfg.num_training_steps = num_training_steps * cfg.epoch_num
self.set_stats[set_name][
'num_training_steps_per_epoch'] = num_training_steps # turn-level steps
self.set_stats[set_name]['num_turns'] = num_turns
self.set_stats[set_name]['num_dials'] = num_dials

if set_name == 'train':
random.shuffle(all_batches)
return all_batches

def add_sepcial_tokens(self):
"""
add special tokens to the gpt tokenizer;
serves a similar role to Vocab.construct();
builds the list of special tokens
"""
special_tokens = []
prompt_tokens = self.understand_tokens + self.policy_tokens
special_tokens.extend(
ontology.get_special_tokens(other_tokens=prompt_tokens))

for word in ontology.all_domains + ['general']:
word = '[' + word + ']'
special_tokens.append(word)
for word in ontology.all_acts:
word = '[' + word + ']'
special_tokens.append(word)
for word in self.vocab._word2idx.keys():
if word.startswith('[value_') and word.endswith(']'):
special_tokens.append(word)

return special_tokens

def _build_vocab(self, model_dir: str):
self.vocab = utils.MultiWOZVocab(3000)
        vp = os.path.join(model_dir, 'vocab')
self.vocab.load_vocab(vp)
return self.vocab.vocab_size

def _get_convert_str(self, sent):
assert isinstance(sent, str)
return ' '.join([
self.tokenizer.spec_convert_dict.get(tok, tok)
for tok in sent.split()
])

def bspan_to_DBpointer(self, bspan, turn_domain):
constraint_dict = self.bspan_to_constraint_dict(bspan)
# print(constraint_dict)
matnums = self.db.get_match_num(constraint_dict)
match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1]
match_dom = match_dom[1:-1] if match_dom.startswith('[') else match_dom
match = matnums[match_dom]
# vector = self.db.addDBPointer(match_dom, match)
vector = self.db.addDBIndicator(match_dom, match)
return vector

def bspan_to_constraint_dict(self, bspan, bspn_mode='bspn'):
"""
['[hotel]', 'pricerange', 'cheap', 'type', 'hotel'] -> {'hotel': {'pricerange': 'cheap', 'type': 'hotel'}}
"""
bspan = bspan.split() if isinstance(bspan, str) else bspan
constraint_dict = {}
domain = None
conslen = len(bspan)
for idx, cons in enumerate(bspan):
cons = self.vocab.decode(cons) if type(cons) is not str else cons
if cons == '<eos_b>':
break
if '[' in cons:
if cons[1:-1] not in ontology.all_domains:
continue
domain = cons[1:-1]
elif cons in ontology.get_slot:
if domain is None:
continue
if cons == 'people':
                    # handle confusion between the value name "people's portraits..." and the slot 'people'
try:
ns = bspan[idx + 1]
ns = self.vocab.decode(ns) if type(
ns) is not str else ns
if ns == "'s":
continue
except Exception:
continue
if not constraint_dict.get(domain):
constraint_dict[domain] = {}
if bspn_mode == 'bsdx':
constraint_dict[domain][cons] = 1
continue
vidx = idx + 1
if vidx == conslen:
break
vt_collect = []
vt = bspan[vidx]
vt = self.vocab.decode(vt) if type(vt) is not str else vt
while vidx < conslen and vt != '<eos_b>' and '[' not in vt and vt not in ontology.get_slot:
vt_collect.append(vt)
vidx += 1
if vidx == conslen:
break
vt = bspan[vidx]
vt = self.vocab.decode(vt) if type(vt) is not str else vt
if vt_collect:
constraint_dict[domain][cons] = ' '.join(vt_collect)

return constraint_dict

def convert_batch_turn(self, turn_batch, pv_batch, first_turn=False):
"""
convert the current and the last turn
concat [U_0,R_0,...,U_{t-1}, R_{t-1}, U_t, B_t, A_t, R_t]
        first turn: [U_t, B_t, A_t, R_t]
try: [user, bspn, db, aspn, resp]

"""
inputs = []
if first_turn:
batch_zipped = zip(turn_batch['user'], turn_batch['bspn'],
turn_batch['db'], turn_batch['aspn'],
turn_batch['resp'])
for u, b, db, a, r in batch_zipped:
if self.use_true_curr_bspn:
src = [u + b + db]
tgt = a + r
else:
src = [u]
tgt = b + db + a + r
inputs.append({'src': src, 'tgt': tgt})
pv = [src[-1], tgt]
pv_batch.append(pv)
else:
batch_zipped = zip(pv_batch, turn_batch['user'],
turn_batch['bspn'], turn_batch['db'],
turn_batch['aspn'], turn_batch['resp'])
for i, (pv, u, b, db, a, r) in enumerate(batch_zipped):
if self.use_true_curr_bspn:
src = pv + [u + b + db]
tgt = a + r
else:
src = pv + [u]
tgt = b + db + a + r
inputs.append({'src': src, 'tgt': tgt})
pv = [src[-1], tgt]
pv_batch[i].extend(pv)

return inputs, pv_batch

def wrap_result_lm(self, result_dict, eos_syntax=None):
results = []
eos_syntax = ontology.eos_tokens if not eos_syntax else eos_syntax
sos_syntax = ontology.sos_tokens
# ground truth bs, as, ds.. generate response
field = [
'dial_id', 'turn_num', 'user', 'bspn_gen', 'bsdx', 'resp_gen',
'resp', 'aspn_gen', 'aspn', 'dspn_gen', 'dspn', 'bspn', 'pointer',
'qspn_gen', 'qspn'
]

for dial_id, turns in result_dict.items():
            entry = {'dial_id': dial_id, 'turn_num': len(turns)}
for f in field[2:]:
entry[f] = '' # TODO ???
results.append(entry)
for turn_idx, turn in enumerate(turns):
entry = {'dial_id': dial_id}
for key in field:
if key in ['dial_id']:
continue
v = turn.get(key, '')
if key == 'turn_domain':
v = ' '.join(v)

if key in eos_syntax and v != '':
# remove eos tokens
v = self.tokenizer.decode(v)
v = v.split()
# remove eos/sos in span
if eos_syntax[key] in v:
v.remove(eos_syntax[key])
if sos_syntax[key] in v:
v.remove(sos_syntax[key])
v = ' '.join(v)
else:
pass # v = v
entry[key] = v

results.append(entry)

return results, field

def convert_turn_eval(self, turn, pv_turn, first_turn=False):
"""
input: [all previous ubar, U_t, B_t, A_t] predict R_t
        first turn: [U_t, B_t, A_t] predict R_t

        for the context, using the full previous ubar history is slow, so only the previous turn's ubar may be used
"""
inputs = {}

context_list = []
prompt_id = None
if self.use_true_curr_bspn:
if self.use_true_curr_aspn: # only predict resp
context_list = ['user', 'bspn', 'db', 'aspn']
prompt_id = self.sos_r_id
else: # predicted aspn
context_list = ['user', 'bspn', 'db']
prompt_id = self.sos_a_id
        else:  # predict bspn, aspn and resp; db is not predicted (this part is TBD)
context_list = ['user']
prompt_id = self.sos_b_id

if first_turn:
context = []
for c in context_list:
context += turn[c]

inputs['src'] = [context]
inputs['labels'] = [context]
else:
context = []
for c in context_list:
context += turn[c]

if self.use_true_curr_bspn:
pv_context = pv_turn['labels'] + [
pv_turn['aspn'] + pv_turn['resp']
]
else:
pv_info = pv_turn['bspn'] + pv_turn['db'] + pv_turn[
'aspn'] + pv_turn['resp']
pv_context = pv_turn['labels'] + [pv_info]

# prompt response, add sos_r
inputs['src'] = pv_context + [context]

if self.use_all_previous_context:
inputs['labels'] = pv_context + [
context
] # use all previous ubar history
else:
inputs['labels'] = [context] # use previous turn

return inputs, prompt_id

+ 1082
- 0
modelscope/preprocessors/space/fields/intent_field.py
File diff suppressed because it is too large


+ 668
- 0
modelscope/preprocessors/space/tokenizer.py

@@ -0,0 +1,668 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import collections
import logging
import os
import sys
import unicodedata

import json
import regex as re


def clean_string(string):
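    # undo the whitespace that tokenization inserts around punctuation and contractions,
    # e.g. "i do not have the car 's keys" -> "i don't have the car's keys"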
replace_mp = {
' - ': '-',
" ' ": "'",
" n't": "n't",
" 'm": "'m",
' do not': " don't",
" 's": "'s",
" 've": "'ve",
" 're": "'re"
}
for k, v in replace_mp.items():
string = string.replace(k, v)
return string


class Tokenizer(object):

def __init__(self, vocab_path, special_tokens=[], tokenizer_type='Bert'):
self.tokenizer_type = tokenizer_type
if tokenizer_type == 'Bert':
self.spec_convert_dict = {
'[BOS]': '[unused0]',
'[EOS]': '[unused1]'
}
for token in special_tokens:
if token not in self.spec_convert_dict and token not in [
'[PAD]', '[UNK]'
]:
self.spec_convert_dict[
token] = f'[unused{len(self.spec_convert_dict)}]'
self.spec_revert_dict = {
v: k
for k, v in self.spec_convert_dict.items()
}
special_tokens = [
self.spec_convert_dict.get(tok, tok) for tok in special_tokens
]
self.special_tokens = ('[UNK]', '[SEP]', '[PAD]', '[CLS]',
'[MASK]')
self.special_tokens += tuple(x for x in special_tokens
if x not in self.special_tokens)

self._tokenizer = BertTokenizer(
vocab_path, never_split=self.special_tokens)
for tok in self.special_tokens:
assert tok in self._tokenizer.vocab, f"special token '{tok}' is not in the vocabulary"
self.vocab_size = len(self._tokenizer.vocab)
elif tokenizer_type == 'GPT2':
self.spec_convert_dict = {'[UNK]': '<unk>'}
self.spec_revert_dict = {
v: k
for k, v in self.spec_convert_dict.items()
}
special_tokens = [
tok for tok in special_tokens
if tok not in self.spec_convert_dict
]
vocab_file = os.path.join(vocab_path, 'vocab.json')
merges_file = os.path.join(vocab_path, 'merges.txt')
self._tokenizer = GPT2Tokenizer(
vocab_file, merges_file, special_tokens=special_tokens)
self.num_specials = len(special_tokens)
self.vocab_size = len(self._tokenizer)
else:
raise ValueError

def tokenize(self, text):
return self._tokenizer.tokenize(text)

def convert_tokens_to_ids(self, tokens):
if self.tokenizer_type == 'Bert':
tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
ids = self._tokenizer.convert_tokens_to_ids(tokens)
return ids
else:
tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
ids = self._tokenizer.convert_tokens_to_ids(tokens)
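            # GPT2Tokenizer appends special tokens after the base vocab; rotate the ids
            # so that specials occupy 0..num_specials-1 and ordinary BPE ids shift up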
ids = [(i + self.num_specials) % self.vocab_size for i in ids]
return ids

def convert_ids_to_tokens(self, ids):
if self.tokenizer_type == 'Bert':
tokens = self._tokenizer.convert_ids_to_tokens(ids)
tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens]
return tokens
else:
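            # undo the id rotation applied in convert_tokens_to_ids before the lookup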
ids = [(i - self.num_specials) % self.vocab_size for i in ids]
tokens = self._tokenizer.convert_ids_to_tokens(ids)
tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens]
return tokens

def decode(self, ids, ignore_tokens=[]):
tokens = self.convert_ids_to_tokens(ids)
if len(ignore_tokens) > 0:
ignore_tokens = set(ignore_tokens)
tokens = [tok for tok in tokens if tok not in ignore_tokens]
if self.tokenizer_type == 'Bert':
string = ' '.join(tokens).replace(' ##', '')
else:
string = ''.join(tokens)
string = bytearray([
self._tokenizer.byte_decoder[c] for c in string
]).decode('utf-8')
string = clean_string(string)
return string


# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

logger = logging.getLogger(__name__)


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, 'r', encoding='utf-8') as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""

def __init__(self,
vocab_file,
do_lower_case=True,
max_len=None,
do_basic_tokenize=True,
never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
"""Constructs a BertTokenizer.

Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input
Only has an effect when do_wordpiece_only=False
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
"""
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
.format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([
(ids, tok) for tok, ids in self.vocab.items()
])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case, never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)

def tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens

def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
logger.warning(
'Token indices sequence length is longer than the specified maximum '
' sequence length for this BERT model ({} > {}). Running this'
' sequence through BERT will result in indexing errors'.format(
len(ids), self.max_len))
return ids

def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self,
do_lower_case=True,
never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
"""Constructs a BasicTokenizer.

Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return [''.join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
tmp = (cp >= 0x4E00 and cp <= 0x9FFF)
tmp = tmp or (cp >= 0x3400 and cp <= 0x4DBF)
tmp = tmp or (cp >= 0x20000 and cp <= 0x2A6DF)
tmp = tmp or (cp >= 0x2A700 and cp <= 0x2B73F)
tmp = tmp or (cp >= 0x2B740 and cp <= 0x2B81F)
tmp = tmp or (cp >= 0x2B820 and cp <= 0x2CEAF)
tmp = tmp or (cp >= 0xF900 and cp <= 0xFAFF)
tmp = tmp or (cp >= 0x2F800 and cp <= 0x2FA1F)
if tmp:
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(' ')
else:
output.append(char)
return ''.join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""

def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
input = "unaffable"
output = ["un", "##aff", "##able"]

Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
"""

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = ''.join(chars[start:end])
if start > 0:
substr = '##' + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat.startswith('C'):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
tmp = (cp >= 33 and cp <= 47)
tmp = tmp or (cp >= 58 and cp <= 64)
tmp = tmp or (cp >= 91 and cp <= 96)
tmp = tmp or (cp >= 123 and cp <= 126)
if tmp:
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False


# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""

try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
def lru_cache():
return lambda func: func


@lru_cache()
def bytes_to_unicode():
"""
    Returns a mapping from utf-8 bytes to corresponding unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
    while avoiding mapping to whitespace/control characters the bpe code barfs on.
"""
_chr = unichr if sys.version_info[0] == 2 else chr
bs = list(range(ord('!'),
ord('~') + 1)) + list(range(
ord('¡'),
ord('¬') + 1)) + list(range(ord('®'),
ord('ÿ') + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [_chr(n) for n in cs]
return dict(zip(bs, cs))


def get_pairs(word):
"""Return set of symbol pairs in a word.

Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs


class GPT2Tokenizer(object):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level BPE
"""

def __init__(self,
vocab_file,
merges_file,
errors='replace',
special_tokens=None,
max_len=None):
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

self.special_tokens = {}
self.special_tokens_decoder = {}
self.set_special_tokens(special_tokens)

def __len__(self):
return len(self.encoder) + len(self.special_tokens)

def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.encoder) + i)
for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {
v: k
for k, v in self.special_tokens.items()
}
logger.info('Special tokens {}'.format(self.special_tokens))

def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)

if not pairs:
return token

while True:
bigram = min(
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except Exception:
new_word.extend(word[i:])
break

if word[i] == first and i < len(word) - 1 and word[
i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word

def tokenize(self, text):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[ord(b)] for b in token
if ord(b) in self.byte_encoder)
if token == '':
continue
bpe_tokens.extend(
bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens

def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
python_version_3 = isinstance(tokens, str)
python_version_2 = (
sys.version_info[0] == 2 and isinstance(tokens, unicode))
if python_version_3 or python_version_2:
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
logger.warning(
'Token indices sequence length is longer than the specified maximum '
' sequence length for this OpenAI GPT model ({} > {}). Running this'
' sequence through the model will result in indexing errors'.
format(len(ids), self.max_len))
return ids

def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
return tokens

def encode(self, text):
return self.convert_tokens_to_ids(self.tokenize(text))

def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode(
'utf-8', errors=self.errors)
return text
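

# Illustrative usage (not part of the original file; assumes a vocab.txt that
# already contains the SPACE special tokens):
#     tokenizer = Tokenizer(vocab_path='vocab.txt',
#                           special_tokens=['[BOS]', '[EOS]'],
#                           tokenizer_type='Bert')
#     ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('i need a cheap hotel'))
#     print(tokenizer.decode(ids))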

+ 0
- 0
modelscope/trainers/nlp/space/__init__.py


+ 0
- 0
modelscope/trainers/nlp/space/metrics/__init__.py


+ 73
- 0
modelscope/trainers/nlp/space/metrics/metrics_tracker.py

@@ -0,0 +1,73 @@
"""
MetricsTracker class
"""

import math
from collections import defaultdict


class MetricsTracker(object):
""" Tracking metrics. """

def __init__(self):
self.metrics_val = defaultdict(float) # for one batch
self.metrics_avg = defaultdict(float) # avg batches
self.num_samples = 0

def update(self, metrics, num_samples):
for key, val in metrics.items():
if val is not None:
val = float(val) # [val] -> val
self.metrics_val[key] = val
avg_val = \
(self.metrics_avg.get(key, 0) * self.num_samples + val * num_samples) / \
(self.num_samples + num_samples)
self.metrics_avg[key] = avg_val
self.num_samples += num_samples

def clear(self):
self.metrics_val = defaultdict(float)
self.metrics_avg = defaultdict(float)
self.num_samples = 0

def items(self):
return self.metrics_avg.items()

def get(self, name):
if self.num_samples == 0:
raise ValueError('There is no data in Metrics.')
return self.metrics_avg.get(name)

def state_dict(self):
return {
'metrics_val': self.metrics_val,
'metrics_avg': self.metrics_avg,
'num_samples': self.num_samples,
}

def load_state_dict(self, state_dict):
self.metrics_val = state_dict['metrics_val']
self.metrics_avg = state_dict['metrics_avg']
self.num_samples = state_dict['num_samples']

def value(self):
metric_strs = []
for key, val in self.metrics_val.items():
metric_str = f'{key.upper()}-{val:.3f}'
metric_strs.append(metric_str)
if 'token_nll' in self.metrics_val:
metric_str = f"TOKEN_PPL-{math.exp(self.metrics_val['token_nll']):.3f}"
metric_strs.append(metric_str)
metric_strs = ' '.join(metric_strs)
return metric_strs

def summary(self):
metric_strs = []
for key, val in self.metrics_avg.items():
metric_str = f'{key.upper()}-{val:.3f}'
metric_strs.append(metric_str)
if 'token_nll' in self.metrics_avg:
metric_str = f"TOKEN_PPL-{math.exp(self.metrics_avg['token_nll']):.3f}"
metric_strs.append(metric_str)
metric_strs = ' '.join(metric_strs)
return metric_strs
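

# Illustrative usage (not part of the original file):
#     tracker = MetricsTracker()
#     tracker.update({'nll': 2.3, 'token_nll': 1.7}, num_samples=32)
#     print(tracker.value())  # e.g. "NLL-2.300 TOKEN_NLL-1.700 TOKEN_PPL-5.474"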

+ 0
- 0
modelscope/trainers/nlp/space/trainer/__init__.py


+ 761
- 0
modelscope/trainers/nlp/space/trainer/gen_trainer.py

@@ -0,0 +1,761 @@
"""
Trainer class.
"""
import logging
import os
import sys
import time
from collections import OrderedDict

import json
import numpy as np
import torch
from tqdm import tqdm
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from .....utils.nlp.space import ontology
from ..metrics.metrics_tracker import MetricsTracker


def get_logger(log_path, name='default'):
logger = logging.getLogger(name)
logger.propagate = False
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(message)s')

sh = logging.StreamHandler(sys.stdout)
sh.setFormatter(formatter)
logger.addHandler(sh)

fh = logging.FileHandler(log_path, mode='w')
fh.setFormatter(formatter)
logger.addHandler(fh)

return logger


class Trainer(object):

def __init__(self,
model,
to_tensor,
config,
logger=None,
lr_scheduler=None,
optimizer=None,
reader=None,
evaluator=None):
self.to_tensor = to_tensor

self.do_train = config.do_train
self.do_infer = config.do_infer
self.is_decreased_valid_metric = config.Trainer.valid_metric_name[
0] == '-'
self.valid_metric_name = config.Trainer.valid_metric_name[1:]
self.num_epochs = config.Trainer.num_epochs
# self.save_dir = config.Trainer.save_dir
self.log_steps = config.Trainer.log_steps
self.valid_steps = config.Trainer.valid_steps
self.save_checkpoint = config.Trainer.save_checkpoint
self.save_summary = config.Trainer.save_summary
self.lr = config.Model.lr
self.weight_decay = config.Model.weight_decay
self.batch_size = config.Trainer.batch_size
self.gradient_accumulation_steps = config.Model.gradient_accumulation_steps
self.warmup_steps = config.Model.warmup_steps
self.gpu = config.Trainer.gpu

self.lr_scheduler = lr_scheduler
self.optimizer = optimizer

self.model = model
self.func_model = self.model.module if self.gpu > 1 else self.model
self.reader = reader
self.evaluator = evaluator
self.tokenizer = reader.tokenizer

# if not os.path.exists(self.save_dir):
# os.makedirs(self.save_dir)

# self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer")
self.logger = logger or get_logger('trainer.log', 'trainer')

self.batch_metrics_tracker = MetricsTracker()
self.token_metrics_tracker = MetricsTracker()

self.best_valid_metric = float(
'inf' if self.is_decreased_valid_metric else '-inf')
self.epoch = 0

def decode_generated_bspn_resp(self, generated):
"""
decode generated
return decoded ('bspn', 'resp')
"""
decoded = {}
eos_r_id = self.reader.eos_r_id
eos_b_id = self.reader.eos_b_id

        # eos_r may not exist if gpt2 generates repetitive words.
if eos_r_id in generated:
eos_r_idx = generated.index(eos_r_id)
else:
eos_r_idx = len(generated) - 1
# self.logger.info('eos_r not in generated: ' + self.tokenizer.decode(generated))

# predicted bspn, resp
eos_b_idx = generated.index(eos_b_id)
decoded['bspn'] = generated[:eos_b_idx + 1]
decoded['resp'] = generated[eos_b_idx + 1:eos_r_idx + 1]
return decoded

def decode_generated_act_resp(self, generated):
"""
decode generated
return decoded['resp'] ('bspn', 'aspn')
"""
decoded = {}
eos_a_id = self.reader.eos_a_id
eos_r_id = self.reader.eos_r_id
# eos_b_id = self.reader.eos_b_id

        # eos_r may not exist if gpt2 generates repetitive words.
if eos_r_id in generated:
eos_r_idx = generated.index(eos_r_id)
else:
eos_r_idx = len(generated) - 1
msg = 'eos_r not in generated: ' + self.tokenizer.decode(generated)
self.logger.info(msg)

if self.reader.use_true_curr_aspn: # only predict resp
decoded['resp'] = generated[:eos_r_idx + 1]
else: # predicted aspn, resp
eos_a_idx = generated.index(eos_a_id)
decoded['aspn'] = generated[:eos_a_idx + 1]
decoded['resp'] = generated[eos_a_idx + 1:eos_r_idx + 1]
return decoded

def decode_generated_bspn(self, generated):
eos_b_id = self.reader.eos_b_id
if eos_b_id in generated:
eos_b_idx = generated.index(eos_b_id)
else:
eos_b_idx = len(generated) - 1
return generated[:eos_b_idx + 1]

def set_optimizers(self):
"""
Setup the optimizer and the learning rate scheduler.

from transformers.Trainer

parameters from cfg: lr (1e-3); warmup_steps
"""
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'norm.weight']
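        # parameters whose names contain these substrings (biases and normalization
        # weights) are excluded from weight decay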
optimizer_grouped_parameters = [
{
'params': [
p for n, p in self.model.named_parameters()
if not any(nd in n for nd in no_decay)
],
'weight_decay':
self.weight_decay,
},
{
'params': [
p for n, p in self.model.named_parameters()
if any(nd in n for nd in no_decay)
],
'weight_decay':
0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr)

num_training_steps = \
self.reader.set_stats['train']['num_training_steps_per_epoch'] \
* self.num_epochs \
// self.gradient_accumulation_steps
num_warmup_steps = self.warmup_steps if self.warmup_steps >= 0 else int(
num_training_steps * 0.1)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps)

self.optimizer = optimizer
self.lr_scheduler = lr_scheduler

def train(self, train_data, dev_data):
# log info
set_stats = self.reader.set_stats['train']
self.logger.info('***** Running training *****')
self.logger.info(
            ' Num Training steps (one turn in a batch of dialogs) per epoch = %d',
set_stats['num_training_steps_per_epoch'])
self.logger.info(' Num Turns = %d', set_stats['num_turns'])
self.logger.info(' Num Dialogs = %d', set_stats['num_dials'])
self.logger.info(' Num Epochs = %d', self.num_epochs)
self.logger.info(' Batch size = %d', self.batch_size)
self.logger.info(' Gradient Accumulation steps = %d',
self.gradient_accumulation_steps)
steps = set_stats[
'num_training_steps_per_epoch'] * self.num_epochs // self.gradient_accumulation_steps
msg = ' Total optimization steps = %d' % steps
self.logger.info(msg)

# begin training
num_epochs = self.num_epochs - self.epoch
for epoch in range(num_epochs):
self.train_epoch(train_data=train_data, dev_data=dev_data)

def train_epoch(self, train_data, dev_data):
"""
Train an epoch.
"""
raise NotImplementedError

def infer(self, data_type):
"""
Inference interface.
"""
raise NotImplementedError

def forward(self, turn, old_pv_turn):
"""
one turn inference
"""
raise NotImplementedError

def save(self, is_best=False):
""" save """
train_state = {
'epoch': self.epoch,
'best_valid_metric': self.best_valid_metric,
'optimizer': self.optimizer.state_dict()
}
if self.lr_scheduler is not None:
train_state['lr_scheduler'] = self.lr_scheduler.state_dict()

# Save checkpoint
if self.save_checkpoint:
model_file = os.path.join(self.save_dir,
f'state_epoch_{self.epoch}.model')
torch.save(self.model.state_dict(), model_file)
self.logger.info(f"Saved model state to '{model_file}'")

train_file = os.path.join(self.save_dir,
f'state_epoch_{self.epoch}.train')
torch.save(train_state, train_file)
self.logger.info(f"Saved train state to '{train_file}'")

# Save current best model
if is_best:
best_model_file = os.path.join(self.save_dir, 'best.model')
torch.save(self.model.state_dict(), best_model_file)
best_train_file = os.path.join(self.save_dir, 'best.train')
torch.save(train_state, best_train_file)
self.logger.info(
f"Saved best model state to '{best_model_file}' with new best valid metric "
f'{self.valid_metric_name.upper()}={self.best_valid_metric:.3f}'
)

def load(self):
""" load """

def _load_model_state():
model_state_dict = torch.load(
f'{self.func_model.init_checkpoint}',
map_location=lambda storage, loc: storage)

if 'module.' in list(model_state_dict.keys())[0]:
new_model_state_dict = OrderedDict()
for k, v in model_state_dict.items():
assert k[:7] == 'module.'
new_model_state_dict[k[7:]] = v
model_state_dict = new_model_state_dict

new_model_state_dict = OrderedDict()
parameters = {
name: param
for name, param in self.func_model.named_parameters()
}
for name, param in model_state_dict.items():
if name in parameters:
if param.shape != parameters[name].shape:
assert hasattr(param, 'numpy')
arr = param.numpy()
z = np.random.normal(
scale=self.func_model.initializer_range,
size=parameters[name].shape).astype('float32')
if name == 'embedder.token_embedding.weight':
z[-param.shape[0]:] = arr
print(
                                f'part of parameter({name}) is randomly normal-initialized'
)
else:
if z.shape[0] < param.shape[0]:
z = arr[:z.shape[0]]
                                print(f'part of parameter({name}) is dropped')
else:
z[:param.shape[0]] = arr
print(
                                    f'part of parameter({name}) is randomly normal-initialized'
)
dtype, device = param.dtype, param.device
z = torch.tensor(z, dtype=dtype, device=device)
new_model_state_dict[name] = z
else:
new_model_state_dict[name] = param
else:
                    print(f'parameter({name}) is dropped')
model_state_dict = new_model_state_dict

for name in parameters:
if name not in model_state_dict:
if parameters[name].requires_grad:
                        print(f'parameter({name}) is randomly normal-initialized')
z = np.random.normal(
scale=self.func_model.initializer_range,
size=parameters[name].shape).astype('float32')
dtype, device = parameters[name].dtype, parameters[
name].device
model_state_dict[name] = torch.tensor(
z, dtype=dtype, device=device)
else:
model_state_dict[name] = parameters[name]

self.func_model.load_state_dict(model_state_dict)
self.logger.info(
f"Loaded model state from '{self.func_model.init_checkpoint}.model'"
)

def _load_train_state():
train_file = f'{self.func_model.init_checkpoint}.train'
if os.path.exists(train_file):
train_state_dict = torch.load(
train_file, map_location=lambda storage, loc: storage)
self.epoch = train_state_dict['epoch']
self.best_valid_metric = train_state_dict['best_valid_metric']
if self.optimizer is not None and 'optimizer' in train_state_dict:
self.optimizer.load_state_dict(
train_state_dict['optimizer'])
if self.lr_scheduler is not None and 'lr_scheduler' in train_state_dict:
self.lr_scheduler.load_state_dict(
train_state_dict['lr_scheduler'])
self.logger.info(
f"Loaded train state from '{train_file}' with (epoch-{self.epoch} "
f'best_valid_metric={self.best_valid_metric:.3f})')
else:
self.logger.info('Loaded no train state')

if self.func_model.init_checkpoint is None:
self.logger.info('Loaded no model !!!')
return

if self.do_train:
_load_model_state()
return

if self.do_infer:
_load_model_state()
_load_train_state()


class MultiWOZTrainer(Trainer):

def __init__(self,
model,
to_tensor,
config,
logger=None,
lr_scheduler=None,
optimizer=None,
reader=None,
evaluator=None):
super(MultiWOZTrainer,
self).__init__(model, to_tensor, config, logger, lr_scheduler,
optimizer, reader, evaluator)

def train_epoch(self, train_data, dev_data):
"""
Train an epoch.
"""
times = []
epoch_step = 0
global_step = 0
tr_batch_loss = 0.0
tr_token_loss = 0.0
self.epoch += 1
self.batch_metrics_tracker.clear()
self.token_metrics_tracker.clear()
num_training_steps = \
self.reader.set_stats['train']['num_training_steps_per_epoch'] // \
self.gradient_accumulation_steps # similar to the original num_batches

self.model.zero_grad()
data_iterator = self.reader.get_data_iterator(all_batches=train_data)

for batch_idx, dial_batch in enumerate(data_iterator):
pv_batch = []
for turn_num, turn_batch in enumerate(dial_batch):
first_turn = (turn_num == 0)
samples, pv_batch = self.reader.convert_batch_turn(
turn_batch, pv_batch, first_turn)
batch, batch_size = self.reader.collate_fn_multi_turn(
samples=samples)
batch = type(batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])),
batch.items()))

# Do a training iteration
start_time = time.time()
metrics = self.model(batch, is_training=True)
if self.gpu > 1:
for metric in metrics:
if metric is not None:
assert len(metric) == self.gpu
nll, token_nll, token_num = metrics
metrics = {}

token_num = torch.sum(token_num)
token_nll = \
torch.sum(nll) * (batch_size / self.gpu) / \
token_num
nll = torch.mean(nll)
metrics['token_num'] = token_num
metrics['token_nll'] = token_nll
metrics['nll'] = nll
loss = token_nll if self.func_model.token_loss else nll

metrics['loss'] = loss
else:
loss = metrics['loss']
self.func_model._optimize(
loss, do_update=False, optimizer=self.optimizer)
metrics = {
k: v.cpu().detach().numpy()
if isinstance(v, torch.Tensor) else v
for k, v in metrics.items()
}
token_num = metrics.pop('token_num', None)
# bow_num = metrics.pop("bow_num", None)
elapsed = time.time() - start_time
times.append(elapsed)
epoch_step += 1

tr_batch_loss += metrics['nll']
tr_token_loss += metrics['token_nll']
batch_metrics = {
k: v
for k, v in metrics.items() if 'token' not in k
}
token_metrics = {
k: v
for k, v in metrics.items() if 'token' in k
}
self.batch_metrics_tracker.update(batch_metrics, batch_size)
self.token_metrics_tracker.update(token_metrics, token_num)

if (epoch_step % self.gradient_accumulation_steps == 0) or \
(epoch_step == self.reader.set_stats['train']['num_training_steps_per_epoch']):
self.optimizer.step()
self.lr_scheduler.step()
self.optimizer.zero_grad()
global_step += 1

if self.log_steps > 0 and global_step % self.log_steps == 0:
batch_metrics_message = self.batch_metrics_tracker.value(
)
token_metrics_message = self.token_metrics_tracker.value(
)
message_prefix = f'[Train][{self.epoch}][{global_step}/{num_training_steps}]'
avg_time = f'AVG_Time-{sum(times[-self.log_steps:]) / self.log_steps:.3f}'
message = ' '.join([
message_prefix, batch_metrics_message,
token_metrics_message, avg_time
])
self.logger.info(message)

self.logger.info('-' * 150)
avg_batch_loss = tr_batch_loss / epoch_step
avg_token_loss = tr_token_loss / epoch_step
batch_metrics_message = self.batch_metrics_tracker.summary()
token_metrics_message = self.token_metrics_tracker.summary()
message_prefix = f'[Valid][{self.epoch}]'
message = ' '.join([
message_prefix, batch_metrics_message, token_metrics_message,
str(avg_batch_loss),
str(avg_token_loss)
])
self.logger.info(message)

cur_valid_metric = self.batch_metrics_tracker.get(
self.valid_metric_name)
if self.is_decreased_valid_metric:
is_best = cur_valid_metric < self.best_valid_metric
else:
is_best = cur_valid_metric > self.best_valid_metric
if is_best:
self.best_valid_metric = cur_valid_metric
self.save(is_best)
self.logger.info('-' * 150)

return

def infer(self, data_type='test'):
"""
Inference interface.
"""
self.logger.info('Generation starts ...')
infer_save_file = os.path.join(self.save_dir,
f'infer_{self.epoch}.result.json')
infer_samples_save_file = os.path.join(
self.save_dir, f'infer_samples_{self.epoch}.result.json')

# Inference
result_collection = {}
begin_time = time.time()

eval_data = self.reader.get_eval_data(data_type)
set_stats = self.reader.set_stats[data_type]
self.logger.info('***** Running Evaluation *****')
self.logger.info(' Num Turns = %d', set_stats['num_turns'])

with torch.no_grad():
pbar = tqdm(eval_data)
for dial_idx, dialog in enumerate(pbar):
pv_turn = {}
for turn_idx, turn in enumerate(dialog):
first_turn = (turn_idx == 0)
inputs, prompt_id = self.reader.convert_turn_eval(
turn, pv_turn, first_turn)
batch, batch_size = self.reader.collate_fn_multi_turn(
samples=[inputs])
batch = type(batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])),
batch.items()))
if self.reader.use_true_curr_bspn: # generate act, response
max_len = 60
if not self.reader.use_true_curr_aspn:
max_len = 80
outputs = self.func_model.infer(
inputs=batch,
start_id=prompt_id,
eos_id=self.reader.eos_r_id,
max_gen_len=max_len)
# resp_gen, need to trim previous context
generated = outputs[0].cpu().numpy().tolist()
try:
decoded = self.decode_generated_act_resp(generated)
except ValueError as exception:
self.logger.info(str(exception))
self.logger.info(self.tokenizer.decode(generated))
decoded = {'resp': [], 'bspn': [], 'aspn': []}
else: # predict bspn, access db, then generate act and resp
outputs = self.func_model.infer(
inputs=batch,
start_id=prompt_id,
eos_id=self.reader.eos_b_id,
max_gen_len=60)
generated_bs = outputs[0].cpu().numpy().tolist()
bspn_gen = self.decode_generated_bspn(generated_bs)
# check DB result
if self.reader.use_true_db_pointer: # To control whether current db is ground truth
db = turn['db']
else:
db_result = self.reader.bspan_to_DBpointer(
self.tokenizer.decode(bspn_gen),
turn['turn_domain'])
assert len(turn['db']) == 4
book_result = turn['db'][2]
assert isinstance(db_result, str)
db = \
[self.reader.sos_db_id] + \
self.tokenizer.convert_tokens_to_ids([db_result]) + \
[book_result] + \
[self.reader.eos_db_id]
prompt_id = self.reader.sos_a_id

prev_input = torch.tensor(bspn_gen + db)
if self.func_model.use_gpu:
prev_input = prev_input.cuda()
outputs_db = self.func_model.infer(
inputs=batch,
start_id=prompt_id,
eos_id=self.reader.eos_r_id,
max_gen_len=80,
prev_input=prev_input)
generated_ar = outputs_db[0].cpu().numpy().tolist()
try:
decoded = self.decode_generated_act_resp(
generated_ar)
decoded['bspn'] = bspn_gen
except ValueError as exception:
self.logger.info(str(exception))
self.logger.info(
self.tokenizer.decode(generated_ar))
decoded = {'resp': [], 'bspn': [], 'aspn': []}

turn['resp_gen'] = decoded['resp']
turn['bspn_gen'] = turn[
'bspn'] if self.reader.use_true_curr_bspn else decoded[
'bspn']
turn['aspn_gen'] = turn[
'aspn'] if self.reader.use_true_curr_aspn else decoded[
'aspn']
turn['dspn_gen'] = turn['dspn']

pv_turn['labels'] = inputs[
'labels'] # all true previous context
pv_turn['resp'] = turn[
'resp'] if self.reader.use_true_prev_resp else decoded[
'resp']
if not self.reader.use_true_curr_bspn:
pv_turn['bspn'] = turn[
'bspn'] if self.reader.use_true_prev_bspn else decoded[
'bspn']
pv_turn['db'] = turn[
'db'] if self.reader.use_true_prev_bspn else db
pv_turn['aspn'] = turn[
'aspn'] if self.reader.use_true_prev_aspn else decoded[
'aspn']

tmp_dialog_result = self.reader.inverse_transpose_turn(dialog)
result_collection.update(tmp_dialog_result)

# compute tmp scores
results, _ = self.reader.wrap_result_lm(tmp_dialog_result)
bleu, success, match = self.evaluator.validation_metric(
results)
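                # standard MultiWOZ combined score: BLEU + 0.5 * (Inform + Success)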
score = 0.5 * (success + match) + bleu
pbar.set_description(
'match: %2.2f success: %2.2f bleu: %2.2f score: %.2f' %
(match, success, bleu, score))

# compute scores
results, _ = self.reader.wrap_result_lm(result_collection)
bleu, success, match = self.evaluator.validation_metric(results)
score = 0.5 * (success + match) + bleu

# log results
metrics_message = 'match: %2.2f success: %2.2f bleu: %2.2f score: %.2f' %\
(match, success, bleu, score)
message_prefix = f'[Infer][{self.epoch}]'
time_cost = f'TIME-{time.time() - begin_time:.3f}'
message = ' '.join([message_prefix, metrics_message, time_cost])
self.logger.info(message)

# save results
eval_results = {
'bleu': bleu,
'success': success,
'match': match,
'score': score,
'result': message
}
with open(infer_save_file, 'w') as fp:
json.dump(eval_results, fp, indent=2)
self.logger.info(f'Saved inference results to {infer_save_file}')
with open(infer_samples_save_file, 'w') as fp:
for sample in results:
line = json.dumps(sample)
fp.write(line)
fp.write('\n')
self.logger.info(
f'Saved inference samples to {infer_samples_save_file}')

return

def _get_turn_domain(self, old_pv_turn, bspn_gen_ids, first_turn):

def _get_slots(constraint):
domain_name = ''
slots = {}
for item in constraint:
if item in ontology.placeholder_tokens:
continue
if item in ontology.all_domains_with_bracket:
domain_name = item
slots[domain_name] = set()
else:
assert domain_name in ontology.all_domains_with_bracket
slots[domain_name].add(item)
return slots

turn_domain = []
if first_turn and len(bspn_gen_ids) == 0:
turn_domain = ['[general]']
return turn_domain

bspn_token = self.tokenizer.convert_ids_to_tokens(bspn_gen_ids)
turn_slots = _get_slots(bspn_token)
if first_turn:
return list(turn_slots.keys())

assert 'bspn' in old_pv_turn
pv_bspn_token = self.tokenizer.convert_ids_to_tokens(
old_pv_turn['bspn'])
pv_turn_slots = _get_slots(pv_bspn_token)
for domain, value in turn_slots.items():
pv_value = pv_turn_slots[
domain] if domain in pv_turn_slots else set()
if len(value - pv_value) > 0 or len(pv_value - value):
turn_domain.append(domain)
if len(turn_domain) == 0:
turn_domain = list(turn_slots.keys())

return turn_domain

def forward(self, turn, old_pv_turn):
with torch.no_grad():
first_turn = True if len(old_pv_turn) == 0 else False
inputs, prompt_id = self.reader.convert_turn_eval(
turn, old_pv_turn, first_turn)
batch, batch_size = self.reader.collate_fn_multi_turn(
samples=[inputs])
batch = type(batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])), batch.items()))
pv_turn = {}

outputs = self.func_model.infer(
inputs=batch,
start_id=prompt_id,
eos_id=self.reader.eos_b_id,
max_gen_len=60)
generated_bs = outputs[0].cpu().numpy().tolist()
bspn_gen = self.decode_generated_bspn(generated_bs)

turn_domain = self._get_turn_domain(old_pv_turn, bspn_gen,
first_turn)

db_result = self.reader.bspan_to_DBpointer(
self.tokenizer.decode(bspn_gen), turn_domain)
assert isinstance(db_result, str)
db = \
[self.reader.sos_db_id] + \
self.tokenizer.convert_tokens_to_ids([db_result]) + \
[self.reader.eos_db_id]
prompt_id = self.reader.sos_a_id
prev_input = torch.tensor(bspn_gen + db)
if self.func_model.use_gpu:
prev_input = prev_input.cuda()
outputs_db = self.func_model.infer(
inputs=batch,
start_id=prompt_id,
eos_id=self.reader.eos_r_id,
max_gen_len=80,
prev_input=prev_input)
generated_ar = outputs_db[0].cpu().numpy().tolist()
decoded = self.decode_generated_act_resp(generated_ar)
decoded['bspn'] = bspn_gen

pv_turn['labels'] = inputs['labels']
pv_turn['resp'] = decoded['resp']
pv_turn['bspn'] = decoded['bspn']
pv_turn['db'] = db
pv_turn['aspn'] = decoded['aspn']

return pv_turn

+ 821
- 0
modelscope/trainers/nlp/space/trainer/intent_trainer.py

@@ -0,0 +1,821 @@
"""
Trainer class.
"""

import logging
import os
import sys
import time
from collections import OrderedDict

import json
import numpy as np
import torch
from tqdm import tqdm
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from ..metrics.metrics_tracker import MetricsTracker


def get_logger(log_path, name='default'):
logger = logging.getLogger(name)
logger.propagate = False
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(message)s')

sh = logging.StreamHandler(sys.stdout)
sh.setFormatter(formatter)
logger.addHandler(sh)

fh = logging.FileHandler(log_path, mode='w')
fh.setFormatter(formatter)
logger.addHandler(fh)

return logger


class Trainer(object):

def __init__(self,
model,
to_tensor,
config,
reader=None,
logger=None,
lr_scheduler=None,
optimizer=None):
self.model = model
self.to_tensor = to_tensor
self.do_train = config.do_train
self.do_infer = config.do_infer

self.is_decreased_valid_metric = config.Trainer.valid_metric_name[
0] == '-'
self.valid_metric_name = config.Trainer.valid_metric_name[1:]
self.num_epochs = config.Trainer.num_epochs
self.save_dir = config.Trainer.save_dir
self.log_steps = config.Trainer.log_steps
self.valid_steps = config.Trainer.valid_steps
self.save_checkpoint = config.Trainer.save_checkpoint
self.save_summary = config.Trainer.save_summary
self.learning_method = config.Dataset.learning_method
self.weight_decay = config.Model.weight_decay
self.warmup_steps = config.Model.warmup_steps
self.batch_size_label = config.Trainer.batch_size_label
self.batch_size_nolabel = config.Trainer.batch_size_nolabel
self.gpu = config.Trainer.gpu
self.lr = config.Model.lr

self.model = model
self.func_model = self.model.module if self.gpu > 1 else self.model
self.reader = reader
self.tokenizer = reader.tokenizer

self.lr_scheduler = lr_scheduler
self.optimizer = optimizer

# if not os.path.exists(self.save_dir):
# os.makedirs(self.save_dir)

# self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer")
self.logger = logger or get_logger('trainer.log', 'trainer')

self.batch_metrics_tracker_label = MetricsTracker()
self.token_metrics_tracker_label = MetricsTracker()
self.batch_metrics_tracker_nolabel = MetricsTracker()
self.token_metrics_tracker_nolabel = MetricsTracker()

self.best_valid_metric = float(
'inf' if self.is_decreased_valid_metric else '-inf')
self.epoch = 0
self.batch_num = 0

def set_optimizers(self, num_training_steps_per_epoch):
"""
Setup the optimizer and the learning rate scheduler.

from transformers.Trainer

parameters from cfg: lr (1e-3); warmup_steps
"""
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'norm.weight']
optimizer_grouped_parameters = [
{
'params': [
p for n, p in self.model.named_parameters()
if not any(nd in n for nd in no_decay)
],
'weight_decay':
self.weight_decay,
},
{
'params': [
p for n, p in self.model.named_parameters()
if any(nd in n for nd in no_decay)
],
'weight_decay':
0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr)

num_training_steps = num_training_steps_per_epoch * self.num_epochs
num_warmup_steps = self.warmup_steps if self.warmup_steps >= 0 else int(
num_training_steps * 0.1)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps)

# reset optimizer and lr_scheduler
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler

# log info
self.logger.info(
f'***** Running training: {self.learning_method} *****')
self.logger.info(' Num Epochs = %d', self.num_epochs)
self.logger.info(
            ' Num Training steps (one turn in a batch of dialogs) per epoch = %d',
num_training_steps_per_epoch)
self.logger.info(' Batch size for labeled data = %d',
self.batch_size_label)
self.logger.info(' Batch size for unlabeled data = %d',
self.batch_size_nolabel)
self.logger.info(' Total optimization steps = %d', num_training_steps)
self.logger.info(' Total warmup steps = %d', num_warmup_steps)
self.logger.info('************************************')

def train(self,
train_label_iter,
train_nolabel_iter=None,
valid_label_iter=None,
valid_nolabel_iter=None):
# begin training
num_epochs = self.num_epochs - self.epoch
for epoch in range(num_epochs):
self.train_epoch(
train_label_iter=train_label_iter,
train_nolabel_iter=train_nolabel_iter,
valid_label_iter=valid_label_iter,
valid_nolabel_iter=valid_nolabel_iter)

def train_epoch(self, train_label_iter, train_nolabel_iter,
valid_label_iter, valid_nolabel_iter):
"""
Train an epoch.
"""
raise NotImplementedError

def evaluate(self, data_label_iter, data_nolabel_iter, need_save=True):
raise NotImplementedError

def infer(self, data_iter, num_batches=None):
raise NotImplementedError

def save(self, is_best=False):
""" save """
train_state = {
'epoch': self.epoch,
'batch_num': self.batch_num,
'best_valid_metric': self.best_valid_metric,
'optimizer': self.optimizer.state_dict()
}
if self.lr_scheduler is not None:
train_state['lr_scheduler'] = self.lr_scheduler.state_dict()

# Save checkpoint
if self.save_checkpoint:
model_file = os.path.join(self.save_dir,
f'state_epoch_{self.epoch}.model')
torch.save(self.model.state_dict(), model_file)
self.logger.info(f"Saved model state to '{model_file}'")

train_file = os.path.join(self.save_dir,
f'state_epoch_{self.epoch}.train')
torch.save(train_state, train_file)
self.logger.info(f"Saved train state to '{train_file}'")

# Save current best model
if is_best:
best_model_file = os.path.join(self.save_dir, 'best.model')
torch.save(self.model.state_dict(), best_model_file)
best_train_file = os.path.join(self.save_dir, 'best.train')
torch.save(train_state, best_train_file)
self.logger.info(
f"Saved best model state to '{best_model_file}' with new best valid metric "
f'{self.valid_metric_name.upper()}={self.best_valid_metric:.3f}'
)

def load(self):
""" load """

def _load_model_state():
model_state_dict = torch.load(
f'{self.func_model.init_checkpoint}.model',
map_location=lambda storage, loc: storage)

if 'module.' in list(model_state_dict.keys())[0]:
new_model_state_dict = OrderedDict()
for k, v in model_state_dict.items():
assert k[:7] == 'module.'
new_model_state_dict[k[7:]] = v
model_state_dict = new_model_state_dict

new_model_state_dict = OrderedDict()
parameters = {
name: param
for name, param in self.func_model.named_parameters()
}
for name, param in model_state_dict.items():
if name in parameters:
if param.shape != parameters[name].shape:
assert hasattr(param, 'numpy')
arr = param.numpy()
z = np.random.normal(
scale=self.func_model.initializer_range,
size=parameters[name].shape).astype('float32')
if name == 'embedder.token_embedding.weight':
z[-param.shape[0]:] = arr
print(
                                f'part of parameter({name}) is randomly normal-initialized'
)
else:
if z.shape[0] < param.shape[0]:
z = arr[:z.shape[0]]
                                print(f'part of parameter({name}) is dropped')
else:
z[:param.shape[0]] = arr
print(
                                    f'part of parameter({name}) is randomly normal-initialized'
)
dtype, device = param.dtype, param.device
z = torch.tensor(z, dtype=dtype, device=device)
new_model_state_dict[name] = z
else:
new_model_state_dict[name] = param
else:
                    print(f'parameter({name}) is dropped')
model_state_dict = new_model_state_dict

for name in parameters:
if name not in model_state_dict:
if parameters[name].requires_grad:
                        print(f'parameter({name}) is randomly normal-initialized')
z = np.random.normal(
scale=self.func_model.initializer_range,
size=parameters[name].shape).astype('float32')
dtype, device = parameters[name].dtype, parameters[
name].device
model_state_dict[name] = torch.tensor(
z, dtype=dtype, device=device)
else:
model_state_dict[name] = parameters[name]

self.func_model.load_state_dict(model_state_dict)
self.logger.info(
f"Loaded model state from '{self.func_model.init_checkpoint}.model'"
)

def _load_train_state():
train_file = f'{self.func_model.init_checkpoint}.train'
if os.path.exists(train_file):
train_state_dict = torch.load(
train_file, map_location=lambda storage, loc: storage)
self.epoch = train_state_dict['epoch']
self.best_valid_metric = train_state_dict['best_valid_metric']
if self.optimizer is not None and 'optimizer' in train_state_dict:
self.optimizer.load_state_dict(
train_state_dict['optimizer'])
if self.lr_scheduler is not None and 'lr_scheduler' in train_state_dict:
self.lr_scheduler.load_state_dict(
train_state_dict['lr_scheduler'])
self.logger.info(
f"Loaded train state from '{train_file}' with (epoch-{self.epoch} "
f'best_valid_metric={self.best_valid_metric:.3f})')
else:
self.logger.info('Loaded no train state')

if self.func_model.init_checkpoint is None:
self.logger.info('No model loaded: init_checkpoint is None')
return

_load_model_state()
_load_train_state()


class IntentTrainer(Trainer):

def __init__(self, model, to_tensor, config, reader=None):
super(IntentTrainer, self).__init__(model, to_tensor, config, reader)
self.example = config.Model.example
self.can_norm = config.Trainer.can_norm

def can_normalization(self, y_pred, y_true, ex_data_iter):
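# Confidence-aware post-processing: samples with low top-k entropy keep their
# original argmax, while low-confidence samples are iteratively re-normalized
# against the intent prior estimated from ex_data_iter. Returns the original
# accuracy, the adjusted accuracy and a summary message.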
# compute ACC
acc_original = np.mean(y_pred.argmax(1) == y_true)
message = 'original acc: %s' % acc_original

# compute uncertainty
k = 3
y_pred_topk = np.sort(y_pred, axis=1)[:, -k:]
y_pred_topk /= y_pred_topk.sum(axis=1, keepdims=True)
y_pred_uncertainty =\
-(y_pred_topk * np.log(y_pred_topk)).sum(1) / np.log(k)
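# y_pred_uncertainty is the normalized entropy of the re-scaled top-k
# probabilities: 0 = fully confident, 1 = maximally uncertain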

# choose threshold
# print(np.sort(y_pred_uncertainty)[-100:].tolist())
threshold = 0.7
y_pred_confident = y_pred[y_pred_uncertainty < threshold]
y_pred_unconfident = y_pred[y_pred_uncertainty >= threshold]
y_true_confident = y_true[y_pred_uncertainty < threshold]
y_true_unconfident = y_true[y_pred_uncertainty >= threshold]

# compute ACC again for high and low confidence sets
acc_confident = (y_pred_confident.argmax(1) == y_true_confident).mean() \
if len(y_true_confident) else 0.
acc_unconfident = (y_pred_unconfident.argmax(1) == y_true_unconfident).mean() \
if len(y_true_unconfident) else 0.
message += ' (%s) confident acc: %s' % (len(y_true_confident),
acc_confident)
message += ' (%s) unconfident acc: %s' % (len(y_true_unconfident),
acc_unconfident)

# get prior distribution from training set
prior = np.zeros(self.func_model.num_intent)
for _, (batch, batch_size) in ex_data_iter:
for intent_label in batch['intent_label']:
prior[intent_label] += 1.

prior /= prior.sum()

# revise each sample from the low confidence set, and compute new ACC
right, alpha, iters = 0, 1, 1
for i, y in enumerate(y_pred_unconfident):
Y = np.concatenate([y_pred_confident, y[None]], axis=0)
for j in range(iters):
Y = Y**alpha
Y /= Y.mean(axis=0, keepdims=True)
Y *= prior[None]
Y /= Y.sum(axis=1, keepdims=True)
y = Y[-1]
if y.argmax() == y_true_unconfident[i]:
right += 1

# get final ACC
acc_final = \
(acc_confident * len(y_pred_confident) + right) / \
len(y_pred)
if len(y_pred_unconfident):
message += ' new unconfident acc: %s' % (
right / len(y_pred_unconfident))
else:
message += ' no unconfident predictions'
message += ' final acc: %s' % acc_final
return acc_original, acc_final, message

def train_epoch(self, train_label_iter, train_nolabel_iter,
valid_label_iter, valid_nolabel_iter):
"""
Train an epoch.
"""
times = []
self.epoch += 1
self.batch_metrics_tracker_label.clear()
self.token_metrics_tracker_label.clear()
self.batch_metrics_tracker_nolabel.clear()
self.token_metrics_tracker_nolabel.clear()

num_label_batches = len(train_label_iter)
num_nolabel_batches = len(
train_nolabel_iter) if train_nolabel_iter is not None else 0
num_batches = max(num_label_batches, num_nolabel_batches)

train_label_iter_loop = iter(train_label_iter)
train_nolabel_iter_loop = iter(
train_nolabel_iter) if train_nolabel_iter is not None else None
report_for_unlabeled_data = train_nolabel_iter is not None

for batch_id in range(1, num_batches + 1):
# Do a training iteration
start_time = time.time()
batch_list, batch_size_list, with_label_list, loss_list, metrics_list = [], [], [], [], []
data_file_list = []

# collect batch for labeled data
try:
data_file_label, (
batch_label,
batch_size_label) = next(train_label_iter_loop)
except StopIteration:
train_label_iter_loop = iter(train_label_iter)
data_file_label, (
batch_label,
batch_size_label) = next(train_label_iter_loop)
batch_list.append(batch_label)
batch_size_list.append(batch_size_label)
with_label_list.append(True)
data_file_list.append(data_file_label)

# collect batch for unlabeled data
if train_nolabel_iter is not None:
try:
data_file_nolabel, (
batch_nolabel,
batch_size_nolabel) = next(train_nolabel_iter_loop)
except StopIteration:
train_nolabel_iter_loop = iter(train_nolabel_iter)
data_file_nolabel, (
batch_nolabel,
batch_size_nolabel) = next(train_nolabel_iter_loop)
batch_list.append(batch_nolabel)
batch_size_list.append(batch_size_nolabel)
with_label_list.append(False)
data_file_list.append(data_file_nolabel)

# forward labeled batch and unlabeled batch and collect outputs, respectively
for (batch, batch_size, with_label, data_file) in \
zip(batch_list, batch_size_list, with_label_list, data_file_list):
batch = type(batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])),
batch.items()))
if self.example and with_label:
current_dataset = train_label_iter.data_file_to_dataset[
data_file]
example_batch = self.reader.retrieve_examples(
dataset=current_dataset,
labels=batch['intent_label'],
inds=batch['ids'],
task='intent')
example_batch = type(example_batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])),
example_batch.items()))
for k, v in example_batch.items():
batch[k] = v
batch['epoch'] = self.epoch
batch['num_steps'] = self.batch_num
metrics = self.model(
batch,
is_training=True,
with_label=with_label,
data_file=data_file)
loss, metrics = self.balance_metrics(
metrics=metrics, batch_size=batch_size)
loss_list.append(loss)
metrics_list.append(metrics)

# combine loss for labeled data and unlabeled data
# TODO change the computation of combined loss of labeled batch and unlabeled batch
loss = loss_list[0] if len(
loss_list) == 1 else loss_list[0] + loss_list[1]

# optimization procedure
self.func_model._optimize(
loss, optimizer=self.optimizer, lr_scheduler=self.lr_scheduler)
elapsed = time.time() - start_time
times.append(elapsed)
self.batch_num += 1

# track metrics and log temporary message
for (batch_size, metrics,
with_label) in zip(batch_size_list, metrics_list,
with_label_list):
self.track_and_log_message(
metrics=metrics,
batch_id=batch_id,
batch_size=batch_size,
num_batches=num_batches,
times=times,
with_label=with_label)

# evaluate
if self.valid_steps > 0 and valid_label_iter is not None and valid_nolabel_iter is not None \
and batch_id % self.valid_steps == 0:
self.evaluate(
data_label_iter=valid_label_iter,
data_nolabel_iter=valid_nolabel_iter)

# compute accuracy for valid dataset
accuracy = self.infer(
data_iter=valid_label_iter, ex_data_iter=train_label_iter)

# report summary message and save checkpoints
self.save_and_log_message(
report_for_unlabeled_data, cur_valid_metric=-accuracy)

def forward(self, batch):
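# Inference on a single batch: returns the raw intent probability matrix when
# CAN normalization is enabled, otherwise the argmax label ids.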
pred = []

with torch.no_grad():
batch = type(batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])), batch.items()))
result = self.model.infer(inputs=batch)
result = {
name: result[name].cpu().detach().numpy()
for name in result
}
intent_probs = result['intent_probs']
if self.can_norm:
pred += [intent_probs]
else:
pred += np.argmax(intent_probs, axis=1).tolist()

return pred

def infer(self, data_iter, num_batches=None, ex_data_iter=None):
"""
Inference interface.
"""
self.logger.info('Inference starts ...')
infer_save_file = os.path.join(self.save_dir,
f'infer_{self.epoch}.result.json')

# Inference
batch_cnt = 0
pred, true = [], []
outputs, labels = [], []
begin_time = time.time()

with torch.no_grad():
if self.example:
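# First pass over the labeled training iterator builds a feature memory;
# test batches are then classified by softmax similarity to this memory,
# scatter-adding the probabilities onto the memorized intent labels.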
for _, (batch, batch_size) in tqdm(
ex_data_iter, desc='Building train memory.'):
batch = type(batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])),
batch.items()))
result = self.model.infer(inputs=batch)
result = {
name: result[name].cpu().detach().numpy()
for name in result
}
outputs.append(torch.from_numpy(result['features']))
labels += batch['intent_label'].tolist()

mem = torch.cat(outputs, dim=0)
mem = mem.cuda() if self.func_model.use_gpu else mem
labels = torch.LongTensor(labels).unsqueeze(0)
labels = labels.cuda() if self.func_model.use_gpu else labels
self.logger.info(f'Memory size: {mem.size()}')

for _, (batch, batch_size) in tqdm(data_iter, total=num_batches):
batch = type(batch)(
map(lambda kv: (kv[0], self.to_tensor(kv[1])),
batch.items()))
result = self.model.infer(inputs=batch)
result = {
name: result[name].cpu().detach().numpy()
for name in result
}

if self.example:
features = torch.from_numpy(result['features'])
features = features.cuda(
) if self.func_model.use_gpu else features
probs = torch.softmax(features.mm(mem.t()), dim=-1)
intent_probs = torch.zeros(
probs.size(0), self.func_model.num_intent)
intent_probs = intent_probs.cuda(
) if self.func_model.use_gpu else intent_probs
intent_probs = intent_probs.scatter_add(
-1, labels.repeat(probs.size(0), 1), probs)
intent_probs = intent_probs.cpu().detach().numpy()
else:
intent_probs = result['intent_probs']

if self.can_norm:
pred += [intent_probs]
true += batch['intent_label'].cpu().detach().tolist()
else:
pred += np.argmax(intent_probs, axis=1).tolist()
true += batch['intent_label'].cpu().detach().tolist()

batch_cnt += 1
if batch_cnt == num_batches:
break

if self.can_norm:
true = np.array(true)
pred = np.concatenate(pred, axis=0)
acc_original, acc_final, message = self.can_normalization(
y_pred=pred, y_true=true, ex_data_iter=ex_data_iter)
accuracy = max(acc_original, acc_final)
infer_results = {
'accuracy': accuracy,
'pred_labels': pred.tolist(),
'message': message
}
metrics_message = f'Accuracy: {accuracy} {message}'
else:
accuracy = sum(p == t for p, t in zip(pred, true)) / len(pred)
infer_results = {'accuracy': accuracy, 'pred_labels': pred}
metrics_message = f'Accuracy: {accuracy}'

self.logger.info(f'Saving inference results to {infer_save_file}')
with open(infer_save_file, 'w') as fp:
json.dump(infer_results, fp, indent=2)
message_prefix = f'[Infer][{self.epoch}]'
time_cost = f'TIME-{time.time() - begin_time:.3f}'
message = ' '.join([message_prefix, metrics_message, time_cost])
self.logger.info(message)
return accuracy

def track_and_log_message(self, metrics, batch_id, batch_size, num_batches,
times, with_label):
# track metrics
batch_metrics_tracker = self.batch_metrics_tracker_label if with_label else self.batch_metrics_tracker_nolabel
token_metrics_tracker = self.token_metrics_tracker_label if with_label else self.token_metrics_tracker_nolabel

metrics = {
k: v.cpu().detach().numpy() if isinstance(v, torch.Tensor) else v
for k, v in metrics.items()
}
mlm_num = metrics.pop('mlm_num', 0)

batch_metrics = {k: v for k, v in metrics.items() if 'token' not in k}
token_metrics = {k: v for k, v in metrics.items() if 'token' in k}
batch_metrics_tracker.update(batch_metrics, batch_size)
token_metrics_tracker.update(token_metrics, mlm_num)

# log message
if self.log_steps > 0 and batch_id % self.log_steps == 0:
batch_metrics_message = batch_metrics_tracker.value()
token_metrics_message = token_metrics_tracker.value()
label_prefix = 'Labeled' if with_label else 'Unlabeled'
message_prefix = f'[Train][{self.epoch}][{batch_id}/{num_batches}][{label_prefix}]'
avg_time = f'AVG_Time-{sum(times[-self.log_steps:]) / self.log_steps:.3f}'
message = ' '.join([
message_prefix, batch_metrics_message, token_metrics_message,
avg_time
])
self.logger.info(message)

def save_and_log_message(self,
report_for_unlabeled_data,
cur_valid_metric=None):
# report message
batch_metrics_message = self.batch_metrics_tracker_label.summary()
token_metrics_message = self.token_metrics_tracker_label.summary()
message_prefix = f'[Valid][{self.epoch}][Labeled]'
message = ' '.join(
[message_prefix, batch_metrics_message, token_metrics_message])
self.logger.info(message)
if report_for_unlabeled_data:
batch_metrics_message = self.batch_metrics_tracker_nolabel.summary(
)
token_metrics_message = self.token_metrics_tracker_nolabel.summary(
)
message_prefix = f'[Valid][{self.epoch}][Unlabeled]'
message = ' '.join(
[message_prefix, batch_metrics_message, token_metrics_message])
self.logger.info(message)

# save checkpoints
assert cur_valid_metric is not None
if self.is_decreased_valid_metric:
is_best = cur_valid_metric < self.best_valid_metric
else:
is_best = cur_valid_metric > self.best_valid_metric
if is_best:
self.best_valid_metric = cur_valid_metric
self.save(is_best)

def balance_metrics(self, metrics, batch_size):
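# Average the per-GPU metric tensors and assemble the total loss: intent loss
# plus optional MLM (weighted by mlm_ratio), KL (weighted by kl_ratio) and
# contrastive terms.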
if self.gpu > 1:
for metric in metrics:
if metric is not None:
assert len(metric) == self.gpu

intent_loss, mlm, token_mlm, mlm_num, kl, con = metrics
metrics = {}

intent_loss = torch.mean(intent_loss)
metrics['intent_loss'] = intent_loss
loss = intent_loss

if mlm is not None:
mlm_num = torch.sum(mlm_num)
token_mlm = torch.sum(mlm) * (batch_size / self.gpu) / mlm_num
mlm = torch.mean(mlm)
metrics['mlm_num'] = mlm_num
metrics['token_mlm'] = token_mlm
metrics['mlm'] = mlm
loss = loss + (token_mlm if self.func_model.token_loss else
mlm) * self.func_model.mlm_ratio

if kl is not None:
kl = torch.mean(kl)
metrics['kl'] = kl
loss = loss + kl * self.func_model.kl_ratio

if con is not None:
con = torch.mean(con)
metrics['con'] = con
loss = loss + con

metrics['loss'] = loss

assert 'loss' in metrics
return metrics['loss'], metrics

def load(self):
""" load """

def _load_model_state():
model_state_dict = torch.load(
f'{self.func_model.init_checkpoint}',
map_location=lambda storage, loc: storage)

if 'module.' in list(model_state_dict.keys())[0]:
new_model_state_dict = OrderedDict()
for k, v in model_state_dict.items():
assert k[:7] == 'module.'
new_model_state_dict[k[7:]] = v
model_state_dict = new_model_state_dict

new_model_state_dict = OrderedDict()
parameters = {
name: param
for name, param in self.func_model.named_parameters()
}
for name, param in model_state_dict.items():
if name in parameters:
if param.shape != parameters[name].shape:
assert hasattr(param, 'numpy')
arr = param.numpy()
z = np.random.normal(
scale=self.func_model.initializer_range,
size=parameters[name].shape).astype('float32')
if name == 'embedder.token_embedding.weight':
z[-param.shape[0]:] = arr
print(
f'part of parameter({name}) randomly initialized with a normal distribution'
)
else:
if z.shape[0] < param.shape[0]:
z = arr[:z.shape[0]]
print(f'part of parameter({name}) is dropped')
else:
z[:param.shape[0]] = arr
print(
f'part of parameter({name}) randomly initialized with a normal distribution'
)
dtype, device = param.dtype, param.device
z = torch.tensor(z, dtype=dtype, device=device)
new_model_state_dict[name] = z
else:
new_model_state_dict[name] = param
else:
print(f'parameter({name}) is dropped')
model_state_dict = new_model_state_dict

for name in parameters:
if name not in model_state_dict:
if parameters[name].requires_grad:
print(f'parameter({name}) randomly initialized with a normal distribution')
z = np.random.normal(
scale=self.func_model.initializer_range,
size=parameters[name].shape).astype('float32')
dtype, device = parameters[name].dtype, parameters[
name].device
model_state_dict[name] = torch.tensor(
z, dtype=dtype, device=device)
else:
model_state_dict[name] = parameters[name]

self.func_model.load_state_dict(model_state_dict)
self.logger.info(
f"Loaded model state from '{self.func_model.init_checkpoint}.model'"
)

def _load_train_state():
train_file = f'{self.func_model.init_checkpoint}.train'
if os.path.exists(train_file):
train_state_dict = torch.load(
train_file, map_location=lambda storage, loc: storage)
self.epoch = train_state_dict['epoch']
self.best_valid_metric = train_state_dict['best_valid_metric']
if self.optimizer is not None and 'optimizer' in train_state_dict:
self.optimizer.load_state_dict(
train_state_dict['optimizer'])
if self.lr_scheduler is not None and 'lr_scheduler' in train_state_dict:
self.lr_scheduler.load_state_dict(
train_state_dict['lr_scheduler'])
self.logger.info(
f"Loaded train state from '{train_file}' with (epoch-{self.epoch} "
f'best_valid_metric={self.best_valid_metric:.3f})')
else:
self.logger.info('Loaded no train state')

if self.func_model.init_checkpoint is None:
self.logger.info('No model loaded: init_checkpoint is None')
return

if self.do_train:
_load_model_state()
return

if self.do_infer:
_load_model_state()
_load_train_state()

+ 4
- 0
modelscope/utils/constant.py View File

@@ -34,6 +34,8 @@ class Tasks(object):


# nlp tasks
word_segmentation = 'word-segmentation'
nli = 'nli'
sentiment_classification = 'sentiment-classification'
sentiment_analysis = 'sentiment-analysis'
sentence_similarity = 'sentence-similarity'
text_classification = 'text-classification'
@@ -43,6 +45,8 @@ class Tasks(object):
token_classification = 'token-classification'
conversational = 'conversational'
text_generation = 'text-generation'
dialog_modeling = 'dialog-modeling'
dialog_intent_prediction = 'dialog-intent-prediction'
table_question_answering = 'table-question-answering'
feature_extraction = 'feature-extraction'
fill_mask = 'fill-mask'


+ 0
- 0
modelscope/utils/nlp/__init__.py View File


+ 0
- 0
modelscope/utils/nlp/space/__init__.py View File


+ 66
- 0
modelscope/utils/nlp/space/args.py View File

@@ -0,0 +1,66 @@
"""
Parse argument.
"""

import argparse

import json


def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')


class HParams(dict):
""" Hyper-parameters class

Store hyper-parameters in training / infer / ... scripts.
"""

def __getattr__(self, name):
if name in self.keys():
return self[name]
for v in self.values():
if isinstance(v, HParams):
if name in v:
return v[name]
raise AttributeError(f"'HParams' object has no attribute '{name}'")

def __setattr__(self, name, value):
self[name] = value

def save(self, filename):
with open(filename, 'w', encoding='utf-8') as fp:
json.dump(self, fp, ensure_ascii=False, indent=4, sort_keys=False)

def load(self, filename):
with open(filename, 'r', encoding='utf-8') as fp:
params_dict = json.load(fp)
for k, v in params_dict.items():
if isinstance(v, dict):
self[k].update(HParams(v))
else:
self[k] = v


def parse_args(parser):
""" Parse hyper-parameters from cmdline. """
parsed = parser.parse_args()
args = HParams()
optional_args = parser._action_groups[1]
for action in optional_args._group_actions[1:]:
arg_name = action.dest
args[arg_name] = getattr(parsed, arg_name)
for group in parser._action_groups[2:]:
group_args = HParams()
for action in group._group_actions:
arg_name = action.dest
group_args[arg_name] = getattr(parsed, arg_name)
if len(group_args) > 0:
args[group.title] = group_args
return args
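A minimal usage sketch for the helpers above; the argument and group names ('seed', 'Trainer', 'use_gpu', 'save_dir') are illustrative placeholders rather than options defined elsewhere in this change:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=11)
trainer_group = parser.add_argument_group('Trainer')
trainer_group.add_argument('--use_gpu', type=str2bool, default=True)
trainer_group.add_argument('--save_dir', type=str, default='./output')

args = parse_args(parser)       # HParams with a nested 'Trainer' group
print(args.seed, args.use_gpu)  # nested lookup resolved by HParams.__getattr__
args.save('config.json')        # hyper-parameters persisted as JSON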

+ 52
- 0
modelscope/utils/nlp/space/criterions.py View File

@@ -0,0 +1,52 @@
import torch
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss


def compute_kl_loss(p, q, filter_scores=None):
p_loss = F.kl_div(
F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
q_loss = F.kl_div(
F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')

# You can choose whether to use function "sum" and "mean" depending on your task
p_loss = p_loss.sum(dim=-1)
q_loss = q_loss.sum(dim=-1)

# mask is for filter mechanism
if filter_scores is not None:
p_loss = filter_scores * p_loss
q_loss = filter_scores * q_loss

p_loss = p_loss.mean()
q_loss = q_loss.mean()

loss = (p_loss + q_loss) / 2
return loss


class CatKLLoss(_Loss):
"""
CatKLLoss
"""

def __init__(self, reduction='mean'):
super(CatKLLoss, self).__init__()
assert reduction in ['none', 'sum', 'mean']
self.reduction = reduction

def forward(self, log_qy, log_py):
"""
KL(qy|py) = Eq[qy * log(q(y) / p(y))]

log_qy: (batch_size, latent_size)
log_py: (batch_size, latent_size)
"""
qy = torch.exp(log_qy)
kl = torch.sum(qy * (log_qy - log_py), dim=1)

if self.reduction == 'mean':
kl = kl.mean()
elif self.reduction == 'sum':
kl = kl.sum()
return kl
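A small usage sketch for compute_kl_loss, with random logits standing in for two stochastic forward passes of the same batch (e.g. under dropout):

import torch

logits_p = torch.randn(4, 10, requires_grad=True)
logits_q = torch.randn(4, 10, requires_grad=True)

# symmetric KL between the two predictive distributions
consistency_loss = compute_kl_loss(logits_p, logits_q)
consistency_loss.backward()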

+ 313
- 0
modelscope/utils/nlp/space/db_ops.py View File

@@ -0,0 +1,313 @@
import os
import random
import sqlite3

import json

from .ontology import all_domains, db_domains


class MultiWozDB(object):

def __init__(self, db_dir, db_paths):
self.dbs = {}
self.sql_dbs = {}
for domain in all_domains:
with open(os.path.join(db_dir, db_paths[domain]), 'r') as f:
self.dbs[domain] = json.loads(f.read().lower())

def oneHotVector(self, domain, num):
"""Return number of available entities for particular domain."""
vector = [0, 0, 0, 0]
if num == '':
return vector
if domain != 'train':
if num == 0:
vector = [1, 0, 0, 0]
elif num == 1:
vector = [0, 1, 0, 0]
elif num <= 3:
vector = [0, 0, 1, 0]
else:
vector = [0, 0, 0, 1]
else:
if num == 0:
vector = [1, 0, 0, 0]
elif num <= 5:
vector = [0, 1, 0, 0]
elif num <= 10:
vector = [0, 0, 1, 0]
else:
vector = [0, 0, 0, 1]
return vector

def addBookingPointer(self, turn_da):
"""Add information about availability of the booking option."""
# Booking pointer
# Do not consider booking two things in a single turn.
vector = [0, 0]
if turn_da.get('booking-nobook'):
vector = [1, 0]
if turn_da.get('booking-book') or turn_da.get('train-offerbooked'):
vector = [0, 1]
return vector

def addDBPointer(self, domain, match_num, return_num=False):
"""Create database pointer for all related domains."""
# if turn_domains is None:
# turn_domains = db_domains
if domain in db_domains:
vector = self.oneHotVector(domain, match_num)
else:
vector = [0, 0, 0, 0]
return vector

def addDBIndicator(self, domain, match_num, return_num=False):
"""Create database indicator for all related domains."""
# if turn_domains is None:
# turn_domains = db_domains
if domain in db_domains:
vector = self.oneHotVector(domain, match_num)
else:
vector = [0, 0, 0, 0]

# '[db_nores]', '[db_0]', '[db_1]', '[db_2]', '[db_3]'
if vector == [0, 0, 0, 0]:
indicator = '[db_nores]'
else:
indicator = '[db_%s]' % vector.index(1)
return indicator

def get_match_num(self, constraints, return_entry=False):
"""Create database pointer for all related domains."""
match = {'general': ''}
entry = {}
# if turn_domains is None:
# turn_domains = db_domains
for domain in all_domains:
match[domain] = ''
if domain in db_domains and constraints.get(domain):
matched_ents = self.queryJsons(domain, constraints[domain])
match[domain] = len(matched_ents)
if return_entry:
entry[domain] = matched_ents
if return_entry:
return entry
return match

def pointerBack(self, vector, domain):
# multi domain implementation
# domnum = cfg.domain_num
if domain.endswith(']'):
domain = domain[1:-1]
if domain != 'train':
nummap = {0: '0', 1: '1', 2: '2-3', 3: '>3'}
else:
nummap = {0: '0', 1: '1-5', 2: '6-10', 3: '>10'}
if vector[:4] == [0, 0, 0, 0]:
report = ''
else:
num = vector.index(1)
report = domain + ': ' + nummap[num] + '; '

if vector[-2] == 0 and vector[-1] == 1:
report += 'booking: ok'
if vector[-2] == 1 and vector[-1] == 0:
report += 'booking: unable'

return report

def queryJsons(self,
domain,
constraints,
exactly_match=True,
return_name=False):
"""Returns the list of entities for a given domain
based on the annotation of the belief state
constraints: dict e.g. {'pricerange': 'cheap', 'area': 'west'}
"""
# query the db
if domain == 'taxi':
return [{
'taxi_colors':
random.choice(self.dbs[domain]['taxi_colors']),
'taxi_types':
random.choice(self.dbs[domain]['taxi_types']),
'taxi_phone': [random.randint(1, 9) for _ in range(10)]
}]
if domain == 'police':
return self.dbs['police']
if domain == 'hospital':
if constraints.get('department'):
for entry in self.dbs['hospital']:
if entry.get('department') == constraints.get(
'department'):
return [entry]
else:
return []

valid_cons = False
for v in constraints.values():
if v not in ['not mentioned', '']:
valid_cons = True
if not valid_cons:
return []

match_result = []

if 'name' in constraints:
for db_ent in self.dbs[domain]:
if 'name' in db_ent:
cons = constraints['name']
dbn = db_ent['name']
if cons == dbn:
db_ent = db_ent if not return_name else db_ent['name']
match_result.append(db_ent)
return match_result

for db_ent in self.dbs[domain]:
match = True
for s, v in constraints.items():
if s == 'name':
continue
if s in ['people', 'stay'] or (domain == 'hotel' and s == 'day') or \
(domain == 'restaurant' and s in ['day', 'time']):
# These inform slots belong to "book info", which does not exist in the DB;
# "book" is determined by the user goal, not the DB
continue

skip_case = {
"don't care": 1,
"do n't care": 1,
'dont care': 1,
'not mentioned': 1,
'dontcare': 1,
'': 1
}
if skip_case.get(v):
continue

if s not in db_ent:
# logging.warning('Searching warning: slot %s not in %s db'%(s, domain))
match = False
break

# v = 'guesthouse' if v == 'guest house' else v
# v = 'swimmingpool' if v == 'swimming pool' else v
v = 'yes' if v == 'free' else v

if s in ['arrive', 'leave']:
try:
h, m = v.split(
':'
) # raise error if time value is not xx:xx format
v = int(h) * 60 + int(m)
except Exception:
match = False
break
time = int(db_ent[s].split(':')[0]) * 60 + int(
db_ent[s].split(':')[1])
if s == 'arrive' and v > time:
match = False
if s == 'leave' and v < time:
match = False
else:
if exactly_match and v != db_ent[s]:
match = False
break
elif v not in db_ent[s]:
match = False
break

if match:
match_result.append(db_ent)

if not return_name:
return match_result
else:
if domain == 'train':
match_result = [e['id'] for e in match_result]
else:
match_result = [e['name'] for e in match_result]
return match_result

def querySQL(self, domain, constraints):
if not self.sql_dbs:
for dom in db_domains:
db = 'db/{}-dbase.db'.format(dom)
conn = sqlite3.connect(db)
c = conn.cursor()
self.sql_dbs[dom] = c

sql_query = 'select * from {}'.format(domain)

flag = True
for key, val in constraints.items():
if val == '' \
or val == 'dontcare' \
or val == 'not mentioned' \
or val == "don't care" \
or val == 'dont care' \
or val == "do n't care":
pass
else:
if flag:
sql_query += ' where '
val2 = val.replace("'", "''")
# val2 = normalize(val2)
if key == 'leaveAt':
sql_query += r' ' + key + ' > ' + r"'" + val2 + r"'"
elif key == 'arriveBy':
sql_query += r' ' + key + ' < ' + r"'" + val2 + r"'"
else:
sql_query += r' ' + key + '=' + r"'" + val2 + r"'"
flag = False
else:
val2 = val.replace("'", "''")
# val2 = normalize(val2)
if key == 'leaveAt':
sql_query += r' and ' + key + ' > ' + r"'" + val2 + r"'"
elif key == 'arriveBy':
sql_query += r' and ' + key + ' < ' + r"'" + val2 + r"'"
else:
sql_query += r' and ' + key + '=' + r"'" + val2 + r"'"

try: # "select * from attraction where name = 'queens college'"
print(sql_query)
return self.sql_dbs[domain].execute(sql_query).fetchall()
except Exception:
return [] # TODO test it


if __name__ == '__main__':
dbPATHs = {
'attraction': 'db/attraction_db_processed.json',
'hospital': 'db/hospital_db_processed.json',
'hotel': 'db/hotel_db_processed.json',
'police': 'db/police_db_processed.json',
'restaurant': 'db/restaurant_db_processed.json',
'taxi': 'db/taxi_db_processed.json',
'train': 'db/train_db_processed.json',
}
db = MultiWozDB('.', dbPATHs)  # db_dir assumed to be the working directory here
while True:
constraints = {}
inp = input(
'input belief state in format: domain-slot1=value1;slot2=value2...\n'
)
domain, cons = inp.split('-')
for sv in cons.split(';'):
s, v = sv.split('=')
constraints[s] = v
# res = db.querySQL(domain, constraints)
res = db.queryJsons(domain, constraints, return_name=True)
report = []
reidx = {
'hotel': 8,
'restaurant': 6,
'attraction': 5,
'train': 1,
}
print(constraints)
print(res)
print('count:', len(res), '\nnames:', report)

+ 204
- 0
modelscope/utils/nlp/space/ontology.py View File

@@ -0,0 +1,204 @@
all_domains = [
'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'police', 'hospital'
]
all_domains_with_bracket = ['[{}]'.format(item) for item in all_domains]
db_domains = ['restaurant', 'hotel', 'attraction', 'train']
placeholder_tokens = [
'<go_r>', '<go_b>', '<go_a>', '<go_d>', '<eos_u>', '<eos_r>', '<eos_b>',
'<eos_a>', '<eos_d>', '<eos_q>', '<sos_u>', '<sos_r>', '<sos_b>',
'<sos_a>', '<sos_d>', '<sos_q>'
]

normlize_slot_names = {
'car type': 'car',
'entrance fee': 'price',
'duration': 'time',
'leaveat': 'leave',
'arriveby': 'arrive',
'trainid': 'id'
}

requestable_slots = {
'taxi': ['car', 'phone'],
'police': ['postcode', 'address', 'phone'],
'hospital': ['address', 'phone', 'postcode'],
'hotel': [
'address', 'postcode', 'internet', 'phone', 'parking', 'type',
'pricerange', 'stars', 'area', 'reference'
],
'attraction':
['price', 'type', 'address', 'postcode', 'phone', 'area', 'reference'],
'train': ['time', 'leave', 'price', 'arrive', 'id', 'reference'],
'restaurant': [
'phone', 'postcode', 'address', 'pricerange', 'food', 'area',
'reference'
]
}
all_reqslot = [
'car', 'address', 'postcode', 'phone', 'internet', 'parking', 'type',
'pricerange', 'food', 'stars', 'area', 'reference', 'time', 'leave',
'price', 'arrive', 'id'
]

informable_slots = {
'taxi': ['leave', 'destination', 'departure', 'arrive'],
'police': [],
'hospital': ['department'],
'hotel': [
'type', 'parking', 'pricerange', 'internet', 'stay', 'day', 'people',
'area', 'stars', 'name'
],
'attraction': ['area', 'type', 'name'],
'train': ['destination', 'day', 'arrive', 'departure', 'people', 'leave'],
'restaurant':
['food', 'pricerange', 'area', 'name', 'time', 'day', 'people']
}
all_infslot = [
'type', 'parking', 'pricerange', 'internet', 'stay', 'day', 'people',
'area', 'stars', 'name', 'leave', 'destination', 'departure', 'arrive',
'department', 'food', 'time'
]

all_slots = all_reqslot + [
'stay', 'day', 'people', 'name', 'destination', 'departure', 'department'
]
get_slot = {}
for s in all_slots:
get_slot[s] = 1

# mapping slots in dialogue act to original goal slot names
da_abbr_to_slot_name = {
'addr': 'address',
'fee': 'price',
'post': 'postcode',
'ref': 'reference',
'ticket': 'price',
'depart': 'departure',
'dest': 'destination',
}

dialog_acts = {
'restaurant': [
'inform', 'request', 'nooffer', 'recommend', 'select', 'offerbook',
'offerbooked', 'nobook'
],
'hotel': [
'inform', 'request', 'nooffer', 'recommend', 'select', 'offerbook',
'offerbooked', 'nobook'
],
'attraction': ['inform', 'request', 'nooffer', 'recommend', 'select'],
'train':
['inform', 'request', 'nooffer', 'offerbook', 'offerbooked', 'select'],
'taxi': ['inform', 'request'],
'police': ['inform', 'request'],
'hospital': ['inform', 'request'],
# 'booking': ['book', 'inform', 'nobook', 'request'],
'general': ['bye', 'greet', 'reqmore', 'welcome'],
}
all_acts = []
for acts in dialog_acts.values():
for act in acts:
if act not in all_acts:
all_acts.append(act)

dialog_act_params = {
'inform': all_slots + ['choice', 'open'],
'request': all_infslot + ['choice', 'price'],
'nooffer': all_slots + ['choice'],
'recommend': all_reqslot + ['choice', 'open'],
'select': all_slots + ['choice'],
# 'book': ['time', 'people', 'stay', 'reference', 'day', 'name', 'choice'],
'nobook': ['time', 'people', 'stay', 'reference', 'day', 'name', 'choice'],
'offerbook': all_slots + ['choice'],
'offerbooked': all_slots + ['choice'],
'reqmore': [],
'welcome': [],
'bye': [],
'greet': [],
}

dialog_act_all_slots = all_slots + ['choice', 'open']

# special slot tokens in belief span
# no need of this, just covert slot to [slot] e.g. pricerange -> [pricerange]
slot_name_to_slot_token = {}

# eos tokens definition
eos_tokens = {
'user': '<eos_u>',
'user_delex': '<eos_u>',
'resp': '<eos_r>',
'resp_gen': '<eos_r>',
'pv_resp': '<eos_r>',
'bspn': '<eos_b>',
'bspn_gen': '<eos_b>',
'pv_bspn': '<eos_b>',
'bsdx': '<eos_b>',
'bsdx_gen': '<eos_b>',
'pv_bsdx': '<eos_b>',
'qspn': '<eos_q>',
'qspn_gen': '<eos_q>',
'pv_qspn': '<eos_q>',
'aspn': '<eos_a>',
'aspn_gen': '<eos_a>',
'pv_aspn': '<eos_a>',
'dspn': '<eos_d>',
'dspn_gen': '<eos_d>',
'pv_dspn': '<eos_d>'
}

# sos tokens definition
sos_tokens = {
'user': '<sos_u>',
'user_delex': '<sos_u>',
'resp': '<sos_r>',
'resp_gen': '<sos_r>',
'pv_resp': '<sos_r>',
'bspn': '<sos_b>',
'bspn_gen': '<sos_b>',
'pv_bspn': '<sos_b>',
'bsdx': '<sos_b>',
'bsdx_gen': '<sos_b>',
'pv_bsdx': '<sos_b>',
'qspn': '<sos_q>',
'qspn_gen': '<sos_q>',
'pv_qspn': '<sos_q>',
'aspn': '<sos_a>',
'aspn_gen': '<sos_a>',
'pv_aspn': '<sos_a>',
'dspn': '<sos_d>',
'dspn_gen': '<sos_d>',
'pv_dspn': '<sos_d>'
}

# db tokens definition
db_tokens = [
'<sos_db>', '<eos_db>', '[book_nores]', '[book_fail]', '[book_success]',
'[db_nores]', '[db_0]', '[db_1]', '[db_2]', '[db_3]'
]


# understand tokens definition
def get_understand_tokens(prompt_num_for_understand):
understand_tokens = []
for i in range(prompt_num_for_understand):
understand_tokens.append(f'<understand_{i}>')
return understand_tokens


# policy tokens definition
def get_policy_tokens(prompt_num_for_policy):
policy_tokens = []
for i in range(prompt_num_for_policy):
policy_tokens.append(f'<policy_{i}>')
return policy_tokens


# all special tokens definition
def get_special_tokens(other_tokens):
special_tokens = [
'<go_r>', '<go_b>', '<go_a>', '<go_d>', '<eos_u>', '<eos_r>',
'<eos_b>', '<eos_a>', '<eos_d>', '<eos_q>', '<sos_u>', '<sos_r>',
'<sos_b>', '<sos_a>', '<sos_d>', '<sos_q>'
] + db_tokens + other_tokens
return special_tokens
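For illustration, building the full special-token list for a hypothetical configuration with 5 understand prompts and 5 policy prompts (the counts are arbitrary):

understand_tokens = get_understand_tokens(5)  # ['<understand_0>', ..., '<understand_4>']
policy_tokens = get_policy_tokens(5)          # ['<policy_0>', ..., '<policy_4>']
special_tokens = get_special_tokens(understand_tokens + policy_tokens)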

+ 6
- 0
modelscope/utils/nlp/space/scores.py View File

@@ -0,0 +1,6 @@
def hierarchical_set_score(frame1, frame2):
# deal with empty frame
if not (frame1 and frame2):
return 0.
pass
return 0.

+ 188
- 0
modelscope/utils/nlp/space/utils.py View File

@@ -0,0 +1,188 @@
import logging
from collections import OrderedDict

import json
import numpy as np

from . import ontology


def max_lens(X):
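# Return the lengths of a nested list per nesting level:
# [len at level 0, max len at level 1, max len at level 2, ...]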
lens = [len(X)]
while isinstance(X[0], list):
lens.append(max(map(len, X)))
X = [x for xs in X for x in xs]
return lens


def list2np(X: object, padding: object = 0, dtype: object = 'int64') -> object:
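# Pad a nested Python list (up to three levels deep) into a dense,
# right-padded numpy array of the given dtype.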
shape = max_lens(X)
ret = np.full(shape, padding, dtype=np.int32)

if len(shape) == 1:
ret = np.array(X)
elif len(shape) == 2:
for i, x in enumerate(X):
ret[i, :len(x)] = np.array(x)
elif len(shape) == 3:
for i, xs in enumerate(X):
for j, x in enumerate(xs):
ret[i, j, :len(x)] = np.array(x)
return ret.astype(dtype)


def clean_replace(s, r, t, forward=True, backward=False):

def clean_replace_single(s, r, t, forward, backward, sidx=0):
# idx = s[sidx:].find(r)
idx = s.find(r)
if idx == -1:
return s, -1
idx_r = idx + len(r)
if backward:
while idx > 0 and s[idx - 1]:
idx -= 1
elif idx > 0 and s[idx - 1] != ' ':
return s, -1

if forward:
while \
idx_r < len(s) and (s[idx_r].isalpha() or s[idx_r].isdigit()):
idx_r += 1
elif idx_r != len(s) and (s[idx_r].isalpha() or s[idx_r].isdigit()):
return s, -1
return s[:idx] + t + s[idx_r:], idx_r

sidx = 0
while sidx != -1:
s, sidx = clean_replace_single(s, r, t, forward, backward, sidx)
return s


def py2np(list):
return np.array(list)


def write_dict(fn, dic):
with open(fn, 'w') as f:
json.dump(dic, f, indent=2)


def f1_score(label_list, pred_list):
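# Set-style F1 between a predicted item list and a gold item list; each
# prediction counts as a true positive if it appears in the gold list.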
tp = len([t for t in pred_list if t in label_list])
fp = max(0, len(pred_list) - tp)
fn = max(0, len(label_list) - tp)
precision = tp / (tp + fp + 1e-10)
recall = tp / (tp + fn + 1e-10)
f1 = 2 * precision * recall / (precision + recall + 1e-10)
return f1


class MultiWOZVocab(object):

def __init__(self, vocab_size=0):
"""
vocab for multiwoz dataset
"""
self.vocab_size = vocab_size
self.vocab_size_oov = 0 # get after construction
self._idx2word = {} # word + oov
self._word2idx = {} # word
self._freq_dict = {} # word + oov
for w in [
'[PAD]', '<go_r>', '[UNK]', '<go_b>', '<go_a>', '<eos_u>',
'<eos_r>', '<eos_b>', '<eos_a>', '<go_d>', '<eos_d>'
]:
self._absolute_add_word(w)

def _absolute_add_word(self, w):
idx = len(self._idx2word)
self._idx2word[idx] = w
self._word2idx[w] = idx

def add_word(self, word):
if word not in self._freq_dict:
self._freq_dict[word] = 0
self._freq_dict[word] += 1

def has_word(self, word):
return self._freq_dict.get(word)

def _add_to_vocab(self, word):
if word not in self._word2idx:
idx = len(self._idx2word)
self._idx2word[idx] = word
self._word2idx[word] = idx

def construct(self):
freq_dict_sorted = sorted(
self._freq_dict.keys(), key=lambda x: -self._freq_dict[x])
print('Vocabulary size including oov: %d' %
(len(freq_dict_sorted) + len(self._idx2word)))
if len(freq_dict_sorted) + len(self._idx2word) < self.vocab_size:
logging.warning(
'actual label set smaller than that configured: {}/{}'.format(
len(freq_dict_sorted) + len(self._idx2word),
self.vocab_size))
for word in ontology.all_domains + ['general']:
word = '[' + word + ']'
self._add_to_vocab(word)
for word in ontology.all_acts:
word = '[' + word + ']'
self._add_to_vocab(word)
for word in ontology.all_slots:
self._add_to_vocab(word)
for word in freq_dict_sorted:
if word.startswith('[value_') and word.endswith(']'):
self._add_to_vocab(word)
for word in freq_dict_sorted:
self._add_to_vocab(word)
self.vocab_size_oov = len(self._idx2word)

def load_vocab(self, vocab_path):
self._freq_dict = json.loads(
open(vocab_path + '.freq.json', 'r').read())
self._word2idx = json.loads(
open(vocab_path + '.word2idx.json', 'r').read())
self._idx2word = {}
for w, idx in self._word2idx.items():
self._idx2word[idx] = w
self.vocab_size_oov = len(self._idx2word)
print('vocab file loaded from "' + vocab_path + '"')
print('Vocabulary size including oov: %d' % (self.vocab_size_oov))

def save_vocab(self, vocab_path):
_freq_dict = OrderedDict(
sorted(
self._freq_dict.items(), key=lambda kv: kv[1], reverse=True))
write_dict(vocab_path + '.word2idx.json', self._word2idx)
write_dict(vocab_path + '.freq.json', _freq_dict)

def encode(self, word, include_oov=True):
if include_oov:
if self._word2idx.get(word, None) is None:
raise ValueError(
'Unknown word: %s. Vocabulary should include oovs here.'
% word)
return self._word2idx[word]
else:
word = '<unk>' if word not in self._word2idx else word
return self._word2idx[word]

def sentence_encode(self, word_list):
return [self.encode(_) for _ in word_list]

def oov_idx_map(self, idx):
return 2 if idx > self.vocab_size else idx

def sentence_oov_map(self, index_list):
return [self.oov_idx_map(_) for _ in index_list]

def decode(self, idx, indicate_oov=False):
if not self._idx2word.get(idx):
raise ValueError(
'Error idx: %d. Vocabulary should include oovs here.' % idx)
if not indicate_oov or idx < self.vocab_size:
return self._idx2word[idx]
else:
return self._idx2word[idx] + '(o)'

+ 3
- 1
requirements/nlp.txt View File

@@ -1 +1,3 @@
sofa==1.0.4.2
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
sofa==1.0.5
spacy>=2.3.5

+ 61
- 0
tests/pipelines/test_dialog_intent_prediction.py View File

@@ -0,0 +1,61 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import SpaceForDialogIntent
from modelscope.pipelines import DialogIntentPredictionPipeline, pipeline
from modelscope.preprocessors import DialogIntentPredictionPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class DialogIntentPredictionTest(unittest.TestCase):
model_id = 'damo/nlp_space_dialog-intent-prediction'
test_case = [
'How do I locate my card?',
'I still have not received my new card, I ordered over a week ago.'
]

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run(self):
cache_path = snapshot_download(self.model_id)
preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path)
model = SpaceForDialogIntent(
model_dir=cache_path,
text_field=preprocessor.text_field,
config=preprocessor.config)

pipelines = [
DialogIntentPredictionPipeline(
model=model, preprocessor=preprocessor),
pipeline(
task=Tasks.dialog_intent_prediction,
model=model,
preprocessor=preprocessor)
]

for my_pipeline, item in list(zip(pipelines, self.test_case)):
print(my_pipeline(item))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
preprocessor = DialogIntentPredictionPreprocessor(
model_dir=model.model_dir)

pipelines = [
DialogIntentPredictionPipeline(
model=model, preprocessor=preprocessor),
pipeline(
task=Tasks.dialog_intent_prediction,
model=model,
preprocessor=preprocessor)
]

for my_pipeline, item in list(zip(pipelines, self.test_case)):
print(my_pipeline(item))


if __name__ == '__main__':
unittest.main()

+ 147
- 0
tests/pipelines/test_dialog_modeling.py View File

@@ -0,0 +1,147 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import SpaceForDialogModeling
from modelscope.pipelines import DialogModelingPipeline, pipeline
from modelscope.preprocessors import DialogModelingPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class DialogModelingTest(unittest.TestCase):
model_id = 'damo/nlp_space_dialog-modeling'
test_case = {
'sng0073': {
'goal': {
'taxi': {
'info': {
'leaveat': '17:15',
'destination': 'pizza hut fen ditton',
'departure': "saint john's college"
},
'reqt': ['car', 'phone'],
'fail_info': {}
}
},
'log': [{
'user':
"i would like a taxi from saint john 's college to pizza hut fen ditton .",
'user_delex':
'i would like a taxi from [value_departure] to [value_destination] .',
'resp':
'what time do you want to leave and what time do you want to arrive by ?',
'sys':
'what time do you want to leave and what time do you want to arrive by ?',
'pointer': '0,0,0,0,0,0',
'match': '',
'constraint':
"[taxi] destination pizza hut fen ditton departure saint john 's college",
'cons_delex': '[taxi] destination departure',
'sys_act': '[taxi] [request] leave arrive',
'turn_num': 0,
'turn_domain': '[taxi]'
}, {
'user': 'i want to leave after 17:15 .',
'user_delex': 'i want to leave after [value_leave] .',
'resp':
'booking completed ! your taxi will be [value_car] contact number is [value_phone]',
'sys':
'booking completed ! your taxi will be blue honda contact number is 07218068540',
'pointer': '0,0,0,0,0,0',
'match': '',
'constraint':
"[taxi] destination pizza hut fen ditton departure saint john 's college leave 17:15",
'cons_delex': '[taxi] destination departure leave',
'sys_act': '[taxi] [inform] car phone',
'turn_num': 1,
'turn_domain': '[taxi]'
}, {
'user': 'thank you for all the help ! i appreciate it .',
'user_delex': 'thank you for all the help ! i appreciate it .',
'resp':
'you are welcome . is there anything else i can help you with today ?',
'sys':
'you are welcome . is there anything else i can help you with today ?',
'pointer': '0,0,0,0,0,0',
'match': '',
'constraint':
"[taxi] destination pizza hut fen ditton departure saint john 's college leave 17:15",
'cons_delex': '[taxi] destination departure leave',
'sys_act': '[general] [reqmore]',
'turn_num': 2,
'turn_domain': '[general]'
}, {
'user': 'no , i am all set . have a nice day . bye .',
'user_delex': 'no , i am all set . have a nice day . bye .',
'resp': 'you too ! thank you',
'sys': 'you too ! thank you',
'pointer': '0,0,0,0,0,0',
'match': '',
'constraint':
"[taxi] destination pizza hut fen ditton departure saint john 's college leave 17:15",
'cons_delex': '[taxi] destination departure leave',
'sys_act': '[general] [bye]',
'turn_num': 3,
'turn_domain': '[general]'
}]
}
}

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run(self):

cache_path = snapshot_download(self.model_id)

preprocessor = DialogModelingPreprocessor(model_dir=cache_path)
model = SpaceForDialogModeling(
model_dir=cache_path,
text_field=preprocessor.text_field,
config=preprocessor.config)
pipelines = [
DialogModelingPipeline(model=model, preprocessor=preprocessor),
pipeline(
task=Tasks.dialog_modeling,
model=model,
preprocessor=preprocessor)
]

result = {}
for step, item in enumerate(self.test_case['sng0073']['log']):
user = item['user']
print('user: {}'.format(user))

result = pipelines[step % 2]({
'user_input': user,
'history': result
})
print('response : {}'.format(result['response']))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
preprocessor = DialogModelingPreprocessor(model_dir=model.model_dir)

pipelines = [
DialogModelingPipeline(model=model, preprocessor=preprocessor),
pipeline(
task=Tasks.dialog_modeling,
model=model,
preprocessor=preprocessor)
]

result = {}
for step, item in enumerate(self.test_case['sng0073']['log']):
user = item['user']
print('user: {}'.format(user))

result = pipelines[step % 2]({
'user_input': user,
'history': result
})
print('response : {}'.format(result['response']))


if __name__ == '__main__':
unittest.main()

+ 52
- 0
tests/pipelines/test_nli.py View File

@@ -0,0 +1,52 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import SbertForNLI
from modelscope.pipelines import NLIPipeline, pipeline
from modelscope.preprocessors import NLIPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class NLITest(unittest.TestCase):
model_id = 'damo/nlp_structbert_nli_chinese-base'
sentence1 = '四川商务职业学院和四川财经职业学院哪个好?'
sentence2 = '四川商务职业学院商务管理在哪个校区?'

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_direct_file_download(self):
cache_path = snapshot_download(self.model_id)
tokenizer = NLIPreprocessor(cache_path)
model = SbertForNLI(cache_path, tokenizer=tokenizer)
pipeline1 = NLIPipeline(model, preprocessor=tokenizer)
pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer)
print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
print()
print(
f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
f'pipeline1: {pipeline2(input=(self.sentence1, self.sentence2))}')

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
tokenizer = NLIPreprocessor(model.model_dir)
pipeline_ins = pipeline(
task=Tasks.nli, model=model, preprocessor=tokenizer)
print(pipeline_ins(input=(self.sentence1, self.sentence2)))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_name(self):
pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id)
print(pipeline_ins(input=(self.sentence1, self.sentence2)))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.nli)
print(pipeline_ins(input=(self.sentence1, self.sentence2)))


if __name__ == '__main__':
unittest.main()

+ 58
- 0
tests/pipelines/test_sentiment_classification.py View File

@@ -0,0 +1,58 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import SbertForSentimentClassification
from modelscope.pipelines import SentimentClassificationPipeline, pipeline
from modelscope.preprocessors import SentimentClassificationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class SentimentClassificationTest(unittest.TestCase):
model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音'

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_direct_file_download(self):
cache_path = snapshot_download(self.model_id)
tokenizer = SentimentClassificationPreprocessor(cache_path)
model = SbertForSentimentClassification(
cache_path, tokenizer=tokenizer)
pipeline1 = SentimentClassificationPipeline(
model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.sentiment_classification,
model=model,
preprocessor=tokenizer)
print(f'sentence1: {self.sentence1}\n'
f'pipeline1:{pipeline1(input=self.sentence1)}')
print()
print(f'sentence1: {self.sentence1}\n'
f'pipeline1: {pipeline2(input=self.sentence1)}')

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
tokenizer = SentimentClassificationPreprocessor(model.model_dir)
pipeline_ins = pipeline(
task=Tasks.sentiment_classification,
model=model,
preprocessor=tokenizer)
print(pipeline_ins(input=self.sentence1))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.sentiment_classification, model=self.model_id)
print(pipeline_ins(input=self.sentence1))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.sentiment_classification)
print(pipeline_ins(input=self.sentence1))


if __name__ == '__main__':
unittest.main()

+ 2
- 3
tests/pipelines/test_word_segmentation.py View File

@@ -4,7 +4,7 @@ import unittest


from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.models.nlp import SbertForTokenClassification
from modelscope.pipelines import WordSegmentationPipeline, pipeline
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
@@ -19,8 +19,7 @@ class WordSegmentationTest(unittest.TestCase):
def test_run_by_direct_model_download(self):
cache_path = snapshot_download(self.model_id)
tokenizer = TokenClassifcationPreprocessor(cache_path)
model = StructBertForTokenClassification(
cache_path, tokenizer=tokenizer)
model = SbertForTokenClassification(cache_path, tokenizer=tokenizer)
pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
pipeline2 = pipeline(
Tasks.word_segmentation, model=model, preprocessor=tokenizer)

