|
|
|
@@ -14,7 +14,7 @@ |
|
|
|
# ============================================================================ |
|
|
|
""" |
|
|
|
Note: |
|
|
|
Transformer Networks. This is an experimental interface that is subject to change and/or deletion. |
|
|
|
Transformer Networks. This is an experimental interface that is subject to change or deletion.
|
|
|
""" |
|
|
|
import math |
|
|
|
import numpy as np |
|
|
|
@@ -57,14 +57,20 @@ class EmbeddingOpParallelConfig(_Config): |
|
|
|
EmbeddingOpParallelConfig for setting the data parallel or row slice for the embedding table.
|
|
|
|
|
|
|
Args: |
|
|
|
data_parallel (int): The data parallel way. Default: 1 |
|
|
|
model_parallel (int): The model parallel way. Default: 1 |
|
|
|
vocab_emb_dp (bool): Shard embedding in model parallel or data parallel. Default: True |
|
|
|
data_parallel(int): The data parallel way. The input data will be sliced into n parts for the embedding layer
|
|
|
according to this value. Default: 1.
|
|
|
model_parallel(int): The model parallel way. The embedding table parameters |
|
|
|
will be sliced according to the model parallel way. Default: 1. |
|
|
|
vocab_emb_dp(bool): Shard embedding in model parallel or data parallel. If True, the embedding lookup
|
|
|
will be trained in a data parallel style and the model_parallel value will be ignored. If False, the
|
|
|
embedding table will be sharded into n parts at the 0-th dimension (a row slice of the embedding table),
|
|
|
where n is the model parallel way determined by model_parallel. Default: True.
|
|
|
|
|
|
|
Supported Platforms: |
|
|
|
``Ascend`` ``GPU`` |
|
|
|
|
|
|
|
Examples: |
|
|
|
>>> from mindspore.parallel.nn import EmbeddingOpParallelConfig |
|
|
|
>>> config=EmbeddingOpParallelConfig(data_parallel=1, model_parallel=1, vocab_emb_dp=True) |
|
|
|
""" |
|
|
|
|
|
|
|
@@ -107,6 +113,7 @@ class EmbeddingOpParallelConfig(_Config): |
|
|
|
``Ascend`` ``GPU`` |
|
|
|
|
|
|
|
Examples: |
|
|
|
>>> from mindspore.parallel.nn import EmbeddingOpParallelConfig |
|
|
|
>>> config=EmbeddingOpParallelConfig(data_parallel=1, model_parallel=1, vocab_emb_dp=True) |
|
|
|
>>> parallel_config = config.dp_mp_config |
|
|
|
""" |
|
|
|
@@ -126,10 +133,12 @@ class TransformerRecomputeConfig(_Config): |
|
|
|
mp_comm_recompute (bool): Specifies whether the model parallel communication operators |
|
|
|
in the cell are recomputed in auto parallel or semi auto parallel mode. Default: True. |
|
|
|
recompute_slice_activation (bool): Slice the cell output which would remain in memory. Default: False.
|
|
|
|
|
|
|
Supported Platforms: |
|
|
|
``Ascend`` ``GPU`` |
|
|
|
|
|
|
|
Examples: |
|
|
|
>>> from mindspore.parallel.nn import TransformerRecomputeConfig |
|
|
|
>>> config=TransformerRecomputeConfig(recompute=True, parallel_optimizer_comm_recompute=True, |
|
|
|
...     mp_comm_recompute=True, recompute_slice_activation=True)
|
|
|
""" |
|
|
|
@@ -181,25 +190,30 @@ class TransformerRecomputeConfig(_Config): |
|
|
|
Validator.check_bool(value, "recompute_slice_activation") |
|
|
|
self._recompute_slice_activation = value |
|
|
|
|
|
|
|
|
|
|
|
_DEFALUT_TRANSFORMER_RECOMPUTE_CONFIG = TransformerRecomputeConfig() |
|
|
|
|
|
|
|
|
|
|
|
class TransformerOpParallelConfig(_Config): |
|
|
|
r""" |
|
|
|
TransformerOpParallelConfig for setting the global data parallel, model parallel and fusion group.
|
|
|
The parallel configuration setting.
|
|
|
|
|
|
|
Note: |
|
|
|
Except the recompute argument, other arguments will not be effective when the user doesn't set |
|
|
|
Except the recompute argument, other arguments will **not** be effective when the user doesn't set |
|
|
|
auto_parallel_context to `SEMI_AUTO_PARALLEL` or `AUTO_PARALLEL`. |
|
|
|
The micro_batch_num must be greater than or equal to pipeline_stage. The data_parallel\*model_parallel |
|
|
|
\*pipeline_stage must be equal or less equal to the device. When setting the pipeline stage and |
|
|
|
optimizer_shard, the config will overwrite the auto_parallel_context. |
|
|
|
|
|
|
|
Args: |
|
|
|
data_parallel (int): The data parallel way. Default: 1. |
|
|
|
model_parallel (int): The model parallel way. Default: 1. |
|
|
|
The micro_batch_num must be greater than or equal to pipeline_stage when training. |
|
|
|
The data_parallel\*model_parallel\*pipeline_stage must be less than or equal to the number of devices. When setting
|
|
|
the pipeline stage and optimizer_shard, the config will overwrite the auto_parallel_context. For example, given
|
|
|
8 devices with data_parallel 1 and model_parallel 1, the calculation will be repeated on each
|
|
|
device. |
|
|
|
|
|
|
|
Args: |
|
|
|
data_parallel (int): The data parallel way. The input data will be sliced for each layer. Default: 1. |
|
|
|
model_parallel (int): The model parallel way. The parameters of dense layers in MultiheadAttention and |
|
|
|
FeedForward layer will be sliced according to the model parallel way. Default: 1. |
|
|
|
pipeline_stage (int): The number of the pipeline stage. Should be a positive value. Default: 1. |
|
|
|
micro_batch_num (int): The microe size of the batches for the pipeline training. Default: 1. |
|
|
|
micro_batch_num (int): The number of micro batches for pipeline training. Default: 1.
|
|
|
optimizer_shard (bool): Whether to enable optimizer shard. Default False. |
|
|
|
gradient_aggregation_group (int): The fusion group size of the optimizer state sharding. Default: 4. |
|
|
|
recompute (Union[TransformerRecomputeConfig, bool]): The configuration of recomputation for |
|
|
|
@@ -210,6 +224,7 @@ class TransformerOpParallelConfig(_Config): |
|
|
|
``Ascend`` ``GPU`` |
|
|
|
|
|
|
|
Examples: |
|
|
|
>>> from mindspore.parallel.nn import TransformerRecomputeConfig |
|
|
|
>>> recompute_config=TransformerRecomputeConfig(recompute=True, parallel_optimizer_comm_recompute=True, |
|
|
|
...     mp_comm_recompute=True, recompute_slice_activation=True)
|
|
|
>>> config=TransformerOpParallelConfig(data_parallel=1, model_parallel=1, recompute=recompute_config) |
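As a rough illustration of the sizing note above, on a hypothetical 8-device setup the factors could be chosen so that data_parallel\*model_parallel\*pipeline_stage equals 8 while micro_batch_num stays at or above pipeline_stage; the numbers are illustrative only, and recompute is passed as a plain bool, which the Union-typed argument above appears to allow:

>>> from mindspore.parallel.nn import TransformerOpParallelConfig
>>> # 2 * 2 * 2 = 8 devices, and micro_batch_num (4) >= pipeline_stage (2).
>>> config = TransformerOpParallelConfig(data_parallel=2, model_parallel=2, pipeline_stage=2,
...                                      micro_batch_num=4, recompute=True)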
|
|
|
@@ -321,6 +336,7 @@ class TransformerOpParallelConfig(_Config): |
|
|
|
``Ascend`` ``GPU`` |
|
|
|
|
|
|
|
Examples: |
|
|
|
>>> from mindspore.parallel.nn import TransformerOpParallelConfig |
|
|
|
>>> config=TransformerOpParallelConfig(data_parallel=1, model_parallel=1, vocab_emb_dp=True) |
|
|
|
>>> parallel_config = config.dp_mp_config |
|
|
|
""" |
|
|
|
@@ -344,19 +360,19 @@ class FeedForward(Cell): |
|
|
|
where the :math:`W_1, W_2, b_1` and :math:`b_2` are trainable parameters. |
|
|
|
|
|
|
|
Args: |
|
|
|
hidden_size (int): The dimension of the inputs. |
|
|
|
ffn_hidden_size (int): The intermediate hidden size. |
|
|
|
dropout_rate (float): The dropout rate for the second linear's output. |
|
|
|
hidden_act (str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
expert_num (int): The number of experts used in Linear. For the case expert_num > 1, BatchMatMul is used |
|
|
|
hidden_size (int): The dimension of the inputs.
|
|
|
ffn_hidden_size (int): The intermediate hidden size.
|
|
|
dropout_rate (float): The dropout rate for the second linear's output.
|
|
|
hidden_act (str): The activation of the internal feedforward layer. Supports 'relu',
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
expert_num (int): The number of experts used in Linear. For the case expert_num > 1, BatchMatMul is used
|
|
|
and the first dimension in BatchMatMul indicates expert_num. Default: 1.
|
|
|
param_init_type (dtype.Number): The parameter initialization type. Should be dtype.float32 or dtype.float16. |
|
|
|
Default: dtype.float32. |
|
|
|
param_init_type (dtype.Number): The parameter initialization type. Should be dtype.float32 or dtype.float16.
|
|
|
Default: dtype.float32. |
|
|
|
parallel_config(OpParallelConfig): The config of parallel setting, see `OpParallelConfig`. |
|
|
|
Default `default_dpmp_config`, an instance of `OpParallelConfig` with |
|
|
|
default args. |
|
|
|
Default `default_dpmp_config`, an instance of `OpParallelConfig` with |
|
|
|
default args. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
- **x** (Tensor) - should be `[batch, seq_length, hidden_size] or [batch * seq_length, hidden_size]`. |
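A plausible end-to-end call, with purely illustrative sizes (the projection widths and batch shape are not from the original text):

>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore import dtype as mstype
>>> from mindspore.parallel.nn import FeedForward
>>> # Illustrative sizes: project 15 -> 30 -> 15 with the default gelu activation.
>>> model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1)
>>> x = Tensor(np.ones((2, 20, 15)), mstype.float32)
>>> output = model(x)  # expected to keep the [batch, seq_length, hidden_size] shape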
|
|
|
@@ -474,7 +490,7 @@ class FeedForward(Cell): |
|
|
|
class AttentionMask(Cell): |
|
|
|
r""" |
|
|
|
Get the lower triangular matrix from the input mask. The input mask is a 2D tensor (batch_size, seq_length)
|
|
|
with 1 and 0. 1 indicates the current position is a valid token, otherwise not. |
|
|
|
with 1 and 0, where 1 indicates the current position is a valid token, otherwise not. |
|
|
|
|
|
|
|
Args: |
|
|
|
seq_length(int): The sequence length of the input tensor. |
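A short sketch of the intended use, assuming a single sequence of length 4 whose last position is padding; the values are illustrative:

>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore.parallel.nn import AttentionMask
>>> mask_gen = AttentionMask(seq_length=4)
>>> input_mask = Tensor(np.array([[1, 1, 1, 0]], np.float32))
>>> # The result should be a (1, 4, 4) mask that is lower triangular over the valid tokens.
>>> attention_mask = mask_gen(input_mask)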
|
|
|
@@ -548,18 +564,23 @@ class AttentionMask(Cell): |
|
|
|
class VocabEmbedding(Cell): |
|
|
|
""" |
|
|
|
The embedding lookup table from the 0-th dim of the parameter table. When the parallel_config.vocab_emb_dp is |
|
|
|
True and in the `AUTO_PARALLEL_MODE`, the embedding lookup will be a `parallel_config.data_parallel` |
|
|
|
data parallel way, or will shard the parameter at the 0-th dimension in `parallel_config.model_parallel`, so-called |
|
|
|
row slice of the embedding table. |
|
|
|
True and in the `AUTO_PARALLEL` mode, the embedding lookup will be trained in the data parallel way, as the
|
|
|
parameters will be repeated on each device. If false, the embedding table will be sharded into n parts at the 0-th |
|
|
|
dimension of the embedding table, where n is the model parallel way determined by the
|
|
|
parallel_config(EmbeddingOpParallelConfig). |
|
|
|
|
|
|
|
Note: |
|
|
|
When `AUTO_PARALLEL` mode is enabled, this layer supports only 2-d inputs, as the shard is designed
|
|
|
for 2d inputs. |
|
|
|
|
|
|
|
Args: |
|
|
|
vocab_size (int): Size of the dictionary of embeddings. |
|
|
|
embedding_size (int): The size of each embedding vector. |
|
|
|
param_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the embedding_table. |
|
|
|
Refer to class `initializer` for the values of string when a string |
|
|
|
is specified. Default: 'normal'. |
|
|
|
vocab_size (int): Size of the dictionary of embeddings.
|
|
|
embedding_size (int): The size of each embedding vector.
|
|
|
parallel_config(EmbeddingOpParallelConfig): The parallel config of network. Default |
|
|
|
`default_embedding_parallel_config`, an instance of `EmbeddingOpParallelConfig` with default args. |
|
|
|
param_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the embedding_table.
|
|
|
Refer to class `initializer` for the values of string when a string |
|
|
|
is specified. Default: 'normal'. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
**input_ids** (Tensor) - The tokenized inputs of datatype int32 with shape (batch_size, seq_length)
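One plausible lookup call under the default data parallel setting; the vocabulary and embedding sizes are illustrative, and the embedding table is assumed to be returned alongside the output:

>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore import dtype as mstype
>>> from mindspore.parallel.nn import VocabEmbedding
>>> model = VocabEmbedding(vocab_size=30, embedding_size=30)
>>> input_ids = Tensor(np.ones([20, 15]), mstype.int32)
>>> # Expected: an output of shape (20, 15, 30) and the (30, 30) embedding table.
>>> output, embedding_table = model(input_ids)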
|
|
|
@@ -642,14 +663,14 @@ class MultiHeadAttention(Cell): |
|
|
|
tgt_seq_length(int): The sequence length of the key and value vector. |
|
|
|
hidden_size(int): The hidden size of the input. |
|
|
|
num_heads(int): The number of the heads. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1 |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1 |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. |
|
|
|
compute_dtype(dtype.Number): The computation type of dense. Default dtype.float16. |
|
|
|
Should be dtype.float32 or dtype.float16. |
|
|
|
param_init_type(dtype.Number): The parameter initialization type of the module. Default dtype.float32. |
|
|
|
Should be dtype.float32 or dtype.float16. |
|
|
|
softmax_compute_type(dtype.Number): The type of softmax computation module. Default dtype.float32. |
|
|
|
Should be dtype.float32 or dtype.float16. |
|
|
|
param_init_type(dtype.Number): The parameter initialization type of the module. Default dtype.float32. |
|
|
|
Should be dtype.float32 or dtype.float16. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. For example, if we have two |
|
|
|
words and want to generate ten more words, we just need to compute the two words' state only once,
|
|
|
and then generate the next word one by one. When use_past is True, there are two steps to run the prediction.
|
|
|
@@ -658,7 +679,7 @@ class MultiHeadAttention(Cell): |
|
|
|
is_first_iteration to be False by `model.add_flags_recursive(is_first_iteration=False)`. At this moment, |
|
|
|
pass the single step's input tensor, and loop it. Default False. |
|
|
|
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`, |
|
|
|
an instance of `OpParallelConfig` with default args. |
|
|
|
an instance of `OpParallelConfig` with default args. |
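Before the input description below, a sketch of a basic (non-incremental) call; the sizes are illustrative, and the self-attention pattern simply reuses the same tensor for query, key and value:

>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore import dtype as mstype
>>> from mindspore.parallel.nn import MultiHeadAttention
>>> model = MultiHeadAttention(batch_size=2, hidden_size=15, src_seq_length=20,
...                            tgt_seq_length=20, num_heads=3)
>>> from_tensor = Tensor(np.ones((2, 20, 15)), mstype.float32)
>>> attention_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
>>> # Self-attention: query, key and value all come from the same tensor.
>>> attn_out, past = model(from_tensor, from_tensor, from_tensor, attention_mask)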
|
|
|
|
|
|
|
Inputs: |
|
|
|
- **query_tensor** (Tensor) - the query vector with shape (batch_size, src_seq_length, hidden_size) or |
|
|
|
@@ -1117,21 +1138,21 @@ class TransformerEncoderLayer(Cell): |
|
|
|
Args: |
|
|
|
batch_size(int): The batch size of the input tensor. |
|
|
|
hidden_size(int): The hidden size of the input. |
|
|
|
seq_length(int): The input sequence length. |
|
|
|
ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. |
|
|
|
num_heads(int): The number of the heads. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1 |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1 |
|
|
|
seq_length(int): The input sequence length. |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
post_layernorm_residual(bool): Whether to apply the residual addition before the layernorm. Default False.
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention. |
|
|
|
Should be dtype.float32 or dtype.float16. Default mstype.float32. |
|
|
|
param_init_type(dtype.Number): The parameter initialization type of the module. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. For example, if we have two |
|
|
|
words and want to generate ten more words, we just need to compute the two words' state only once,
|
|
|
and then generate the next word one by one. When use_past is True, there are two steps to run the prediction.
|
|
|
@@ -1139,9 +1160,10 @@ class TransformerEncoderLayer(Cell): |
|
|
|
`model.add_flags_recursive(is_first_iteration=True)`, and pass the full inputs. Then, set the |
|
|
|
is_first_iteration to be False by `model.add_flags_recursive(is_first_iteration=False)`. At this moment, |
|
|
|
pass the single step's input tensor, and loop it. Default False. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig with |
|
|
|
default values. Please see `MoEConfig`. |
|
|
|
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`, |
|
|
|
an instance of `OpParallelConfig` with default args. |
|
|
|
an instance of `OpParallelConfig` with default args. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
- **x** (Tensor) - Float Tensor, shape should be [batch_size, seq_length, hidden_size] or |
|
|
|
@@ -1433,28 +1455,29 @@ class TransformerDecoderLayer(Cell): |
|
|
|
the cross attention will not be effective. |
|
|
|
|
|
|
|
Args: |
|
|
|
batch_size(int): The batch size of the input tensor. |
|
|
|
hidden_size(int): The hidden size of the input. |
|
|
|
src_seq_length(int): The input source sequence length. |
|
|
|
tgt_seq_length(int): The input target sequence length. |
|
|
|
ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. |
|
|
|
num_heads(int): The number of the heads. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
batch_size(int): The batch size of the input tensor. |
|
|
|
src_seq_length(int): The input source sequence length. |
|
|
|
tgt_seq_length(int): The input target sequence length. |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
post_layernorm_residual(bool): Whether to apply the residual addition before the layernorm. Default False.
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False. |
|
|
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention. |
|
|
|
Should be dtype.float32 or dtype.float16. Default mstype.float32. |
|
|
|
param_init_type(dtype.Number): The parameter initialization type of the module. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). |
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig with |
|
|
|
default values. Please see `MoEConfig`. |
|
|
|
parallel_config(OpParallelConfig): The parallel configure. Default `default_dpmp_config`, |
|
|
|
an instance of `OpParallelConfig` with default args. |
|
|
|
an instance of `OpParallelConfig` with default args. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
- **hidden_stats** (Tensor) - the input tensor with shape [batch_size, tgt_seq_length, hidden_size] or |
|
|
|
@@ -1837,25 +1860,18 @@ class TransformerEncoder(Cell): |
|
|
|
ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. |
|
|
|
seq_length(int): The seq_length of the input tensor. |
|
|
|
num_heads(int): The number of the heads. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1 |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1 |
|
|
|
post_layernorm_residual(bool): Whether to apply the residual addition before the layernorm. Default False.
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
post_layernorm_residual(bool): Whether to apply the residual addition before the layernorm. Default False.
|
|
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention. |
|
|
|
Should be dtype.float32 or dtype.float16. Default mstype.float32. |
|
|
|
param_init_type(dtype.Number): The parameter initialization type of the module. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. For example, if we have two |
|
|
|
words and want to generate the ten more words. We just need to compute the two words's state only once, |
|
|
|
and generate the next word one by one. When use_past is True, there are two steps to run the prediction. |
|
|
|
The first step, set the is_first_iteration to be True by |
|
|
|
`model.add_flags_recursive(is_first_iteration=True)`, and pass the full inputs. Then, set the |
|
|
|
is_first_iteration to be False by `model.add_flags_recursive(is_first_iteration=False)`. At this moment, |
|
|
|
pass the single step's input tensor, and loop it. Default False. |
|
|
|
lambda_func: A function that can determine the fusion index, pipeline stages and recompute attribute. If the user
|
|
|
wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function |
|
|
|
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)` |
|
|
|
@@ -1864,9 +1880,17 @@ class TransformerEncoder(Cell): |
|
|
|
default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`. |
|
|
|
offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not |
|
|
|
overlap with the encoder layer. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. For example, if we have two |
|
|
|
words and want to generate ten more words, we just need to compute the two words' state only once,
|
|
|
and then generate the next word one by one. When use_past is True, there are two steps to run the prediction.
|
|
|
The first step, set the is_first_iteration to be True by |
|
|
|
`model.add_flags_recursive(is_first_iteration=True)`, and pass the full inputs. Then, set the |
|
|
|
is_first_iteration to be False by `model.add_flags_recursive(is_first_iteration=False)`. At this moment, |
|
|
|
pass the single step's input tensor, and loop it. Default False. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig with |
|
|
|
default values. Please see `MoEConfig`. |
|
|
|
parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`, |
|
|
|
an instance of `TransformerOpParallelConfig` with default args. |
|
|
|
an instance of `TransformerOpParallelConfig` with default args. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
- **hidden_states** (Tensor) - Tensor, shape should be [batch_size, seq_length, hidden_size] or |
|
|
|
@@ -1874,7 +1898,7 @@ class TransformerEncoder(Cell): |
|
|
|
should be [batch_size, 1, hidden_size]. |
|
|
|
- **attention_mask** (Tensor) - Tensor, attention mask with shape [batch_size, seq_length, seq_length] |
|
|
|
- **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and |
|
|
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True |
|
|
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True. |
|
|
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size], the index of the past calculated state. Used
|
|
|
for incremental prediction when the use_past is True. Default None. |
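Putting the inputs above together, a basic (non-incremental) forward pass might look like this; the layer count and sizes are illustrative:

>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore import dtype as mstype
>>> from mindspore.parallel.nn import TransformerEncoder
>>> model = TransformerEncoder(batch_size=2, num_layers=2, hidden_size=8,
...                            ffn_hidden_size=64, seq_length=16, num_heads=2)
>>> hidden_states = Tensor(np.ones((2, 16, 8)), mstype.float32)
>>> attention_mask = Tensor(np.ones((2, 16, 16)), mstype.float16)
>>> # Expected: encoded states with shape (2, 16, 8) plus the per-layer key/value states.
>>> output, past = model(hidden_states, attention_mask)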
|
|
|
|
|
|
|
@@ -2041,27 +2065,25 @@ class TransformerDecoder(Cell): |
|
|
|
attention, cross attention and feedforward layer. |
|
|
|
|
|
|
|
Args: |
|
|
|
batch_size(int): The batch size of the input tensor. |
|
|
|
num_layers(int): The layers of the `TransformerDecoderLayer`. |
|
|
|
batch_size(int): The batch size of the input tensor. |
|
|
|
hidden_size(int): The hidden size of the input. |
|
|
|
ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. |
|
|
|
src_seq_length(int): The input source sequence length. |
|
|
|
tgt_seq_length(int): The input target sequence length. |
|
|
|
num_heads(int): The number of the heads. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
post_layernorm_residual(bool): Whether to apply the residual addition before the layernorm. Default False.
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
softmax_compute_type(dtype.Number): The computation type of the softmax in the attention. |
|
|
|
Should be dtype.float32 or dtype.float16. Default mstype.float32. |
|
|
|
param_init_type(dtype.Number): The parameter initialization type of the module. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not |
|
|
|
overlap with the encoder layer. |
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
lambda_func: A function that can determine the fusion index, pipeline stages and recompute attribute. If the user
|
|
|
wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function |
|
|
|
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)` |
|
|
|
@@ -2070,9 +2092,12 @@ class TransformerDecoder(Cell): |
|
|
|
default setting for the pipeline is: `(layer_id + offset) // (layers / pipeline_stage)`. |
|
|
|
Default: None. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). |
|
|
|
offset(int): The initial layer index for the `decoder`. Used for setting the fusion id and stage id, to not |
|
|
|
overlap with the encoder layer. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig with |
|
|
|
default values. Please see `MoEConfig`. |
|
|
|
parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`, |
|
|
|
an instance of `TransformerOpParallelConfig` with default args. |
|
|
|
an instance of `TransformerOpParallelConfig` with default args. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
- **hidden_stats** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size] or |
|
|
|
@@ -2085,7 +2110,7 @@ class TransformerDecoder(Cell): |
|
|
|
src_seq_length] where tgt_seq_length is the length of the decoder. Note that this arg can not be
|
|
|
None when the net is the outermost layer. Default None.
|
|
|
- **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and |
|
|
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True |
|
|
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True. |
|
|
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size], the index of the past calculated state.
|
|
|
Used for incremental prediction when the use_past is True. Default None. |
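A sketch of wiring a decoder stack to an encoder output, again with illustrative sizes; the assumed call order follows the inputs listed above (decoder hidden states, decoder attention mask, encoder output, memory mask):

>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore import dtype as mstype
>>> from mindspore.parallel.nn import TransformerDecoder
>>> model = TransformerDecoder(batch_size=2, num_layers=1, hidden_size=64, ffn_hidden_size=64,
...                            num_heads=2, src_seq_length=20, tgt_seq_length=10)
>>> encoder_output = Tensor(np.ones((2, 20, 64)), mstype.float32)
>>> decoder_input = Tensor(np.ones((2, 10, 64)), mstype.float32)
>>> decoder_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
>>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
>>> output, past = model(decoder_input, decoder_mask, encoder_output, memory_mask)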
|
|
|
|
|
|
|
@@ -2243,19 +2268,22 @@ class Transformer(Cell): |
|
|
|
The details can be found in `Attention is all you need <https://arxiv.org/pdf/1706.03762v5.pdf>`_. |
|
|
|
|
|
|
|
Note: |
|
|
|
This is an experimental interface that is subject to change and/or deletion. |
|
|
|
This is an experimental interface that is subject to change or deletion. |
|
|
|
|
|
|
|
Args: |
|
|
|
batch_size(int): The batch size of the input tensor. |
|
|
|
encoder_layers(int): The layers of the `TransformerEncoderLayer`. |
|
|
|
decoder_layers(int): The layers of the `TransformerDecoderLayer`. |
|
|
|
hidden_size(int): The hidden size of the input. |
|
|
|
batch_size(int): The batch size of the input tensor. |
|
|
|
ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. |
|
|
|
src_seq_length(int): The seq_length of the encoder's input tensor. |
|
|
|
tgt_seq_length(int): The seq_length of the decoder's input tensor. |
|
|
|
encoder_layers(int): The layers of the `TransformerEncoderLayer`. Default 3. |
|
|
|
decoder_layers(int): The layers of the `TransformerDecoderLayer`. Default 3. |
|
|
|
num_heads(int): The number of the heads. Default: 2. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1 |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1 |
|
|
|
attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. |
|
|
|
hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. |
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
post_layernorm_residual(bool): Whether to apply the residual addition before the layernorm. Default False.
|
|
|
layernorm_compute_type(dtype.Number): The computation type of the layernorm. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
@@ -2263,20 +2291,18 @@ class Transformer(Cell): |
|
|
|
Should be dtype.float32 or dtype.float16. Default mstype.float32. |
|
|
|
param_init_type(dtype.Number): The parameter initialization type of the module. |
|
|
|
Should be dtype.float32 or dtype.float16. Default dtype.float32. |
|
|
|
hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', |
|
|
|
'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', |
|
|
|
'hsigmoid', 'logsigmoid' and so on. Default: gelu. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). |
|
|
|
lambda_func: A function that can determine the fusion index, pipeline stages and recompute attribute. If the user
|
|
|
wants to determine the pipeline stage and gradient aggregation fusion, the user can pass a function |
|
|
|
that accepts `network`, `layer_id`, `offset`, `parallel_config`, `layers`. The `network(Cell)` |
|
|
|
represents the transformer block, `layer_id(int)` means the layer index for the current module, counts from |
|
|
|
zero, `offset(int)` means the layer_index needs an offset, if there are other modules in the net. The |
|
|
|
default setting for the pipeline is: `(layer_id + offset) // ((encoder_layers + decoder_length) |
|
|
|
default setting for the pipeline is: `(layer_id + offset) // ((encoder_layers + decoder_layers) |
|
|
|
/ pipeline_stage)`. |
|
|
|
use_past(bool): Use the past state to compute, used for incremental prediction. Default False. |
|
|
|
moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig with |
|
|
|
default values. Please see `MoEConfig`. |
|
|
|
parallel_config(TransformerOpParallelConfig): The parallel configure. Default `default_transformer_config`, |
|
|
|
an instance of `TransformerOpParallelConfig` with default args. |
|
|
|
an instance of `TransformerOpParallelConfig` with default args. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
- **encoder_inputs** (Tensor) - the input tensor with shape [batch_size, seq_length, hidden_size] or |
|
|
|
@@ -2291,12 +2317,12 @@ class Transformer(Cell): |
|
|
|
where tgt_seq_length is the length of the decoder. The output of the encoder with shape [batch_size,
|
|
|
seq_length, hidden_size]; this should be None if the decoder layer is 0.
|
|
|
- **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and |
|
|
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True |
|
|
|
past value parameter used in the incremental prediction. Only valid when use_past is True. Default True. |
|
|
|
- **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size], the index of the past calculated state. Used
|
|
|
for incremental prediction when the use_past is True. Default None. |
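Combining the pieces, a minimal whole-model sketch with one encoder layer and two decoder layers (all sizes illustrative); the three returned items match the tuple described under Outputs below:

>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore import dtype as mstype
>>> from mindspore.parallel.nn import Transformer
>>> model = Transformer(batch_size=2, encoder_layers=1, decoder_layers=2, hidden_size=64,
...                     ffn_hidden_size=64, src_seq_length=20, tgt_seq_length=10)
>>> encoder_input = Tensor(np.ones((2, 20, 64)), mstype.float32)
>>> encoder_mask = Tensor(np.ones((2, 20, 20)), mstype.float16)
>>> decoder_input = Tensor(np.ones((2, 10, 64)), mstype.float32)
>>> decoder_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
>>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
>>> output, encoder_past, decoder_past = model(encoder_input, encoder_mask, decoder_input,
...                                            decoder_mask, memory_mask)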
|
|
|
|
|
|
|
Outputs: |
|
|
|
Tuple, a tuple contains(`output`, `encoder_layer_present`, `encoder_layer_present`) |
|
|
|
Tuple, a tuple containing (`output`, `encoder_layer_present`, `decoder_layer_present`)
|
|
|
|
|
|
|
- **output** (Tensor) - If there is only encoder, the output logit of the encoder layer. The shape is |
|
|
|
[batch, src_seq_length, hidden_size] or [batch * src_seq_length, hidden_size], if there are encoder and |
|
|
|
|