Merge pull request !6514 from panfengfeng/fix_api
@@ -614,7 +614,7 @@ class Cell(Cell_):
 """
 Defines the computation to be performed.
-This method should be overridden by all subclasses.
+This method must be overridden by all subclasses.
 Note:
     The inputs of the top cell only allow Tensor.
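For orientation, a minimal sketch of overriding `construct` in a `Cell` subclass (the layer and names here are illustrative, not from the patch):
>>> import mindspore.nn as nn
>>> class MyNet(nn.Cell):
...     def __init__(self):
...         super(MyNet, self).__init__()
...         self.dense = nn.Dense(3, 4)
...     def construct(self, x):
...         # the overridden computation; inputs to a top cell must be Tensors
...         return self.dense(x)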
@@ -748,7 +748,7 @@ class Cell(Cell_):
 Yields parameters of this cell. If `expand` is True, yield parameters of this cell and all subcells.
 Args:
-    expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, only yield parameters
+    expand (bool): If true, yields parameters of this cell and all subcells. Otherwise, only yield parameters
         that are direct members of this cell. Default: True.
 Examples:
@@ -775,7 +775,7 @@ class Cell(Cell_):
 Args:
     name_prefix (str): Namespace. Default: ''.
-    expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, only yield parameters
+    expand (bool): If true, yields parameters of this cell and all subcells. Otherwise, only yield parameters
         that are direct members of this cell. Default: True.
 Examples:
@@ -990,7 +990,7 @@ class Cell(Cell_):
 Set the cell backward hook function. Note that this function is only supported in Pynative Mode.
 Note:
-    fn should be defined as the following code. `cell_name` is the name of registered cell.
+    fn must be defined as the following code. `cell_name` is the name of registered cell.
     `grad_input` is gradient passed to the cell. `grad_output` is the gradient computed and passed to the
     next cell or primitive, which may be modified and returned.
     >>> hook_fn(cell_name, grad_input, grad_output) -> Tensor or None
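A sketch of a conforming hook under the signature shown above (PyNative mode only; returning None leaves the gradient untouched, returning a Tensor replaces it):
>>> import mindspore.nn as nn
>>> from mindspore import context
>>> context.set_context(mode=context.PYNATIVE_MODE)
>>> def hook_fn(cell_name, grad_input, grad_output):
...     print(cell_name)  # name of the registered cell
...     return None       # or a modified gradient Tensor
>>> net = nn.ReLU()
>>> net.register_backward_hook(hook_fn)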
@@ -90,7 +90,7 @@ def exponential_decay_lr(learning_rate, decay_rate, total_step, step_per_epoch,
     total_step (int): The total number of steps.
     step_per_epoch (int): The number of steps per epoch.
     decay_epoch (int): A value used to calculate decayed learning rate.
-    is_stair (bool): If True, learning rate is decayed once every `decay_epoch` times. Default: False.
+    is_stair (bool): If true, learning rate is decayed once every `decay_epoch` times. Default: False.
 Returns:
     list[float]. The size of list is `total_step`.
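A worked example of the staircase behavior, assuming the signature in the hunk header: with `is_stair=True` the rate decays once per `decay_epoch` epochs, i.e. lr[i] = learning_rate * decay_rate ** (epoch(i) // decay_epoch):
>>> from mindspore.nn import exponential_decay_lr
>>> exponential_decay_lr(1.0, 0.5, total_step=6, step_per_epoch=2, decay_epoch=1, is_stair=True)
[1.0, 1.0, 0.5, 0.5, 0.25, 0.25]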
@@ -132,7 +132,7 @@ def natural_exp_decay_lr(learning_rate, decay_rate, total_step, step_per_epoch,
     total_step (int): The total number of steps.
     step_per_epoch (int): The number of steps per epoch.
     decay_epoch (int): A value used to calculate decayed learning rate.
-    is_stair (bool): If True, learning rate is decayed once every `decay_epoch` times. Default: False.
+    is_stair (bool): If true, learning rate is decayed once every `decay_epoch` times. Default: False.
 Returns:
     list[float]. The size of list is `total_step`.
@@ -175,7 +175,7 @@ def inverse_decay_lr(learning_rate, decay_rate, total_step, step_per_epoch, deca
     total_step (int): The total number of steps.
     step_per_epoch (int): The number of steps per epoch.
     decay_epoch (int): A value used to calculate decayed learning rate.
-    is_stair (bool): If True, learning rate is decayed once every `decay_epoch` times. Default: False.
+    is_stair (bool): If true, learning rate is decayed once every `decay_epoch` times. Default: False.
 Returns:
     list[float]. The size of list is `total_step`.
@@ -283,7 +283,7 @@ def polynomial_decay_lr(learning_rate, end_learning_rate, total_step, step_per_e
     total_step (int): The total number of steps.
     step_per_epoch (int): The number of steps per epoch.
     decay_epoch (int): A value used to calculate decayed learning rate.
-    power (float): A value used to calculate decayed learning rate. This parameter should be greater than 0.
+    power (float): A value used to calculate decayed learning rate. This parameter must be greater than 0.
     update_decay_epoch (bool): If true, update `decay_epoch`. Default: False.
 Returns:
@@ -106,11 +106,11 @@ class MinimumGrad(GraphKernel):
 """
 Backprop function for Minimum operator.
-Compares x and y elementwise, dout should has the same shape with x and y.
+Compares x and y elementwise, dout must have the same shape as x and y.
 Inputs:
     - **x** (Tensor) - The first input
-    - **y** (Tensor) - x and y should have same shape
+    - **y** (Tensor) - x and y must have the same shape
     - **dout** (Tensor) - Has the same shape as x and y, next operator's backprop output
 Outputs:
@@ -274,7 +274,7 @@ class EqualCount(GraphKernel):
 """
 Computes the number of the same elements of two tensors.
-The two input tensors should have the same shape and data type.
+The two input tensors must have the same shape and data type.
 Inputs:
     x (Tensor): the first input tensor.
@@ -309,8 +309,8 @@ class ReduceMean(GraphKernel):
 The dtype of the tensor to be reduced is number.
 Args:
-    keep_dims (bool): If True, keep these reduced dimensions and the length is 1.
-        If False, don't keep these dimensions. Default: False.
+    keep_dims (bool): If true, keep these reduced dimensions and the length is 1.
+        If false, don't keep these dimensions. Default: False.
 Inputs:
     - **input_x** (Tensor[Number]) - The input tensor.
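What `keep_dims` changes, sketched with the analogous `ReduceMean` primitive from `mindspore.ops.operations` (the shapes are the point here, not the values):
>>> import numpy as np
>>> from mindspore import Tensor
>>> import mindspore.ops.operations as P
>>> x = Tensor(np.ones((2, 3, 4)).astype(np.float32))
>>> P.ReduceMean(keep_dims=True)(x, 1).shape   # (2, 1, 4): axis 1 kept with length 1
>>> P.ReduceMean(keep_dims=False)(x, 1).shape  # (2, 4): axis 1 removed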
@@ -1000,10 +1000,10 @@ class LayerNorm(Cell):
 normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axis
     `begin_norm_axis ... R - 1`.
 begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
-    `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
+    `begin_norm_axis: rank(inputs)`, the value must be in [-1, rank(input)). Default: -1.
 begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters
     will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
-    the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
+    the normalized inputs accordingly, the value must be in [-1, rank(input)). Default: -1.
 gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
     The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
     'he_uniform', etc. Default: 'ones'.
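An illustrative use of the two axes (assuming the `nn.LayerNorm` constructor shown in the hunk header): for an NCHW input, normalizing over the last three dimensions means `begin_norm_axis=1`:
>>> import numpy as np
>>> from mindspore import Tensor
>>> import mindspore.nn as nn
>>> x = Tensor(np.ones([2, 3, 4, 4]).astype(np.float32))
>>> m = nn.LayerNorm(normalized_shape=(3, 4, 4), begin_norm_axis=1, begin_params_axis=1)
>>> m(x).shape  # (2, 3, 4, 4); gamma and beta have shape (3, 4, 4)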
@@ -279,7 +279,7 @@ class ClipByNorm(Cell):
 where :math:`L_2(X)` is the :math:`L_2`-norm of :math:`X`.
 Inputs:
-    - **input** (Tensor) - Tensor of shape N-D. The type should be float32 or float16.
+    - **input** (Tensor) - Tensor of shape N-D. The type must be float32 or float16.
     - **clip_norm** (Tensor) - A scalar Tensor of shape :math:`()` or :math:`(1)`.
 Outputs:
@@ -336,7 +336,7 @@ class Norm(Cell):
 Args:
     axis (Union[tuple, int]): The axis over which to compute vector norms. Default: ().
-    keep_dims (bool): If True, the axis indicated in `axis` are kept with size 1. Otherwise,
+    keep_dims (bool): If true, the axis indicated in `axis` are kept with size 1. Otherwise,
         the dimensions in `axis` are removed from the output shape. Default: False.
 Inputs:
@@ -507,12 +507,12 @@ class Unfold(Cell):
 The input tensor must be a 4-D tensor and the data format is NCHW.
 Args:
-    ksizes (Union[tuple[int], list[int]]): The size of sliding window, should be a tuple or a list of integers,
+    ksizes (Union[tuple[int], list[int]]): The size of sliding window, must be a tuple or a list of integers,
         and the format is [1, ksize_row, ksize_col, 1].
     strides (Union[tuple[int], list[int]]): Distance between the centers of the two consecutive patches,
-        should be a tuple or list of int, and the format is [1, stride_row, stride_col, 1].
+        must be a tuple or list of int, and the format is [1, stride_row, stride_col, 1].
     rates (Union[tuple[int], list[int]]): In each extracted patch, the gap between the corresponding dimension
-        pixel positions, should be a tuple or a list of integers, and the format is [1, rate_row, rate_col, 1].
+        pixel positions, must be a tuple or a list of integers, and the format is [1, rate_row, rate_col, 1].
     padding (str): The type of padding algorithm, is a string whose value is "same" or "valid",
         not case sensitive. Default: "valid".
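A minimal sketch of the [1, row, col, 1] parameter format described above (values chosen for illustration):
>>> import numpy as np
>>> from mindspore import Tensor
>>> import mindspore.nn as nn
>>> net = nn.Unfold(ksizes=[1, 2, 2, 1], strides=[1, 1, 1, 1], rates=[1, 1, 1, 1], padding="valid")
>>> image = Tensor(np.ones([1, 1, 3, 3]).astype(np.float16))
>>> out = net(image)  # extracts 2x2 patches from the 3x3 NCHW input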
@@ -575,7 +575,7 @@ class MatrixDiag(Cell):
 float32, float16, int32, int8, and uint8.
 Outputs:
-    Tensor, has the same type as input `x`. The shape should be x.shape + (x.shape[-1], ).
+    Tensor, has the same type as input `x`. The shape must be x.shape + (x.shape[-1], ).
 Examples:
     >>> x = Tensor(np.array([1, -1]), mstype.float32)
@@ -606,7 +606,7 @@ class MatrixDiagPart(Cell):
 float32, float16, int32, int8, and uint8.
 Outputs:
-    Tensor, has the same type as input `x`. The shape should be x.shape[:-2] + [min(x.shape[-2:])].
+    Tensor, has the same type as input `x`. The shape must be x.shape[:-2] + [min(x.shape[-2:])].
 Examples:
     >>> x = Tensor([[[-1, 0], [0, 1]], [[-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32)
@@ -160,7 +160,7 @@ class Conv2d(_Conv):
 must be 0.
 - pad: Implicit paddings on both sides of the input. The number of `padding` will be padded to the input
-    Tensor borders. `padding` should be greater than or equal to 0.
+    Tensor borders. `padding` must be greater than or equal to 0.
 padding (Union[int, tuple[int]]): Implicit paddings on both sides of the input. If `padding` is one integer,
     the paddings of top, bottom, left and right are the same, equal to padding. If `padding` is a tuple
@@ -168,10 +168,10 @@ class Conv2d(_Conv):
     padding[1], padding[2], and padding[3] accordingly. Default: 0.
 dilation (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the dilation rate
     to use for dilated convolution. If set to be :math:`k > 1`, there will
-    be :math:`k - 1` pixels skipped for each sampling location. Its value should
+    be :math:`k - 1` pixels skipped for each sampling location. Its value must
     be greater or equal to 1 and bounded by the height and width of the
     input. Default: 1.
-group (int): Split filter into groups, `in_channels` and `out_channels` should be
+group (int): Splits filter into groups, `in_channels` and `out_channels` must be
     divisible by the number of groups. If the group is equal to `in_channels` and `out_channels`,
     this 2D convolution layer also can be called 2D depthwise convolution layer. Default: 1.
 has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
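A small sketch exercising `dilation` and `group` together, the depthwise case where group equals the channel count (shapes illustrative):
>>> import numpy as np
>>> from mindspore import Tensor
>>> import mindspore.nn as nn
>>> net = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=3, dilation=2, group=4)
>>> x = Tensor(np.ones([1, 4, 16, 16]).astype(np.float32))
>>> out = net(x)  # pad_mode defaults to "same", so the output stays 1x4x16x16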
@@ -239,7 +239,7 @@ class Conv2d(_Conv):
 self.bias_add = P.BiasAdd()
 def _init_depthwise_conv2d(self):
-    """Init depthwise conv2d op"""
+    """Initialize depthwise conv2d op"""
     if context.get_context("device_target") == "Ascend" and self.group > 1:
         self.dilation = self._dilation
         validator.check_integer('group', self.group, self.in_channels, Rel.EQ)
@@ -335,15 +335,15 @@ class Conv1d(_Conv):
 must be 0.
 - pad: Implicit paddings on both sides of the input. The number of `padding` will be padded to the input
-    Tensor borders. `padding` should be greater than or equal to 0.
+    Tensor borders. `padding` must be greater than or equal to 0.
 padding (int): Implicit paddings on both sides of the input. Default: 0.
 dilation (int): The data type is int. Specifies the dilation rate
     to use for dilated convolution. If set to be :math:`k > 1`, there will
-    be :math:`k - 1` pixels skipped for each sampling location. Its value should
+    be :math:`k - 1` pixels skipped for each sampling location. Its value must
     be greater or equal to 1 and bounded by the height and width of the
     input. Default: 1.
-group (int): Split filter into groups, `in_channels` and `out_channels` should be
+group (int): Splits filter into groups, `in_channels` and `out_channels` must be
     divisible by the number of groups. Default: 1.
 has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
 weight_init (Union[Tensor, str, Initializer, numbers.Number]): An initializer for the convolution kernel.
@@ -481,7 +481,7 @@ class Conv2dTranspose(_Conv):
 width of the kernel.
 stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
     the height and width of movement are both strides, or a tuple of two int numbers that
-    represent height and width of movement respectively. Its value should be equal to or greater than 1.
+    represent height and width of movement respectively. Its value must be equal to or greater than 1.
     Default: 1.
 pad_mode (str): Select the mode of the pad. The optional values are
     "pad", "same", "valid". Default: "same".
@@ -497,10 +497,10 @@ class Conv2dTranspose(_Conv):
     padding[1], padding[2], and padding[3] accordingly. Default: 0.
 dilation (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the dilation rate
     to use for dilated convolution. If set to be :math:`k > 1`, there will
-    be :math:`k - 1` pixels skipped for each sampling location. Its value should
+    be :math:`k - 1` pixels skipped for each sampling location. Its value must
     be greater than or equal to 1 and bounded by the height and width of the
     input. Default: 1.
-group (int): Split filter into groups, `in_channels` and `out_channels` should be
+group (int): Splits filter into groups, `in_channels` and `out_channels` must be
     divisible by the number of groups. This is not supported for Davinci devices when group > 1. Default: 1.
 has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
 weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
@@ -662,10 +662,10 @@ class Conv1dTranspose(_Conv):
 padding (int): Implicit paddings on both sides of the input. Default: 0.
 dilation (int): The data type is int. Specifies the dilation rate
     to use for dilated convolution. If set to be :math:`k > 1`, there will
-    be :math:`k - 1` pixels skipped for each sampling location. Its value should
+    be :math:`k - 1` pixels skipped for each sampling location. Its value must
     be greater or equal to 1 and bounded by the width of the
     input. Default: 1.
-group (int): Split filter into groups, `in_channels` and `out_channels` should be
+group (int): Splits filter into groups, `in_channels` and `out_channels` must be
     divisible by the number of groups. This is not supported for Davinci devices when group > 1. Default: 1.
 has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
 weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
@@ -36,7 +36,7 @@ class Embedding(Cell):
 the corresponding word embeddings.
 Note:
-    When 'use_one_hot' is set to True, the type of the input should be mindspore.int32.
+    When 'use_one_hot' is set to True, the type of the input must be mindspore.int32.
 Args:
     vocab_size (int): Size of the dictionary of embeddings.
@@ -49,7 +49,7 @@ class Embedding(Cell):
 Inputs:
     - **input** (Tensor) - Tensor of shape :math:`(\text{batch_size}, \text{input_length})`. The elements of
-        the Tensor should be integer and not larger than vocab_size. Otherwise the corresponding embedding vector will
+        the Tensor must be integers and not larger than vocab_size. Otherwise the corresponding embedding vector will
         be zero.
 Outputs:
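A brief sketch of the lookup (illustrative sizes; out-of-range indices map to zero vectors, as noted above):
>>> import numpy as np
>>> import mindspore
>>> import mindspore.nn as nn
>>> from mindspore import Tensor
>>> net = nn.Embedding(vocab_size=2000, embedding_size=16)
>>> ids = Tensor(np.array([[1, 3, 5], [7, 9, 11]]), mindspore.int32)
>>> out = net(ids)  # shape (2, 3, 16): one 16-dim vector per index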
@@ -120,7 +120,7 @@ class EmbeddingLookup(Cell):
 specified 'offset = 0' to lookup table.
 When 'target' is set to 'DEVICE', this module will use P.GatherV2() which
 specified 'axis = 0' to lookup table.
-In field slice mode, the manual_shapes should be given. It is a tuple ,where
+In field slice mode, the manual_shapes must be given. It is a tuple, where
 the element is vocab[i], vocab[i] is the row numbers for i-th
 part.
@@ -128,16 +128,16 @@ class EmbeddingLookup(Cell):
 vocab_size (int): Size of the dictionary of embeddings.
 embedding_size (int): The size of each embedding vector.
 param_init (str): The initialize way of embedding table. Default: 'normal'.
-target (str): Specify the target where the op is executed. The value should in
+target (str): Specifies the target where the op is executed. The value must be in
     ['DEVICE', 'CPU']. Default: 'CPU'.
-slice_mode (str): The slicing way in semi_auto_parallel/auto_parallel. The value should get through
+slice_mode (str): The slicing way in semi_auto_parallel/auto_parallel. The value must get through
     nn.EmbeddingLookup. Default: nn.EmbeddingLookup.BATCH_SLICE.
 manual_shapes (tuple): The accompaniment array in field slice mode.
 Inputs:
     - **input_indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`.
         Specifies the indices of elements of the original Tensor. Values can be out of range of embedding_table,
-        and the exceeding part will be filled with 0 in the output. Input_indices should only be a 2d tensor in
+        and the exceeding part will be filled with 0 in the output. Input_indices must only be a 2d tensor in
         this interface.
 Outputs:
@@ -193,8 +193,8 @@ class SSIM(Cell):
 k2 (float): The constant used to generate c2 in the contrast comparison function. Default: 0.03.
 Inputs:
-    - **img1** (Tensor) - The first image batch with format 'NCHW'. It should be the same shape and dtype as img2.
-    - **img2** (Tensor) - The second image batch with format 'NCHW'. It should be the same shape and dtype as img1.
+    - **img1** (Tensor) - The first image batch with format 'NCHW'. It must be the same shape and dtype as img2.
+    - **img2** (Tensor) - The second image batch with format 'NCHW'. It must be the same shape and dtype as img1.
 Outputs:
     Tensor, has the same dtype as img1. It is a 1-D tensor with shape N, where N is the batch num of img1.
@@ -267,8 +267,8 @@ class MSSSIM(Cell):
 k2 (float): The constant used to generate c2 in the contrast comparison function. Default: 0.03.
 Inputs:
-    - **img1** (Tensor) - The first image batch with format 'NCHW'. It should be the same shape and dtype as img2.
-    - **img2** (Tensor) - The second image batch with format 'NCHW'. It should be the same shape and dtype as img1.
+    - **img1** (Tensor) - The first image batch with format 'NCHW'. It must be the same shape and dtype as img2.
+    - **img2** (Tensor) - The second image batch with format 'NCHW'. It must be the same shape and dtype as img1.
 Outputs:
     Tensor, has the same dtype as img1. It is a 1-D tensor with shape N, where N is the batch num of img1.
@@ -352,8 +352,8 @@ class PSNR(Cell):
 Default: 1.0.
 Inputs:
-    - **img1** (Tensor) - The first image batch with format 'NCHW'. It should be the same shape and dtype as img2.
-    - **img2** (Tensor) - The second image batch with format 'NCHW'. It should be the same shape and dtype as img1.
+    - **img1** (Tensor) - The first image batch with format 'NCHW'. It must be the same shape and dtype as img2.
+    - **img2** (Tensor) - The second image batch with format 'NCHW'. It must be the same shape and dtype as img1.
 Outputs:
     Tensor, with dtype mindspore.float32. It is a 1-D tensor with shape N, where N is the batch num of img1.
@@ -78,7 +78,7 @@ class LSTM(Cell):
 - **input** (Tensor) - Tensor of shape (seq_len, batch_size, `input_size`).
 - **hx** (tuple) - A tuple of two Tensors (h_0, c_0) both of data type mindspore.float32 or
     mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
-    Data type of `hx` should be the same as `input`.
+    Data type of `hx` must be the same as `input`.
 Outputs:
     Tuple, a tuple contains (`output`, (`h_n`, `c_n`)).
@@ -208,7 +208,7 @@ class LSTMCell(Cell):
 mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
 - **c** - data type mindspore.float32 or
     mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
-    Data type of `h` and `c` should be the same as `input`.
+    Data type of `h` and `c` must be the same as `input`.
 Outputs:
     `output`, `h_n`, `c_n`, `reserve`, `state`.
@@ -36,8 +36,8 @@ class ReduceLogSumExp(Cell):
 The dtype of the tensor to be reduced is number.
 Args:
-    keep_dims (bool): If True, keep these reduced dimensions and the length is 1.
-        If False, don't keep these dimensions.
+    keep_dims (bool): If true, keep these reduced dimensions and the length is 1.
+        If false, don't keep these dimensions.
     Default: False.
 Inputs:
@@ -357,16 +357,16 @@ class MatMul(Cell):
 will be broadcasted and must be broadcastable.
 Args:
-    transpose_x1 (bool): If True, `a` is transposed before multiplication. Default: False.
-    transpose_x2 (bool): If True, `b` is transposed before multiplication. Default: False.
+    transpose_x1 (bool): If true, `a` is transposed before multiplication. Default: False.
+    transpose_x2 (bool): If true, `b` is transposed before multiplication. Default: False.
 Inputs:
     - **input_x1** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(*A, N, C)`,
         where :math:`*A` represents the batch size of `x1` which can be multidimensional.
-        If `transpose_a` is True, its shape should be :math:`(*A, N, C)` after transposing.
+        If `transpose_a` is True, its shape must be :math:`(*A, N, C)` after transposing.
     - **input_x2** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(*B, C, M)`,
         where :math:`*B` represents the batch size of `x2` which can be multidimensional.
-        If `transpose_b` is True, its shape should be :math:`(*B, C, M)` after transposing.
+        If `transpose_b` is True, its shape must be :math:`(*B, C, M)` after transposing.
 Outputs:
     Tensor, the shape of the output tensor is :math:`(*L, N, M)`. :math:`*L` is the batch size after broadcasting.
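A short sketch of the broadcasting batch matmul described above (shapes illustrative; assuming `nn.MatMul` is exported as the hunk header suggests):
>>> import numpy as np
>>> from mindspore import Tensor
>>> import mindspore.nn as nn
>>> net = nn.MatMul()
>>> x1 = Tensor(np.ones([3, 2, 3]).astype(np.float32))  # (*A, N, C)
>>> x2 = Tensor(np.ones([3, 4]).astype(np.float32))     # (C, M), broadcast over the batch
>>> net(x1, x2).shape  # (3, 2, 4)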
@@ -159,7 +159,7 @@ class AvgPool2d(_PoolNd):
 Args:
     kernel_size (Union[int, tuple[int]]): The size of kernel used to take the average value.
-        The data type of kernel_size should be int and the value represents the height and width,
+        The data type of kernel_size must be int and the value represents the height and width,
         or a tuple of two int numbers that represent height and width respectively.
         Default: 1.
     stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
@@ -65,14 +65,14 @@ class Conv2dBnAct(Cell):
 and width of the 2D convolution window. Single int means the value is for both height and width of
 the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
 width of the kernel.
-stride (int): Specifies stride for all spatial dimensions with the same value. The value of stride should be
+stride (int): Specifies stride for all spatial dimensions with the same value. The value of stride must be
     greater than or equal to 1 and lower than any one of the height and width of the input. Default: 1.
 pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
 padding (int): Implicit paddings on both sides of the input. Default: 0.
-dilation (int): Specifying the dilation rate to use for dilated convolution. If set to be :math:`k > 1`,
-    there will be :math:`k - 1` pixels skipped for each sampling location. Its value should be greater than
+dilation (int): Specifies the dilation rate to use for dilated convolution. If set to be :math:`k > 1`,
+    there will be :math:`k - 1` pixels skipped for each sampling location. Its value must be greater than
     or equal to 1 and lower than any one of the height and width of the input. Default: 1.
-group (int): Split filter into groups, `in_channels` and `out_channels` should be
+group (int): Splits filter into groups, `in_channels` and `out_channels` must be
     divisible by the number of groups. Default: 1.
 has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
 weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
@@ -85,14 +85,14 @@ class Conv2dBnAct(Cell):
 Initializer and string are the same as 'weight_init'. Refer to the values of
 Initializer for more details. Default: 'zeros'.
 has_bn (bool): Specifies whether to use batchnorm or not. Default: False.
-momentum (float): Momentum for moving average.Momentum value should be [0, 1].Default:0.9
+momentum (float): Momentum for moving average. Momentum value must be in [0, 1]. Default: 0.9.
 eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default:
     1e-5.
 activation (Cell): Specifies activation type. The optional values are as following:
     'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid',
     'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None.
 alpha (float): Slope of the activation function at x < 0. Default: 0.2.
-after_fake(bool): Determin whether there should be a fake quantization operation after Cond2dBnAct.
+after_fake (bool): Determine whether there must be a fake quantization operation after Conv2dBnAct.
 Inputs:
     - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@@ -176,7 +176,7 @@ class DenseBnAct(Cell):
 activation (string): Specifies activation type. The optional values are as following:
     'Softmax', 'LogSoftmax', 'ReLU', 'ReLU6', 'Tanh', 'GELU', 'Sigmoid',
     'PReLU', 'LeakyReLU', 'h-Swish', and 'h-Sigmoid'. Default: None.
-after_fake(bool): Determin whether there should be a fake quantization operation after DenseBnAct.
+after_fake (bool): Determine whether there must be a fake quantization operation after DenseBnAct.
 Inputs:
     - **input** (Tensor) - Tensor of shape :math:`(N, in\_channels)`.
@@ -227,7 +227,7 @@ class BatchNormFoldCell(Cell):
 Batch normalization folded.
 Args:
-    momentum (float): Momentum value should be in [0, 1]. Default: 0.9.
+    momentum (float): Momentum value must be in [0, 1]. Default: 0.9.
     epsilon (float): A small float number to avoid dividing by 0. 1e-5 if dtype in
         float32 else 1e-3. Default: 1e-5.
     freeze_bn (int): Delay in steps at which computation switches from regular batch
@@ -250,7 +250,7 @@ class BatchNormFoldCell(Cell):
 """
 def __init__(self, momentum=0.9, epsilon=1e-5, freeze_bn=0):
-    """init batch norm fold layer"""
+    """Initialize batch norm fold layer"""
     super(BatchNormFoldCell, self).__init__()
     self.epsilon = epsilon
     self.is_gpu = context.get_context('device_target') == "GPU"
@@ -323,7 +323,7 @@ class FakeQuantWithMinMax(Cell):
 symmetric=False,
 narrow_range=False,
 quant_delay=0):
-    """init FakeQuantWithMinMax layer"""
+    """Initialize FakeQuantWithMinMax layer"""
     super(FakeQuantWithMinMax, self).__init__()
     validator.check_type("min_init", min_init, [int, float])
     validator.check_type("max_init", max_init, [int, float])
@@ -470,7 +470,7 @@ class Conv2dBnFoldQuant(Cell):
 narrow_range=False,
 quant_delay=0,
 freeze_bn=100000):
-    """init Conv2dBnFoldQuant layer"""
+    """Initialize Conv2dBnFoldQuant layer"""
     super(Conv2dBnFoldQuant, self).__init__()
     self.in_channels = in_channels
     self.out_channels = out_channels
@@ -611,8 +611,8 @@ class Conv2dBnWithoutFoldQuant(Cell):
 stride (int): Specifies stride for all spatial dimensions with the same value. Default: 1.
 pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
 padding (int): Implicit paddings on both sides of the input. Default: 0.
-dilation (int): Specifying the dilation rate to use for dilated convolution. Default: 1.
-group (int): Split filter into groups, `in_channels` and `out_channels` should be
+dilation (int): Specifies the dilation rate to use for dilated convolution. Default: 1.
+group (int): Splits filter into groups, `in_channels` and `out_channels` must be
     divisible by the number of groups. Default: 1.
 has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
 eps (float): Parameters for BatchNormal. Default: 1e-5.
@@ -743,8 +743,8 @@ class Conv2dQuant(Cell):
 stride (int): Specifies stride for all spatial dimensions with the same value. Default: 1.
 pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
 padding (int): Implicit paddings on both sides of the input. Default: 0.
-dilation (int): Specifying the dilation rate to use for dilated convolution. Default: 1.
-group (int): Split filter into groups, `in_channels` and `out_channels` should be
+dilation (int): Specifies the dilation rate to use for dilated convolution. Default: 1.
+group (int): Splits filter into groups, `in_channels` and `out_channels` must be
     divisible by the number of groups. Default: 1.
 has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
 weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
@@ -32,10 +32,10 @@ class LearningRateSchedule(Cell):
 """
 Defines the computation to get the current learning rate.
-This method should be overridden by all subclasses.
+This method must be overridden by all subclasses.
 Note:
-    The output should be a Tensor of scalar.
+    The output must be a Tensor of scalar.
 Inputs:
     Tensor. The current step number.
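A minimal sketch of such a subclass (assuming the base class is importable from `mindspore.nn`; the constant schedule is just for illustration):
>>> import mindspore.nn as nn
>>> import mindspore.common.dtype as mstype
>>> from mindspore import Tensor
>>> class ConstantLR(nn.LearningRateSchedule):
...     def __init__(self, lr):
...         super(ConstantLR, self).__init__()
...         self.lr = Tensor(lr, mstype.float32)
...     def construct(self, global_step):
...         # must return a scalar Tensor; global_step arrives as a Tensor
...         return self.lr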
@@ -73,7 +73,7 @@ class ExponentialDecayLR(LearningRateSchedule):
 learning_rate (float): The initial value of learning rate.
 decay_rate (float): The decay rate.
 decay_steps (int): A value used to calculate decayed learning rate.
-is_stair (bool): If True, learning rate is decayed once every `decay_steps` time. Default: False.
+is_stair (bool): If true, learning rate is decayed once every `decay_steps` time. Default: False.
 Inputs:
     Tensor. The current step number.
@@ -127,7 +127,7 @@ class NaturalExpDecayLR(LearningRateSchedule):
 learning_rate (float): The initial value of learning rate.
 decay_rate (float): The decay rate.
 decay_steps (int): A value used to calculate decayed learning rate.
-is_stair (bool): If True, learning rate is decayed once every `decay_steps` time. Default: False.
+is_stair (bool): If true, learning rate is decayed once every `decay_steps` time. Default: False.
 Inputs:
     Tensor. The current step number.
@@ -292,8 +292,8 @@ class PolynomialDecayLR(LearningRateSchedule):
 learning_rate (float): The initial value of learning rate.
 end_learning_rate (float): The end value of learning rate.
 decay_steps (int): A value used to calculate decayed learning rate.
-power (float): A value used to calculate decayed learning rate. This parameter should be greater than 0.
-update_decay_steps (bool): If True, learning rate is decayed once every `decay_steps` time. Default: False.
+power (float): A value used to calculate decayed learning rate. This parameter must be greater than 0.
+update_decay_steps (bool): If true, learning rate is decayed once every `decay_steps` time. Default: False.
 Inputs:
     Tensor. The current step number.
@@ -83,9 +83,9 @@ class L1Loss(_Loss):
 Default: "mean".
 Inputs:
-    - **input_data** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`. The data type should be float16 or
+    - **input_data** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`. The data type must be float16 or
         float32.
-    - **target_data** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`. The data type should be float16 or
+    - **target_data** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`. The data type must be float16 or
         float32.
 Outputs:
@@ -344,14 +344,14 @@ class CosineEmbeddingLoss(_Loss):
 Args:
     margin (float): Should be in [-1.0, 1.0]. Default 0.0.
-    reduction (str): Specifies which reduction to be applied to the output. It should be one of
+    reduction (str): Specifies which reduction to be applied to the output. It must be one of
         "none", "mean", and "sum", meaning no reduction, reduce mean and sum on output, respectively. Default "mean".
 Inputs:
     - **input_x1** (Tensor) - Input tensor.
-    - **input_x2** (Tensor) - Its shape and data type should be the same as `input_x1`'s shape and data type.
+    - **input_x2** (Tensor) - Its shape and data type must be the same as `input_x1`'s shape and data type.
     - **y** (Tensor) - Contains value 1 or -1. Suppose the shape of `input_x1` is
-        :math:`(x_1, x_2, x_3,..., x_R)`, then the shape of `target` should be :math:`(x_1, x_3, x_4, ..., x_R)`.
+        :math:`(x_1, x_2, x_3,..., x_R)`, then the shape of `target` must be :math:`(x_1, x_3, x_4, ..., x_R)`.
 Outputs:
     - **loss** (Tensor) - If `reduction` is "none", its shape is the same as `y`'s shape, otherwise a scalar value
@@ -77,7 +77,7 @@ class EvaluationBase(Metric):
 An interface describes the behavior of clearing the internal evaluation result.
 Note:
-    All subclasses should override this interface.
+    All subclasses must override this interface.
 """
 raise NotImplementedError
@@ -86,7 +86,7 @@ class EvaluationBase(Metric):
 An interface describes the behavior of updating the internal evaluation result.
 Note:
-    All subclasses should override this interface.
+    All subclasses must override this interface.
 Args:
     inputs: The first item is predicted array and the second item is target array.
@@ -98,6 +98,6 @@ class EvaluationBase(Metric):
 An interface describes the behavior of computing the evaluation result.
 Note:
-    All subclasses should override this interface.
+    All subclasses must override this interface.
 """
 raise NotImplementedError
@@ -46,7 +46,7 @@ class Loss(Metric):
 Args:
     inputs: Inputs contain only one element, the element is loss. The dimension of
-        loss should be 0 or 1.
+        loss must be 0 or 1.
 Raises:
     ValueError: If the length of inputs is not 1.
@@ -85,7 +85,7 @@ class Metric(metaclass=ABCMeta):
 An interface describes the behavior of clearing the internal evaluation result.
 Note:
-    All subclasses should override this interface.
+    All subclasses must override this interface.
 """
 raise NotImplementedError('Must define clear function to use this base class')
@@ -95,7 +95,7 @@ class Metric(metaclass=ABCMeta):
 An interface describes the behavior of computing the evaluation result.
 Note:
-    All subclasses should override this interface.
+    All subclasses must override this interface.
 """
 raise NotImplementedError('Must define eval function to use this base class')
@@ -105,7 +105,7 @@ class Metric(metaclass=ABCMeta):
 An interface describes the behavior of updating the internal evaluation result.
 Note:
-    All subclasses should override this interface.
+    All subclasses must override this interface.
 Args:
     inputs: A variable-length input argument list.
@@ -34,7 +34,7 @@ class Precision(EvaluationBase):
 \text{precision} = \frac{\text{true_positive}}{\text{true_positive} + \text{false_positive}}
 Note:
-    In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` should be 0 or 1.
+    In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` must be 0 or 1.
 Args:
     eval_type (str): Metric to calculate accuracy over a dataset, for classification or
@@ -34,7 +34,7 @@ class Recall(EvaluationBase):
 \text{recall} = \frac{\text{true_positive}}{\text{true_positive} + \text{false_negative}}
 Note:
-    In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` should be 0 or 1.
+    In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` must be 0 or 1.
 Args:
     eval_type (str): Metric to calculate the recall over a dataset, for classification or
| @@ -166,10 +166,10 @@ class Adam(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys can be parsed. | "lr", "weight_decay" and "order_params" are the keys can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used. | - lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -177,16 +177,16 @@ class Adam(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay | - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" is in the keys, the value must be the order of parameters and | |||||
| the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters | the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters | ||||
| which in the 'order_params' should be in one of group parameters. | |||||
| that appear in 'order_params' must be in one of the parameter groups. | |||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then | When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then | ||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| Default: 1e-3. | Default: 1e-3. | ||||
| beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0). | beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0). | ||||
| @@ -201,7 +201,7 @@ class Adam(Optimizer): | |||||
| use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. | use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. | ||||
| If true, update the gradients using NAG. | If true, update the gradients using NAG. | ||||
| If false, update the gradients without using NAG. Default: False. | If false, update the gradients without using NAG. Default: False. | ||||
| weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. | |||||
| weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0. | |||||
| loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. | loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. | ||||
| Inputs: | Inputs: | ||||
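The group-parameter keys described above combine as in the following sketch, which assumes `net` is an already constructed Cell; the 'conv' name filter is purely illustrative:

>>> from mindspore import nn
>>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
>>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
...                 {'params': no_conv_params, 'lr': 0.01},
...                 {'order_params': net.trainable_params()}]
>>> optim = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)

Here the conv parameters use the default learning rate with weight decay 0.01, the remaining parameters use learning rate 0.01 without weight decay, and 'order_params' fixes the update order.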
| @@ -290,10 +290,10 @@ class AdamWeightDecay(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys that can be parsed. | "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used. | - lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -301,16 +301,16 @@ class AdamWeightDecay(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay | - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" is in the keys, the value must be the order of parameters and | |||||
| the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters | the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters | ||||
| which in the 'order_params' should be in one of group parameters. | |||||
| that appear in 'order_params' must be in one of the parameter groups. | |||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then | When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then | ||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| Default: 1e-3. | Default: 1e-3. | ||||
| beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9. | beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9. | ||||
| @@ -319,7 +319,7 @@ class AdamWeightDecay(Optimizer): | |||||
| Should be in range (0.0, 1.0). | Should be in range (0.0, 1.0). | ||||
| eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6. | eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6. | ||||
| Should be greater than 0. | Should be greater than 0. | ||||
| weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. | |||||
| weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0. | |||||
| Inputs: | Inputs: | ||||
| - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. | - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`. | ||||
| @@ -93,29 +93,29 @@ class FTRL(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys that can be parsed. | "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Using different learning rates for separate groups of parameters is currently not supported. | - lr: Using different learning rates for separate groups of parameters is currently not supported. | ||||
| - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay | - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| initial_accum (float): The starting value for accumulators, must be zero or a positive value. Default: 0.1. | initial_accum (float): The starting value for accumulators, must be zero or a positive value. Default: 0.1. | ||||
| learning_rate (float): The learning rate value, should be zero or positive, dynamic learning rate is currently | |||||
| learning_rate (float): The learning rate value, which must be zero or positive; dynamic learning rate is currently | |||||
| not supported. Default: 0.001. | not supported. Default: 0.001. | ||||
| lr_power (float): Learning rate power controls how the learning rate decreases during training, must be less | lr_power (float): Learning rate power controls how the learning rate decreases during training, must be less | ||||
| than or equal to zero. Use fixed learning rate if lr_power is zero. Default: -0.5. | than or equal to zero. Use fixed learning rate if lr_power is zero. Default: -0.5. | ||||
| l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0. | l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0. | ||||
| l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0. | l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0. | ||||
| use_locking (bool): If True, use locks for updating operation. Default: False. | |||||
| loss_scale (float): Value for the loss scale. It should be equal to or greater than 1.0. Default: 1.0. | |||||
| use_locking (bool): If true, use locks for updating operation. Default: False. | |||||
| loss_scale (float): Value for the loss scale. It must be equal to or greater than 1.0. Default: 1.0. | |||||
| weight_decay (float): Weight decay value to multiply weight, must be zero or a positive value. Default: 0.0. | weight_decay (float): Weight decay value to multiply weight, must be zero or a positive value. Default: 0.0. | ||||
| Inputs: | Inputs: | ||||
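A construction sketch that satisfies the constraints above (non-negative learning rate and l1/l2, non-positive lr_power), assuming an already constructed `net`:

>>> from mindspore import nn
>>> opt = nn.FTRL(net.trainable_params(), learning_rate=0.001, lr_power=-0.5,
...               l1=1e-4, l2=1e-4)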
| @@ -199,10 +199,10 @@ class Lamb(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys that can be parsed. | "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -210,16 +210,16 @@ class Lamb(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | ||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9. | beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9. | ||||
| Should be in range (0.0, 1.0). | Should be in range (0.0, 1.0). | ||||
| @@ -112,10 +112,10 @@ class LazyAdam(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr" and "weight_decay" are the keys that can be parsed. | "lr" and "weight_decay" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -123,16 +123,16 @@ class LazyAdam(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | ||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| Default: 1e-3. | Default: 1e-3. | ||||
| beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0). | beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0). | ||||
| @@ -68,10 +68,10 @@ class Momentum(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys that can be parsed. | "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -79,21 +79,21 @@ class Momentum(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | ||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| momentum (float): Hyperparameter of type float, means momentum for the moving average. | momentum (float): Hyperparameter of type float, means momentum for the moving average. | ||||
| It should be at least 0.0. | |||||
| weight_decay (int, float): Weight decay (L2 penalty). It should be equal to or greater than 0.0. Default: 0.0. | |||||
| loss_scale (int, float): A floating point value for the loss scale. It should be greater than 0.0. Default: 1.0. | |||||
| It must be at least 0.0. | |||||
| weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0.0. Default: 0.0. | |||||
| loss_scale (int, float): A floating point value for the loss scale. It must be greater than 0.0. Default: 1.0. | |||||
| use_nesterov (bool): Enable Nesterov momentum. Default: False. | use_nesterov (bool): Enable Nesterov momentum. Default: False. | ||||
| Inputs: | Inputs: | ||||
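The Iterable form of `learning_rate` described above maps list entries to steps. A sketch, assuming an already constructed `net`:

>>> from mindspore import nn
>>> # The i-th step takes the i-th value; the schedule here is illustrative
>>> lr = [0.1, 0.1, 0.1, 0.05, 0.05, 0.05]
>>> optim = nn.Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9)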
| @@ -58,13 +58,13 @@ class Optimizer(Cell): | |||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be | parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be | ||||
| updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`, | |||||
| updated, the element in `parameters` must be of class `Parameter`. When the `parameters` is a list of `dict`, | |||||
| the "params", "lr", "weight_decay" and "order_params" are the keys that can be parsed. | the "params", "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -72,13 +72,13 @@ class Optimizer(Cell): | |||||
| - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0. | |||||
| weight_decay (float): A floating point value for the weight decay. It must be equal to or greater than 0. | |||||
| If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0. | If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0. | ||||
| loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the | |||||
| loss_scale (float): A floating point value for the loss scale. It must be greater than 0. If the | |||||
| type of `loss_scale` input is int, it will be converted to float. Default: 1.0. | type of `loss_scale` input is int, it will be converted to float. Default: 1.0. | ||||
| Raises: | Raises: | ||||
| @@ -315,7 +315,7 @@ class Optimizer(Cell): | |||||
| raise ValueError("The Tensor type dynamic learning rate in group should be the same size.") | raise ValueError("The Tensor type dynamic learning rate in group should be the same size.") | ||||
| def _init_group_params(self, parameters, learning_rate, weight_decay): | def _init_group_params(self, parameters, learning_rate, weight_decay): | ||||
| """Init learning rate or weight decay in group params.""" | |||||
| """Initialize learning rate or weight decay in group params.""" | |||||
| self._parse_group_params(parameters, learning_rate) | self._parse_group_params(parameters, learning_rate) | ||||
| default_lr = self._build_single_lr(learning_rate, 'learning_rate') | default_lr = self._build_single_lr(learning_rate, 'learning_rate') | ||||
| @@ -71,10 +71,10 @@ class ProximalAdagrad(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys that can be parsed. | "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -82,9 +82,9 @@ class ProximalAdagrad(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| accum (float): The starting value for accumulators, must be zero or a positive value. Default: 0.1. | accum (float): The starting value for accumulators, must be zero or a positive value. Default: 0.1. | ||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| @@ -92,13 +92,13 @@ class ProximalAdagrad(Optimizer): | |||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| Default: 0.001. | Default: 0.001. | ||||
| l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0. | l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0. | ||||
| l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0. | l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0. | ||||
| use_locking (bool): If True, use locks for updating operation. Default: False. | |||||
| loss_scale (float): Value for the loss scale. It should be greater than 0.0. Default: 1.0. | |||||
| use_locking (bool): If true, use locks for updating operation. Default: False. | |||||
| loss_scale (float): Value for the loss scale. It must be greater than 0.0. Default: 1.0. | |||||
| weight_decay (float): Weight decay value to multiply weight, must be zero or a positive value. Default: 0.0. | weight_decay (float): Weight decay value to multiply weight, must be zero or a positive value. Default: 0.0. | ||||
| Inputs: | Inputs: | ||||
| @@ -91,10 +91,10 @@ class RMSProp(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys that can be parsed. | "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -102,16 +102,16 @@ class RMSProp(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | ||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| Default: 0.1. | Default: 0.1. | ||||
| decay (float): Decay rate. Should be equal to or greater than 0. Default: 0.9. | decay (float): Decay rate. Should be equal to or greater than 0. Default: 0.9. | ||||
| @@ -119,8 +119,9 @@ class RMSProp(Optimizer): | |||||
| greater than 0. Default: 0.0. | greater than 0. Default: 0.0. | ||||
| epsilon (float): Term added to the denominator to improve numerical stability. Should be greater than | epsilon (float): Term added to the denominator to improve numerical stability. Should be greater than | ||||
| 0. Default: 1e-10. | 0. Default: 1e-10. | ||||
| use_locking (bool): Enable a lock to protect the update of variable and accumulation tensors. Default: False. | |||||
| centered (bool): If True, gradients are normalized by the estimated variance of the gradient. Default: False. | |||||
| use_locking (bool): Whether to enable a lock to protect the variable and accumulation tensors from being | |||||
| updated. Default: False. | |||||
| centered (bool): If true, gradients are normalized by the estimated variance of the gradient. Default: False. | |||||
| loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. | loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. | ||||
| weight_decay (float): Weight decay (L2 penalty). Should be equal to or greater than 0. Default: 0.0. | weight_decay (float): Weight decay (L2 penalty). Should be equal to or greater than 0. Default: 0.0. | ||||
| @@ -63,10 +63,10 @@ class SGD(Optimizer): | |||||
| Args: | Args: | ||||
| params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, | ||||
| the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| the element in `params` must be of class `Parameter`. When the `params` is a list of `dict`, the "params", | |||||
| "lr", "weight_decay" and "order_params" are the keys that can be parsed. | "lr", "weight_decay" and "order_params" are the keys that can be parsed. | ||||
| - params: Required. The value should be a list of `Parameter`. | |||||
| - params: Required. The value must be a list of `Parameter`. | |||||
| - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used. | ||||
| If not, the `learning_rate` in the API will be used. | If not, the `learning_rate` in the API will be used. | ||||
| @@ -74,24 +74,24 @@ class SGD(Optimizer): | |||||
| - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay | ||||
| will be used. If not, the `weight_decay` in the API will be used. | will be used. If not, the `weight_decay` in the API will be used. | ||||
| - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and | |||||
| - order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and | |||||
| the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which | ||||
| in the value of 'order_params' should be in one of group parameters. | |||||
| in the value of 'order_params' must be in one of group parameters. | |||||
| learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate. | ||||
| When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then | ||||
| the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule, | ||||
| use dynamic learning rate, the i-th learning rate will be calculated during the process of training | use dynamic learning rate, the i-th learning rate will be calculated during the process of training | ||||
| according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero | ||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be | |||||
| dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be | |||||
| equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float. | ||||
| Default: 0.1. | Default: 0.1. | ||||
| momentum (float): A floating point value the momentum. should be at least 0.0. Default: 0.0. | |||||
| dampening (float): A floating point value of dampening for momentum. should be at least 0.0. Default: 0.0. | |||||
| weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. | |||||
| momentum (float): A floating point value for the momentum. It must be at least 0.0. Default: 0.0. | |||||
| dampening (float): A floating point value of dampening for momentum. It must be at least 0.0. Default: 0.0. | |||||
| weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0. | |||||
| nesterov (bool): Enables the Nesterov momentum. If nesterov is used, momentum must be positive, | nesterov (bool): Enables the Nesterov momentum. If nesterov is used, momentum must be positive, | ||||
| and dampening must be equal to 0.0. Default: False. | and dampening must be equal to 0.0. Default: False. | ||||
| loss_scale (float): A floating point value for the loss scale, which should be larger | |||||
| loss_scale (float): A floating point value for the loss scale, which must be larger | |||||
| than 0.0. Default: 1.0. | than 0.0. Default: 1.0. | ||||
| Inputs: | Inputs: | ||||
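The nesterov constraint above (positive momentum, zero dampening) looks like this in practice, again assuming an existing `net`:

>>> from mindspore import nn
>>> optim = nn.SGD(net.trainable_params(), learning_rate=0.1, momentum=0.9,
...                dampening=0.0, weight_decay=0.0, nesterov=True)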
| @@ -121,7 +121,7 @@ class Bijector(Cell): | |||||
| This __call__ may go in one of two directions: | This __call__ may go in one of two directions: | ||||
| If args[0] is a distribution instance, the call will generate a new distribution derived from | If args[0] is a distribution instance, the call will generate a new distribution derived from | ||||
| the input distribution. | the input distribution. | ||||
| Otherwise, input[0] should be the name of a Bijector function, e.g. "forward", then this call will | |||||
| Otherwise, input[0] must be the name of a Bijector function, e.g. "forward", then this call will | |||||
| go into the construct and invoke the corresponding Bijector function. | go into the construct and invoke the corresponding Bijector function. | ||||
| Args: | Args: | ||||
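The two call directions can be sketched as follows; the Exp bijector and the import paths are assumptions based on the mindspore.nn.probability package layout:

>>> import mindspore.nn.probability.bijector as msb
>>> import mindspore.nn.probability.distribution as msd
>>> from mindspore import Tensor, dtype as mstype
>>> b = msb.Exp()
>>> # Direction 1: args[0] is a distribution, so a new distribution is derived
>>> log_normal = b(msd.Normal(0.0, 1.0, dtype=mstype.float32))
>>> # Direction 2: args[0] names a Bijector function such as "forward"
>>> x = Tensor([0.5, 1.0], dtype=mstype.float32)
>>> y = b('forward', x)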
| @@ -211,7 +211,7 @@ class ConvReparam(_ConvVariational): | |||||
| - pad: Implicit paddings on both sides of the input. The amount | - pad: Implicit paddings on both sides of the input. The amount | ||||
| given by `padding` will be padded to the borders of the input Tensor. | given by `padding` will be padded to the borders of the input Tensor. | ||||
| `padding` should be greater than or equal to 0. | |||||
| `padding` must be greater than or equal to 0. | |||||
| padding (Union[int, tuple[int]]): Implicit paddings on both sides of | padding (Union[int, tuple[int]]): Implicit paddings on both sides of | ||||
| the input. Default: 0. | the input. Default: 0. | ||||
| @@ -219,27 +219,27 @@ class ConvReparam(_ConvVariational): | |||||
| of 2 integers. This parameter specifies the dilation rate of the | of 2 integers. This parameter specifies the dilation rate of the | ||||
| dilated convolution. If set to be :math:`k > 1`, | dilated convolution. If set to be :math:`k > 1`, | ||||
| there will be :math:`k - 1` pixels skipped for each sampling | there will be :math:`k - 1` pixels skipped for each sampling | ||||
| location. Its value should be greater or equal to 1 and bounded | |||||
| location. Its value must be greater than or equal to 1 and bounded | |||||
| by the height and width of the input. Default: 1. | by the height and width of the input. Default: 1. | ||||
| group (int): Split filter into groups, `in_channels` and | |||||
| `out_channels` should be divisible by the number of groups. | |||||
| group (int): Splits the filter into groups; `in_channels` and | |||||
| `out_channels` must be divisible by the number of groups. | |||||
| Default: 1. | Default: 1. | ||||
| has_bias (bool): Specifies whether the layer uses a bias vector. | has_bias (bool): Specifies whether the layer uses a bias vector. | ||||
| Default: False. | Default: False. | ||||
| weight_prior_fn: The prior distribution for weight. | weight_prior_fn: The prior distribution for weight. | ||||
| It should return a mindspore distribution instance. | |||||
| It must return a mindspore distribution instance. | |||||
| Default: NormalPrior (which creates an instance of a standard | Default: NormalPrior (which creates an instance of a standard | ||||
| normal distribution). The current version only supports normal distribution. | normal distribution). The current version only supports normal distribution. | ||||
| weight_posterior_fn: The posterior distribution for sampling weight. | weight_posterior_fn: The posterior distribution for sampling weight. | ||||
| It should be a function handle which returns a mindspore | |||||
| It must be a function handle which returns a mindspore | |||||
| distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | ||||
| The current version only supports normal distribution. | The current version only supports normal distribution. | ||||
| bias_prior_fn: The prior distribution for bias vector. It should return | |||||
| bias_prior_fn: The prior distribution for bias vector. It must return | |||||
| a mindspore distribution. Default: NormalPrior (which creates an | a mindspore distribution. Default: NormalPrior (which creates an | ||||
| instance of standard normal distribution). The current version | instance of standard normal distribution). The current version | ||||
| only supports normal distribution. | only supports normal distribution. | ||||
| bias_posterior_fn: The posterior distribution for sampling bias vector. | bias_posterior_fn: The posterior distribution for sampling bias vector. | ||||
| It should be a function handle which returns a mindspore | |||||
| It must be a function handle which returns a mindspore | |||||
| distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | ||||
| The current version only supports normal distribution. | The current version only supports normal distribution. | ||||
| @@ -166,19 +166,19 @@ class DenseReparam(_DenseVariational): | |||||
| can be a string (e.g. 'relu') or a Cell (e.g. nn.ReLU()). Note that if the type of activation is Cell, it must | can be a string (e.g. 'relu') or a Cell (e.g. nn.ReLU()). Note that if the type of activation is Cell, it must | ||||
| be instantiated beforehand. Default: None. | be instantiated beforehand. Default: None. | ||||
| weight_prior_fn: The prior distribution for weight. | weight_prior_fn: The prior distribution for weight. | ||||
| It should return a mindspore distribution instance. | |||||
| It must return a mindspore distribution instance. | |||||
| Default: NormalPrior (which creates an instance of a standard | Default: NormalPrior (which creates an instance of a standard | ||||
| normal distribution). The current version only supports normal distribution. | normal distribution). The current version only supports normal distribution. | ||||
| weight_posterior_fn: The posterior distribution for sampling weight. | weight_posterior_fn: The posterior distribution for sampling weight. | ||||
| It should be a function handle which returns a mindspore | |||||
| It must be a function handle which returns a mindspore | |||||
| distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | ||||
| The current version only supports normal distribution. | The current version only supports normal distribution. | ||||
| bias_prior_fn: The prior distribution for bias vector. It should return | |||||
| bias_prior_fn: The prior distribution for bias vector. It must return | |||||
| a mindspore distribution. Default: NormalPrior (which creates an | a mindspore distribution. Default: NormalPrior (which creates an | ||||
| instance of standard normal distribution). The current version | instance of standard normal distribution). The current version | ||||
| only supports normal distribution. | only supports normal distribution. | ||||
| bias_posterior_fn: The posterior distribution for sampling bias vector. | bias_posterior_fn: The posterior distribution for sampling bias vector. | ||||
| It should be a function handle which returns a mindspore | |||||
| It must be a function handle which returns a mindspore | |||||
| distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | distribution instance. Default: lambda name, shape: NormalPosterior(name=name, shape=shape). | ||||
| The current version only supports normal distribution. | The current version only supports normal distribution. | ||||
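A construction sketch showing how the prior and posterior callables plug in; the import path and argument names follow the description above and should be treated as assumptions:

>>> from mindspore.nn.probability.bnn_layers import DenseReparam, NormalPrior, NormalPosterior
>>> dense = DenseReparam(
...     in_channels=128, out_channels=10, activation='relu',
...     weight_prior_fn=NormalPrior,
...     weight_posterior_fn=lambda name, shape: NormalPosterior(name=name, shape=shape))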
| @@ -32,7 +32,7 @@ class Bernoulli(Distribution): | |||||
| name (str): The name of the distribution. Default: 'Bernoulli'. | name (str): The name of the distribution. Default: 'Bernoulli'. | ||||
| Note: | Note: | ||||
| `probs` should be a proper probability (0 < p < 1). | |||||
| `probs` must be a proper probability (0 < p < 1). | |||||
| `dist_spec_args` is `probs`. | `dist_spec_args` is `probs`. | ||||
| Examples: | Examples: | ||||
| @@ -50,7 +50,7 @@ class Categorical(Distribution): | |||||
| >>> | >>> | ||||
| >>> # Similar calls can be made to logits | >>> # Similar calls can be made to logits | ||||
| >>> ans = self.ca.probs | >>> ans = self.ca.probs | ||||
| >>> # value should be Tensor(mstype.float32, bool, mstype.int32) | |||||
| >>> # value must be Tensor(mstype.float32, bool, mstype.int32) | |||||
| >>> ans = self.ca.log_prob(value) | >>> ans = self.ca.log_prob(value) | ||||
| >>> | >>> | ||||
| >>> # Usage of enumerate_support | >>> # Usage of enumerate_support | ||||
| @@ -34,9 +34,9 @@ class Distribution(Cell): | |||||
| param (dict): The parameters used to initialize the distribution. | param (dict): The parameters used to initialize the distribution. | ||||
| Note: | Note: | ||||
| Derived class should override operations such as `_mean`, `_prob`, | |||||
| Derived classes must override operations such as `_mean`, `_prob`, | |||||
| and `_log_prob`. Required arguments, such as `value` for `_prob`, | and `_log_prob`. Required arguments, such as `value` for `_prob`, | ||||
| should be passed in through `args` or `kwargs`. `dist_spec_args` which specifies | |||||
| must be passed in through `args` or `kwargs`. `dist_spec_args`, which specify | |||||
| a new distribution, are optional. | a new distribution, are optional. | ||||
| `dist_spec_args` is unique for each type of distribution. For example, `mean` and `sd` | `dist_spec_args` is unique for each type of distribution. For example, `mean` and `sd` | ||||
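How required arguments and `dist_spec_args` flow through a call can be sketched with Normal (values are illustrative):

>>> import mindspore.nn.probability.distribution as msd
>>> from mindspore import Tensor, dtype as mstype
>>> n = msd.Normal(3.0, 4.0, dtype=mstype.float32)
>>> value = Tensor([2.0, 5.0], dtype=mstype.float32)
>>> # The required argument `value` is passed in through args
>>> p = n.prob(value)
>>> # Optional `dist_spec_args` (`mean`, `sd`) specify a new distribution
>>> p2 = n.prob(value, Tensor(0.0, mstype.float32), Tensor(1.0, mstype.float32))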
| @@ -33,9 +33,9 @@ class Exponential(Distribution): | |||||
| name (str): The name of the distribution. Default: 'Exponential'. | name (str): The name of the distribution. Default: 'Exponential'. | ||||
| Note: | Note: | ||||
| `rate` should be strictly greater than 0. | |||||
| `rate` must be strictly greater than 0. | |||||
| `dist_spec_args` is `rate`. | `dist_spec_args` is `rate`. | ||||
| `dtype` should be a float type because Exponential distributions are continuous. | |||||
| `dtype` must be a float type because Exponential distributions are continuous. | |||||
| Examples: | Examples: | ||||
| >>> # To initialize an Exponential distribution of the rate 0.5. | >>> # To initialize an Exponential distribution of the rate 0.5. | ||||
| @@ -216,7 +216,7 @@ class Exponential(Distribution): | |||||
| rate (Tensor): The rate of the distribution. Default: self.rate. | rate (Tensor): The rate of the distribution. Default: self.rate. | ||||
| Note: | Note: | ||||
| `value` should be greater or equal to zero. | |||||
| `value` must be greater than or equal to zero. | |||||
| .. math:: | .. math:: | ||||
| log_pdf(x) = \log(rate) - rate * x if x >= 0 else 0 | log_pdf(x) = \log(rate) - rate * x if x >= 0 else 0 | ||||
| @@ -239,7 +239,7 @@ class Exponential(Distribution): | |||||
| rate (Tensor): The rate of the distribution. Default: self.rate. | rate (Tensor): The rate of the distribution. Default: self.rate. | ||||
| Note: | Note: | ||||
| `value` should be greater or equal to zero. | |||||
| `value` must be greater than or equal to zero. | |||||
| .. math:: | .. math:: | ||||
| cdf(x) = 1.0 - \exp(-1 * \lambda * x) if x >= 0 else 0 | cdf(x) = 1.0 - \exp(-1 * \lambda * x) if x >= 0 else 0 | ||||
| @@ -261,7 +261,7 @@ class Exponential(Distribution): | |||||
| rate (Tensor): The rate of the distribution. Default: self.rate. | rate (Tensor): The rate of the distribution. Default: self.rate. | ||||
| Note: | Note: | ||||
| `value` should be greater or equal to zero. | |||||
| `value` must be greater than or equal to zero. | |||||
| .. math:: | .. math:: | ||||
| log_survival_function(x) = -1 * \lambda * x if x >= 0 else 0 | log_survival_function(x) = -1 * \lambda * x if x >= 0 else 0 | ||||
| @@ -36,7 +36,7 @@ class Geometric(Distribution): | |||||
| name (str): The name of the distribution. Default: 'Geometric'. | name (str): The name of the distribution. Default: 'Geometric'. | ||||
| Note: | Note: | ||||
| `probs` should be a proper probability (0 < p < 1). | |||||
| `probs` must be a proper probability (0 < p < 1). | |||||
| `dist_spec_args` is `probs`. | `dist_spec_args` is `probs`. | ||||
| Examples: | Examples: | ||||
| @@ -35,9 +35,9 @@ class Normal(Distribution): | |||||
| name (str): The name of the distribution. Default: 'Normal'. | name (str): The name of the distribution. Default: 'Normal'. | ||||
| Note: | Note: | ||||
| `sd` should be greater than zero. | |||||
| `sd` must be greater than zero. | |||||
| `dist_spec_args` are `mean` and `sd`. | `dist_spec_args` are `mean` and `sd`. | ||||
| `dtype` should be a float type because Normal distributions are continuous. | |||||
| `dtype` must be a float type because Normal distributions are continuous. | |||||
| Examples: | Examples: | ||||
| >>> # To initialize a Normal distribution of the mean 3.0 and the standard deviation 4.0. | >>> # To initialize a Normal distribution of the mean 3.0 and the standard deviation 4.0. | ||||
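A hedged sketch of construction and of the optional per-call `dist_spec_args` described in the Note (the `msd` alias and the `prob(value, mean, sd)` call pattern are assumptions based on the package's usual conventions):

>>> import mindspore.nn.probability.distribution as msd
>>> from mindspore import Tensor, dtype as mstype
>>> n = msd.Normal(3.0, 4.0, dtype=mstype.float32)
>>> value = Tensor([2.0, 5.0], mstype.float32)
>>> p1 = n.prob(value)                            # uses mean=3.0, sd=4.0 from construction
>>> p2 = n.prob(value, Tensor(0.0), Tensor(1.0))  # per-call mean and sd via dist_spec_args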
| @@ -34,9 +34,9 @@ class Uniform(Distribution): | |||||
| name (str): The name of the distribution. Default: 'Uniform'. | name (str): The name of the distribution. Default: 'Uniform'. | ||||
| Note: | Note: | ||||
| `low` should be stricly less than `high`. | |||||
| `low` must be strictly less than `high`. | |||||
| `dist_spec_args` are `high` and `low`. | `dist_spec_args` are `high` and `low`. | ||||
| `dtype` should be float type because Uniform distributions are continuous. | |||||
| `dtype` must be float type because Uniform distributions are continuous. | |||||
| Examples: | Examples: | ||||
| >>> # To initialize a Uniform distribution of the lower bound 0.0 and the higher bound 1.0. | >>> # To initialize a Uniform distribution of the lower bound 0.0 and the higher bound 1.0. | ||||
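The analogous hedged construction for this class (import alias assumed; note the `low` < `high` requirement above):

>>> import mindspore.nn.probability.distribution as msd
>>> from mindspore import dtype as mstype
>>> u = msd.Uniform(0.0, 1.0, dtype=mstype.float32)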
| @@ -31,8 +31,8 @@ class ConditionalVAE(Cell): | |||||
| Note: | Note: | ||||
| When encoder and decoder are defined, the shape of the encoder's output tensor and decoder's input tensor | When encoder and decoder are defined, the shape of the encoder's output tensor and decoder's input tensor | ||||
| should be :math:`(N, hidden\_size)`. | |||||
| The latent_size should be less than or equal to the hidden_size. | |||||
| must be :math:`(N, hidden\_size)`. | |||||
| The latent_size must be less than or equal to the hidden_size. | |||||
| Args: | Args: | ||||
| encoder(Cell): The Deep Neural Network (DNN) model defined as encoder. | encoder(Cell): The Deep Neural Network (DNN) model defined as encoder. | ||||
| @@ -103,7 +103,7 @@ class ConditionalVAE(Cell): | |||||
| Args: | Args: | ||||
| sample_y (Tensor): Define the label of samples. Tensor of shape (generate_nums, ) and type mindspore.int32. | sample_y (Tensor): Define the label of samples. Tensor of shape (generate_nums, ) and type mindspore.int32. | ||||
| generate_nums (int): The number of samples to generate. | generate_nums (int): The number of samples to generate. | ||||
| shape(tuple): The shape of sample, which should be the format of (generate_nums, C, H, W) or (-1, C, H, W). | |||||
| shape(tuple): The shape of sample, which must be in the format (generate_nums, C, H, W) or (-1, C, H, W). | |||||
| Returns: | Returns: | ||||
| Tensor, the generated samples. | Tensor, the generated samples. | ||||
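A usage sketch of the sampling described above. The receiver `cvae` and the method name `generate_sample` are assumptions; only the three documented arguments come from this docstring:

>>> import numpy as np
>>> from mindspore import Tensor
>>> sample_y = Tensor(np.random.randint(0, 10, (64,)).astype(np.int32))  # labels, shape (generate_nums,)
>>> samples = cvae.generate_sample(sample_y, 64, (64, 1, 32, 32))         # (generate_nums, C, H, W)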
| @@ -29,8 +29,8 @@ class VAE(Cell): | |||||
| Note: | Note: | ||||
| When the encoder and decoder are defined, the shape of the encoder's output tensor and decoder's input tensor | When the encoder and decoder are defined, the shape of the encoder's output tensor and decoder's input tensor | ||||
| should be :math:`(N, hidden\_size)`. | |||||
| The latent_size should be less than or equal to the hidden_size. | |||||
| must be :math:`(N, hidden\_size)`. | |||||
| The latent_size must be less than or equal to the hidden_size. | |||||
| Args: | Args: | ||||
| encoder(Cell): The Deep Neural Network (DNN) model defined as encoder. | encoder(Cell): The Deep Neural Network (DNN) model defined as encoder. | ||||
| @@ -89,7 +89,7 @@ class VAE(Cell): | |||||
| Args: | Args: | ||||
| generate_nums (int): The number of samples to generate. | generate_nums (int): The number of samples to generate. | ||||
| shape(tuple): The shape of sample, it should be (generate_nums, C, H, W) or (-1, C, H, W). | |||||
| shape(tuple): The shape of sample; it must be (generate_nums, C, H, W) or (-1, C, H, W). | |||||
| Returns: | Returns: | ||||
| Tensor, the generated samples. | Tensor, the generated samples. | ||||
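The unconditional counterpart, under the same assumptions about the receiver and method name:

>>> samples = vae.generate_sample(64, (64, 1, 32, 32))   # (generate_nums, C, H, W)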
| @@ -48,8 +48,8 @@ class UncertaintyEvaluation: | |||||
| epochs (int): Total number of iterations on the data. Default: 1. | epochs (int): Total number of iterations on the data. Default: 1. | ||||
| epi_uncer_model_path (str): The save or read path of the epistemic uncertainty model. Default: None. | epi_uncer_model_path (str): The save or read path of the epistemic uncertainty model. Default: None. | ||||
| ale_uncer_model_path (str): The save or read path of the aleatoric uncertainty model. Default: None. | ale_uncer_model_path (str): The save or read path of the aleatoric uncertainty model. Default: None. | ||||
| save_model (bool): Whether to save the uncertainty model or not, if True, the epi_uncer_model_path | |||||
| and ale_uncer_model_path should not be None. If False, the model to evaluate will be loaded from | |||||
| save_model (bool): Whether to save the uncertainty model or not. If true, the epi_uncer_model_path | |||||
| and ale_uncer_model_path must not be None. If false, the model to evaluate will be loaded from | |||||
| the path of the uncertainty model; if the path is not given, it will not save or load the | the path of the uncertainty model; if the path is not given, it will not save or load the | ||||
| uncertainty model. Default: False. | uncertainty model. Default: False. | ||||
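A configuration sketch of the constraint just described. The `model`, `train_dataset`, `task_type`, and `num_classes` parameter names are assumptions; the remaining names appear in this docstring:

>>> evaluation = UncertaintyEvaluation(model=network,
...                                    train_dataset=ds_train,
...                                    task_type='classification',
...                                    num_classes=10,
...                                    epochs=1,
...                                    epi_uncer_model_path='./epi_uncer.ckpt',
...                                    ale_uncer_model_path='./ale_uncer.ckpt',
...                                    save_model=True)  # both paths must be set when save_model is True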
| @@ -192,7 +192,7 @@ class UncertaintyEvaluation: | |||||
| Evaluate the epistemic uncertainty of inference results, which is also called model uncertainty. | Evaluate the epistemic uncertainty of inference results, which is also called model uncertainty. | ||||
| Args: | Args: | ||||
| eval_data (Tensor): The data samples to be evaluated, the shape should be (N,C,H,W). | |||||
| eval_data (Tensor): The data samples to be evaluated, the shape must be (N,C,H,W). | |||||
| Returns: | Returns: | ||||
| numpy.dtype, the epistemic uncertainty of inference results of data samples. | numpy.dtype, the epistemic uncertainty of inference results of data samples. | ||||
| @@ -205,7 +205,7 @@ class UncertaintyEvaluation: | |||||
| Evaluate the aleatoric uncertainty of inference results, which is also called data uncertainty. | Evaluate the aleatoric uncertainty of inference results, which is also called data uncertainty. | ||||
| Args: | Args: | ||||
| eval_data (Tensor): The data samples to be evaluated, the shape should be (N,C,H,W). | |||||
| eval_data (Tensor): The data samples to be evaluated, the shape must be (N,C,H,W). | |||||
| Returns: | Returns: | ||||
| numpy.dtype, the aleatoric uncertainty of inference results of data samples. | numpy.dtype, the aleatoric uncertainty of inference results of data samples. | ||||
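The two evaluation calls side by side, as a hedged sketch (the method names are assumed; the (N, C, H, W) shape requirement comes from the docstrings above):

>>> import numpy as np
>>> from mindspore import Tensor
>>> eval_data = Tensor(np.random.rand(32, 1, 32, 32).astype(np.float32))  # shape (N, C, H, W)
>>> epistemic = evaluation.eval_epistemic_uncertainty(eval_data)          # model uncertainty
>>> aleatoric = evaluation.eval_aleatoric_uncertainty(eval_data)          # data uncertainty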
| @@ -258,7 +258,7 @@ class EpistemicUncertaintyModel(Cell): | |||||
| class AleatoricUncertaintyModel(Cell): | class AleatoricUncertaintyModel(Cell): | ||||
| """ | """ | ||||
| The aleatoric uncertainty (also called data uncertainty) is caused by input data. To obtain this | The aleatoric uncertainty (also called data uncertainty) is caused by input data. To obtain this | ||||
| uncertainty, the loss function should be modified in order to add variance into loss. | |||||
| uncertainty, the loss function must be modified in order to add variance into loss. | |||||
| See more details in `What Uncertainties Do We Need in Bayesian Deep Learning for Computer Vision? | See more details in `What Uncertainties Do We Need in Bayesian Deep Learning for Computer Vision? | ||||
| <https://arxiv.org/abs/1703.04977>`_. | <https://arxiv.org/abs/1703.04977>`_. | ||||
| @@ -99,9 +99,9 @@ class TransformToBNN: | |||||
| {"in_channels": dp.in_channels, "out_channels": dp.out_channels, "pad_mode": dp.pad_mode, | {"in_channels": dp.in_channels, "out_channels": dp.out_channels, "pad_mode": dp.pad_mode, | ||||
| "kernel_size": dp.kernel_size, "stride": dp.stride, "has_bias": dp.has_bias}. | "kernel_size": dp.kernel_size, "stride": dp.stride, "has_bias": dp.has_bias}. | ||||
| add_dense_args (dict): The new arguments added to BNN full connection layer. Note that the arguments in | add_dense_args (dict): The new arguments added to BNN full connection layer. Note that the arguments in | ||||
| `add_dense_args` should not duplicate arguments in `get_dense_args`. Default: None. | |||||
| `add_dense_args` must not duplicate arguments in `get_dense_args`. Default: None. | |||||
| add_conv_args (dict): The new arguments added to BNN convolutional layer. Note that the arguments in | add_conv_args (dict): The new arguments added to BNN convolutional layer. Note that the arguments in | ||||
| `add_conv_args` should not duplicate arguments in `get_conv_args`. Default: None. | |||||
| `add_conv_args` must not duplicate arguments in `get_conv_args`. Default: None. | |||||
| Returns: | Returns: | ||||
| Cell, a trainable BNN model wrapped by TrainOneStepCell. | Cell, a trainable BNN model wrapped by TrainOneStepCell. | ||||
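A hedged end-to-end sketch. The constructor arguments are assumptions; the no-duplication rule for `add_dense_args`/`add_conv_args` comes from this docstring:

>>> from mindspore.nn.probability import transforms
>>> transformer = transforms.TransformToBNN(train_network, dnn_factor=1, bnn_factor=1)
>>> # keys in the add_* dicts must not duplicate those already produced by the get_* functions
>>> train_bnn_network = transformer.transform_to_bnn_model(add_dense_args=None, add_conv_args=None)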
| @@ -143,7 +143,7 @@ class TransformToBNN: | |||||
| bnn_layer_type (Cell): The type of BNN layer to be transformed to. The optional values are | bnn_layer_type (Cell): The type of BNN layer to be transformed to. The optional values are | ||||
| DenseReparam and ConvReparam. | DenseReparam and ConvReparam. | ||||
| get_args: The arguments obtained from the DNN layer. Default: None. | get_args: The arguments obtained from the DNN layer. Default: None. | ||||
| add_args (dict): The new arguments added to BNN layer. Note that the arguments in `add_args` should not | |||||
| add_args (dict): The new arguments added to BNN layer. Note that the arguments in `add_args` must not | |||||
| duplicate arguments in `get_args`. Default: None. | duplicate arguments in `get_args`. Default: None. | ||||
| Returns: | Returns: | ||||
| @@ -91,7 +91,7 @@ class WithGradCell(Cell): | |||||
| network (Cell): The target network to wrap. The network only supports single output. | network (Cell): The target network to wrap. The network only supports single output. | ||||
| loss_fn (Cell): Primitive loss function used to compute gradients. Default: None. | loss_fn (Cell): Primitive loss function used to compute gradients. Default: None. | ||||
| sens (Union[None, Tensor, Scalar, Tuple ...]): The sensitivity for backpropagation; the type and shape | sens (Union[None, Tensor, Scalar, Tuple ...]): The sensitivity for backpropagation; the type and shape | ||||
| should be same as the `network` output. If None, we will fill one to a same type shape of | |||||
| must be the same as the `network` output. If None, a tensor of ones with the same type and shape as the | |||||
| output value. Default: None. | output value. Default: None. | ||||
| Inputs: | Inputs: | ||||
| @@ -331,7 +331,7 @@ class DistributedGradReducer(Cell): | |||||
| def construct(self, grads): | def construct(self, grads): | ||||
| """ | """ | ||||
| Under certain circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the | Under certain circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the | ||||
| result of AllReduce is unreliable. To solve the problem, grads should be cast to float32 before AllReduce, | |||||
| result of AllReduce is unreliable. To solve the problem, grads must be cast to float32 before AllReduce, | |||||
| and cast back after the operation. | and cast back after the operation. | ||||
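A minimal sketch of the cast-reduce-cast pattern described above; `all_reduce` is a hypothetical reduction callable, and `F.dtype`/`F.cast` are the usual functional wrappers:

>>> from mindspore import dtype as mstype
>>> from mindspore.ops import functional as F
>>> def _reduce_in_fp32(all_reduce, grad):
...     origin_dtype = F.dtype(grad)
...     grad = all_reduce(F.cast(grad, mstype.float32))  # AllReduce in float32
...     return F.cast(grad, origin_dtype)                # cast back afterwards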
| Args: | Args: | ||||
| @@ -64,7 +64,7 @@ class DynamicLossScaleUpdateCell(Cell): | |||||
| executed on host). | executed on host). | ||||
| Args: | Args: | ||||
| loss_scale_value (float): Init loss scale. | |||||
| loss_scale_value (float): Initializes the loss scale. | |||||
| scale_factor (int): Coefficient of increase and decrease. | scale_factor (int): Coefficient of increase and decrease. | ||||
| scale_window (int): Maximum continuous training steps that do not have overflow. | scale_window (int): Maximum continuous training steps that do not have overflow. | ||||
| @@ -139,7 +139,7 @@ class FixedLossScaleUpdateCell(Cell): | |||||
| For usage, refer to `DynamicLossScaleUpdateCell`. | For usage, refer to `DynamicLossScaleUpdateCell`. | ||||
| Args: | Args: | ||||
| loss_scale_value (float): Init loss scale. | |||||
| loss_scale_value (float): Initializes the loss scale. | |||||
| Examples: | Examples: | ||||
| >>> net_with_loss = Net() | >>> net_with_loss = Net() | ||||
| @@ -173,8 +173,8 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): | |||||
| Cell as args. The loss scale value can be updated in both host side or device side. The | Cell as args. The loss scale value can be updated in both host side or device side. The | ||||
| TrainOneStepWithLossScaleCell will be compiled to be graph which takes `*inputs` as input data. | TrainOneStepWithLossScaleCell will be compiled to be graph which takes `*inputs` as input data. | ||||
| The Tensor type of `scale_sense` acts as the loss scaling value. If you want to update it on the host side, | The Tensor type of `scale_sense` acts as the loss scaling value. If you want to update it on the host side, | ||||
| the value should be provided. If the Tensor type of `scale_sense` is not given, the loss scale update logic | |||||
| should be provied by Cell type of `scale_sense`. | |||||
| the value must be provided. If the Tensor type of `scale_sense` is not given, the loss scale update logic | |||||
| must be provided by the Cell type of `scale_sense`. | |||||
| Args: | Args: | ||||
| network (Cell): The training network. The network only supports single output. | network (Cell): The training network. The network only supports single output. | ||||
| @@ -27,7 +27,7 @@ class AbsGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init AbsGrad""" | |||||
| """Initialize AbsGrad""" | |||||
| def infer_shape(self, y, dy): | def infer_shape(self, y, dy): | ||||
| return y | return y | ||||
| @@ -46,7 +46,7 @@ class ACosGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init ACosGrad""" | |||||
| """Initialize ACosGrad""" | |||||
| def infer_shape(self, x, dout): | def infer_shape(self, x, dout): | ||||
| validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | ||||
| @@ -63,7 +63,7 @@ class AcoshGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init AcoshGrad""" | |||||
| """Initialize AcoshGrad""" | |||||
| def infer_shape(self, x, dout): | def infer_shape(self, x, dout): | ||||
| validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | ||||
| @@ -85,7 +85,7 @@ class AsinGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """Init AsinGrad""" | |||||
| """Initialize AsinGrad""" | |||||
| def infer_shape(self, x, dout): | def infer_shape(self, x, dout): | ||||
| validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | ||||
| @@ -102,7 +102,7 @@ class AsinhGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init AsinhGrad""" | |||||
| """Initialize AsinhGrad""" | |||||
| def infer_shape(self, x, dout): | def infer_shape(self, x, dout): | ||||
| validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | ||||
| @@ -119,7 +119,7 @@ class ReciprocalGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init ReciprocalGrad""" | |||||
| """Initialize ReciprocalGrad""" | |||||
| def infer_shape(self, x_shape, dout_shape): | def infer_shape(self, x_shape, dout_shape): | ||||
| validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | ||||
| @@ -136,7 +136,7 @@ class RsqrtGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init RsqrtGrad""" | |||||
| """Initialize RsqrtGrad""" | |||||
| def infer_shape(self, x_shape, dout_shape): | def infer_shape(self, x_shape, dout_shape): | ||||
| validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | ||||
| @@ -153,7 +153,7 @@ class SoftmaxGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init SoftmaxGrad""" | |||||
| """Initialize SoftmaxGrad""" | |||||
| def infer_shape(self, x_shape, dout_shape): | def infer_shape(self, x_shape, dout_shape): | ||||
| validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | ||||
| @@ -170,7 +170,7 @@ class SqrtGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init SqrtGrad""" | |||||
| """Initialize SqrtGrad""" | |||||
| def infer_shape(self, x_shape, dout_shape): | def infer_shape(self, x_shape, dout_shape): | ||||
| validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | validator.check("x shape", x_shape, "dout shape", dout_shape, Rel.EQ, self.name) | ||||
| @@ -254,7 +254,7 @@ class ConcatOffset(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, N=2, axis=0): | def __init__(self, N=2, axis=0): | ||||
| """init ConcatOffset""" | |||||
| """Initialize ConcatOffset""" | |||||
| def __infer__(self, input_x): | def __infer__(self, input_x): | ||||
| axis = self.axis | axis = self.axis | ||||
| @@ -307,7 +307,7 @@ class Conv2DBackpropFilter(PrimitiveWithInfer): | |||||
| stride=(1, 1), | stride=(1, 1), | ||||
| dilation=(1, 1, 1, 1), | dilation=(1, 1, 1, 1), | ||||
| group=1): | group=1): | ||||
| """init Convolution""" | |||||
| """Initialize Convolution""" | |||||
| self.init_prim_io_names(inputs=['out_backprop', 'input', 'filter_sizes'], outputs=['output']) | self.init_prim_io_names(inputs=['out_backprop', 'input', 'filter_sizes'], outputs=['output']) | ||||
| self.out_channel = out_channel | self.out_channel = out_channel | ||||
| self.kernel_size = kernel_size | self.kernel_size = kernel_size | ||||
| @@ -373,7 +373,7 @@ class DepthwiseConv2dNativeBackpropFilter(PrimitiveWithInfer): | |||||
| stride=1, | stride=1, | ||||
| dilation=1, | dilation=1, | ||||
| group=1): | group=1): | ||||
| """init Convolution""" | |||||
| """Initialize Convolution""" | |||||
| self.init_prim_io_names(inputs=['input', 'filter_size', 'dout'], outputs=['output']) | self.init_prim_io_names(inputs=['input', 'filter_size', 'dout'], outputs=['output']) | ||||
| self.channel_multiplier = channel_multiplier | self.channel_multiplier = channel_multiplier | ||||
| self.kernel_size = kernel_size | self.kernel_size = kernel_size | ||||
| @@ -434,7 +434,7 @@ class DepthwiseConv2dNativeBackpropInput(PrimitiveWithInfer): | |||||
| stride=1, | stride=1, | ||||
| dilation=1, | dilation=1, | ||||
| group=1): | group=1): | ||||
| """init Convolution""" | |||||
| """Initialize Convolution""" | |||||
| self.init_prim_io_names(inputs=['input_size', 'filter', 'dout'], outputs=['output']) | self.init_prim_io_names(inputs=['input_size', 'filter', 'dout'], outputs=['output']) | ||||
| self.channel_multiplier = channel_multiplier | self.channel_multiplier = channel_multiplier | ||||
| self.kernel_size = kernel_size | self.kernel_size = kernel_size | ||||
| @@ -588,7 +588,7 @@ class GeluGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init GeluGrad""" | |||||
| """Initialize GeluGrad""" | |||||
| def infer_shape(self, y_backprop_shape, x_shape, y_shape): | def infer_shape(self, y_backprop_shape, x_shape, y_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -726,12 +726,12 @@ class MaxPoolGradGrad(_PoolGrad): | |||||
| will be returned without padding. Extra pixels will be discarded. | will be returned without padding. Extra pixels will be discarded. | ||||
| Inputs: | Inputs: | ||||
| - **origin_input** (Tensor) - Tensor with data format "NCHW", data type should be float16. | |||||
| - **origin_input** (Tensor) - Tensor with data format "NCHW", data type must be float16. | |||||
| - **origin_output** (Tensor) - Data type same as `origin_input`. | - **origin_output** (Tensor) - Data type same as `origin_input`. | ||||
| - **grad** (Tensor) - Data type same as `origin_input`. | - **grad** (Tensor) - Data type same as `origin_input`. | ||||
| Outputs: | Outputs: | ||||
| Tensor, With data type same as `origin_input`. | |||||
| Tensor, with data type same as `origin_input`. | |||||
| """ | """ | ||||
| @@ -753,7 +753,7 @@ class MaximumGrad(Primitive): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, grad_x=True, grad_y=True): | def __init__(self, grad_x=True, grad_y=True): | ||||
| """Init MaximumGrad""" | |||||
| """Initialize MaximumGrad""" | |||||
| def __call__(self, x, y, dout): | def __call__(self, x, y, dout): | ||||
| raise NotImplementedError | raise NotImplementedError | ||||
| @@ -799,12 +799,12 @@ class MaxPoolGradGradWithArgmax(_PoolGrad): | |||||
| will be returned without padding. Extra pixels will be discarded. | will be returned without padding. Extra pixels will be discarded. | ||||
| Inputs: | Inputs: | ||||
| - **x** (Tensor) - Tensor with data format "NCHW", data type should be float16. | |||||
| - **x** (Tensor) - Tensor with data format "NCHW", data type must be float16. | |||||
| - **grad** (Tensor) - Data type same as `x`. | - **grad** (Tensor) - Data type same as `x`. | ||||
| - **argmax** (Tensor) - Data type should be uint16 or int64. | |||||
| - **argmax** (Tensor) - Data type must be uint16 or int64. | |||||
| Outputs: | Outputs: | ||||
| Tensor, With data type same as `x`. | |||||
| Tensor, with data type same as `x`. | |||||
| """ | """ | ||||
| @@ -829,7 +829,7 @@ class MinimumGrad(Primitive): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, grad_x=True, grad_y=True): | def __init__(self, grad_x=True, grad_y=True): | ||||
| """Init MinimumGrad""" | |||||
| """Initialize MinimumGrad""" | |||||
| def __call__(self, x, y, dout): | def __call__(self, x, y, dout): | ||||
| raise NotImplementedError | raise NotImplementedError | ||||
| @@ -844,8 +844,8 @@ class L2NormalizeGrad(PrimitiveWithInfer): | |||||
| epsilon (float): A small value added for numerical stability. Default: 1e-4. | epsilon (float): A small value added for numerical stability. Default: 1e-4. | ||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) - Should be the input `weight` of forward operator L2Normalize. | |||||
| - **out** (Tensor) - Should be the output of forward operator L2Normalize. | |||||
| - **input_x** (Tensor) - Must be the input `weight` of forward operator L2Normalize. | |||||
| - **out** (Tensor) - Must be the output of forward operator L2Normalize. | |||||
| - **dout** (Tensor) - The backprop of the next layer. | - **dout** (Tensor) - The backprop of the next layer. | ||||
| Outputs: | Outputs: | ||||
| @@ -897,7 +897,7 @@ class LogSoftmaxGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, axis=-1): | def __init__(self, axis=-1): | ||||
| """init LogSoftmaxGrad""" | |||||
| """Initialize LogSoftmaxGrad""" | |||||
| validator.check_value_type("axis", axis, [int], self.name) | validator.check_value_type("axis", axis, [int], self.name) | ||||
| def infer_shape(self, dout, logits): | def infer_shape(self, dout, logits): | ||||
| @@ -1106,8 +1106,8 @@ class PReLUGrad(PrimitiveWithInfer): | |||||
| Inputs: | Inputs: | ||||
| - **y_backprop** (Tensor) - Representing the backprop of the next layer. | - **y_backprop** (Tensor) - Representing the backprop of the next layer. | ||||
| - **input_x** (Tensor) - Should be the input `input_x` of forward operator PRelu. | |||||
| - **weight** (Tensor) - Float Tensor, w > 0, should be the input `weight` of forward operator PRelu. | |||||
| - **input_x** (Tensor) - Must be the input `input_x` of forward operator PRelu. | |||||
| - **weight** (Tensor) - Float Tensor, w > 0, must be the input `weight` of forward operator PRelu. | |||||
| Outputs: | Outputs: | ||||
| Tensor, with the same type as `input_x`. | Tensor, with the same type as `input_x`. | ||||
| @@ -1135,7 +1135,7 @@ class ReluGrad(Primitive): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init ReluGrad""" | |||||
| """Initialize ReluGrad""" | |||||
| self.init_prim_io_names(inputs=['y_backprop', 'x'], outputs=['output']) | self.init_prim_io_names(inputs=['y_backprop', 'x'], outputs=['output']) | ||||
| def __call__(self, y_backprop, x): | def __call__(self, y_backprop, x): | ||||
| @@ -1185,7 +1185,7 @@ class EluGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """Init EluGrad""" | |||||
| """Initialize EluGrad""" | |||||
| def infer_shape(self, y_grad_shape, x_shape): | def infer_shape(self, y_grad_shape, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -1224,7 +1224,7 @@ class ResizeNearestNeighborGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, align_corners=False): | def __init__(self, align_corners=False): | ||||
| """Init ResizeNearestNeighborGrad""" | |||||
| """Initialize ResizeNearestNeighborGrad""" | |||||
| self.init_prim_io_names(inputs=['grads', 'size'], outputs=['y']) | self.init_prim_io_names(inputs=['grads', 'size'], outputs=['y']) | ||||
| def __infer__(self, grads, size): | def __infer__(self, grads, size): | ||||
| @@ -1247,7 +1247,7 @@ class ROIAlignGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, xdiff_shape, pooled_height, pooled_width, spatial_scale, sample_num=2): | def __init__(self, xdiff_shape, pooled_height, pooled_width, spatial_scale, sample_num=2): | ||||
| """init ROIAlignGrad""" | |||||
| """Initialize ROIAlignGrad""" | |||||
| validator.check_value_type("pooled_height", pooled_height, [int], self.name) | validator.check_value_type("pooled_height", pooled_height, [int], self.name) | ||||
| validator.check_value_type("pooled_width", pooled_width, [int], self.name) | validator.check_value_type("pooled_width", pooled_width, [int], self.name) | ||||
| validator.check_value_type("spatial_scale", spatial_scale, [float], self.name) | validator.check_value_type("spatial_scale", spatial_scale, [float], self.name) | ||||
| @@ -1319,7 +1319,7 @@ class SigmoidCrossEntropyWithLogitsGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """Init SigmoidCrossEntropyWithLogitsGrad""" | |||||
| """Initialize SigmoidCrossEntropyWithLogitsGrad""" | |||||
| self.init_prim_io_names(inputs=['x', 'y', 'dout'], outputs=['x_grad']) | self.init_prim_io_names(inputs=['x', 'y', 'dout'], outputs=['x_grad']) | ||||
| def infer_shape(self, x_shape, y_shape, dout_shape): | def infer_shape(self, x_shape, y_shape, dout_shape): | ||||
| @@ -1338,7 +1338,7 @@ class SliceGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init SliceGrad""" | |||||
| """Initialize SliceGrad""" | |||||
| self.init_prim_io_names(inputs=['dy', 'x', 'begin', 'size'], outputs=['dx']) | self.init_prim_io_names(inputs=['dy', 'x', 'begin', 'size'], outputs=['dx']) | ||||
| def __infer__(self, dy, x, begin, size): | def __infer__(self, dy, x, begin, size): | ||||
| @@ -1392,7 +1392,7 @@ class StridedSliceGrad(PrimitiveWithInfer): | |||||
| ellipsis_mask=0, | ellipsis_mask=0, | ||||
| new_axis_mask=0, | new_axis_mask=0, | ||||
| shrink_axis_mask=0): | shrink_axis_mask=0): | ||||
| """init StrideSliceGrad""" | |||||
| """Initialize StrideSliceGrad""" | |||||
| validator.check_value_type('begin_mask', begin_mask, [int], self.name) | validator.check_value_type('begin_mask', begin_mask, [int], self.name) | ||||
| validator.check_value_type('end_mask', end_mask, [int], self.name) | validator.check_value_type('end_mask', end_mask, [int], self.name) | ||||
| validator.check_value_type('ellipsis_mask', ellipsis_mask, [int], self.name) | validator.check_value_type('ellipsis_mask', ellipsis_mask, [int], self.name) | ||||
| @@ -1440,7 +1440,7 @@ class StridedSliceGradAICPU(PrimitiveWithInfer): | |||||
| ellipsis_mask=0, | ellipsis_mask=0, | ||||
| new_axis_mask=0, | new_axis_mask=0, | ||||
| shrink_axis_mask=0): | shrink_axis_mask=0): | ||||
| """init StrideSliceGrad""" | |||||
| """Initialize StrideSliceGrad""" | |||||
| validator.check_value_type('begin_mask', begin_mask, [int], self.name) | validator.check_value_type('begin_mask', begin_mask, [int], self.name) | ||||
| validator.check_value_type('end_mask', end_mask, [int], self.name) | validator.check_value_type('end_mask', end_mask, [int], self.name) | ||||
| validator.check_value_type('ellipsis_mask', ellipsis_mask, [int], self.name) | validator.check_value_type('ellipsis_mask', ellipsis_mask, [int], self.name) | ||||
| @@ -1504,7 +1504,7 @@ class MirrorPadGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, mode="REFLECT"): | def __init__(self, mode="REFLECT"): | ||||
| """init MirrorPad""" | |||||
| """Initialize MirrorPad""" | |||||
| validator.check_string('mode', mode, ['REFLECT', 'SYMMETRIC'], self.name) | validator.check_string('mode', mode, ['REFLECT', 'SYMMETRIC'], self.name) | ||||
| self.mode = mode | self.mode = mode | ||||
| @@ -1528,7 +1528,7 @@ class MirrorPadGrad(PrimitiveWithInfer): | |||||
| class EmbeddingLookupCommGrad(PrimitiveWithInfer): | class EmbeddingLookupCommGrad(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Perform the gradient for the communication part of EmbeddingLookup operator. | |||||
| Performs the gradient for the communication part of the EmbeddingLookup operator. | |||||
| This works ONLY when 'reduce_scatter_flag' is True in 'EmbeddingLookup'. Roughly speaking, | This works ONLY when 'reduce_scatter_flag' is True in 'EmbeddingLookup'. Roughly speaking, | ||||
| this primitive is implemented by StridedSlice --> _HostAllGather --> Concat. This primitive runs on host. | this primitive is implemented by StridedSlice --> _HostAllGather --> Concat. This primitive runs on host. | ||||
| @@ -1542,7 +1542,7 @@ class EmbeddingLookupCommGrad(PrimitiveWithInfer): | |||||
| def __infer__(self, dy, split_num): | def __infer__(self, dy, split_num): | ||||
| """ | """ | ||||
| This primitive is implemented by three steps: | This primitive is implemented by three steps: | ||||
| 1) Split the 'dy' along dimension 0 into 'split_num' parts. | |||||
| 1) Splits the 'dy' along dimension 0 into 'split_num' parts. | |||||
| 2) For each part, perform _HostAllGather((0, 1, 2, 3, 4, 5, 6, 7)) on the host. | 2) For each part, perform _HostAllGather((0, 1, 2, 3, 4, 5, 6, 7)) on the host. | ||||
| 3) After _HostAllGather, there are still 'split_num' parts in each process. Then, perform Concat on them | 3) After _HostAllGather, there are still 'split_num' parts in each process. Then, perform Concat on them | ||||
| along dimension 0. | along dimension 0. | ||||
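A NumPy stand-in for the three steps, purely illustrative (the `all_gather` callable is hypothetical, and `dy.shape[0]` is assumed divisible by `split_num`):

>>> import numpy as np
>>> def comm_grad_sketch(dy, split_num, all_gather):
...     parts = np.split(dy, split_num, axis=0)    # 1) split 'dy' along dimension 0
...     gathered = [all_gather(p) for p in parts]  # 2) _HostAllGather on each part
...     return np.concatenate(gathered, axis=0)    # 3) concat the parts along dimension 0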
| @@ -1600,7 +1600,7 @@ class AtanGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init AtanGrad""" | |||||
| """Initialize AtanGrad""" | |||||
| def infer_shape(self, x, dout): | def infer_shape(self, x, dout): | ||||
| validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | validator.check("x shape", x, "dout shape", dout, Rel.EQ, self.name) | ||||
| @@ -34,7 +34,7 @@ class StridedSliceAICPU(PrimitiveWithInfer): | |||||
| Note: | Note: | ||||
| The stride may be a negative value, which causes reverse slicing. | The stride may be a negative value, which causes reverse slicing. | ||||
| The shape of `begin`, `end` and `strides` should be the same. | |||||
| The shape of `begin`, `end` and `strides` must be the same. | |||||
| Args: | Args: | ||||
| begin_mask (int): Starting index of the slice. Default: 0. | begin_mask (int): Starting index of the slice. Default: 0. | ||||
| @@ -85,7 +85,7 @@ class StridedSliceAICPU(PrimitiveWithInfer): | |||||
| ellipsis_mask=0, | ellipsis_mask=0, | ||||
| new_axis_mask=0, | new_axis_mask=0, | ||||
| shrink_axis_mask=0): | shrink_axis_mask=0): | ||||
| """init StrideSlice""" | |||||
| """Initialize StrideSlice""" | |||||
| self.init_prim_io_names(inputs=['x', 'begin', 'end', 'strides'], outputs=['output']) | self.init_prim_io_names(inputs=['x', 'begin', 'end', 'strides'], outputs=['output']) | ||||
| validator.check_value_type('begin_mask', begin_mask, [int], self.name) | validator.check_value_type('begin_mask', begin_mask, [int], self.name) | ||||
| validator.check_value_type('end_mask', end_mask, [int], self.name) | validator.check_value_type('end_mask', end_mask, [int], self.name) | ||||
| @@ -155,16 +155,16 @@ class StridedSliceAICPU(PrimitiveWithInfer): | |||||
| class ExtractImagePatches(PrimitiveWithInfer): | class ExtractImagePatches(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Extract patches from images. | |||||
| Extracts patches from images. | |||||
| The input tensor must be a 4-D tensor and the data format is NHWC. | The input tensor must be a 4-D tensor and the data format is NHWC. | ||||
| Args: | Args: | ||||
| ksizes (Union[tuple[int], list[int]]): The size of sliding window, should be a tuple or a list of integers, | |||||
| ksizes (Union[tuple[int], list[int]]): The size of sliding window, must be a tuple or a list of integers, | |||||
| and the format is [1, ksize_row, ksize_col, 1]. | and the format is [1, ksize_row, ksize_col, 1]. | ||||
| strides (Union[tuple[int], list[int]]): Distance between the centers of the two consecutive patches, | strides (Union[tuple[int], list[int]]): Distance between the centers of the two consecutive patches, | ||||
| should be a tuple or list of int, and the format is [1, stride_row, stride_col, 1]. | |||||
| must be a tuple or list of int, and the format is [1, stride_row, stride_col, 1]. | |||||
| rates (Union[tuple[int], list[int]]): In each extracted patch, the gap between the corresponding dimension | rates (Union[tuple[int], list[int]]): In each extracted patch, the gap between the corresponding dimension | ||||
| pixel positions, should be a tuple or a list of integers, and the format is [1, rate_row, rate_col, 1]. | |||||
| pixel positions, must be a tuple or a list of integers, and the format is [1, rate_row, rate_col, 1]. | |||||
| padding (str): The type of padding algorithm, is a string whose value is "same" or "valid", | padding (str): The type of padding algorithm, is a string whose value is "same" or "valid", | ||||
| not case sensitive. Default: "valid". | not case sensitive. Default: "valid". | ||||
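A usage sketch of the argument formats above; only the class name and documented arguments are used, and `input_x` stands for a 4-D NHWC tensor as stated:

>>> extract = ExtractImagePatches(ksizes=[1, 3, 3, 1],
...                               strides=[1, 1, 1, 1],
...                               rates=[1, 1, 1, 1],
...                               padding="valid")
>>> patches = extract(input_x)  # input_x: 4-D tensor in NHWC format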
| @@ -311,11 +311,11 @@ class Quant(PrimitiveWithInfer): | |||||
| scale (float) : Specifies the scaling ratio. | scale (float) : Specifies the scaling ratio. | ||||
| offset (float): Specifies the offset. | offset (float): Specifies the offset. | ||||
| sqrt_mode (bool) : Specifies whether to perform square root on `scale`. Default: False. | sqrt_mode (bool) : Specifies whether to perform square root on `scale`. Default: False. | ||||
| round_mode (str): Specifies the way to round. Should be one of ["Round", "Floor", "Ceil", "Trunc"]. | |||||
| round_mode (str): Specifies the way to round. Must be one of ["Round", "Floor", "Ceil", "Trunc"]. | |||||
| Default: "Round". | Default: "Round". | ||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) : Input tensor. Its data type should be mindspore.float16 or mindspore.float32. | |||||
| - **input_x** (Tensor) : Input tensor. Its data type must be mindspore.float16 or mindspore.float32. | |||||
| Outputs: | Outputs: | ||||
| - Tensor: The quantized output tensor of type mindspore.int8. | - Tensor: The quantized output tensor of type mindspore.int8. | ||||
| @@ -367,9 +367,9 @@ class Dequant(PrimitiveWithInfer): | |||||
| relu_flag (bool): Specifies whether to perform ReLU. Default: False. | relu_flag (bool): Specifies whether to perform ReLU. Default: False. | ||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) : Input tensor. Should be mindspore.int32. | |||||
| - **input_x** (Tensor) : Input tensor. Must be mindspore.int32. | |||||
| - **deq_scale** (Tensor) : Specifies the scaling ratio. | - **deq_scale** (Tensor) : Specifies the scaling ratio. | ||||
| Data type should be mindspore.float16 or mindspore.uint64 | |||||
| Data type must be mindspore.float16 or mindspore.uint64. | |||||
| Outputs: | Outputs: | ||||
| - Tensor: The quantized output tensor of type mindspore.float16. | - Tensor: The quantized output tensor of type mindspore.float16. | ||||
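A paired sketch of the two primitives, using only the parameters documented above (the input tensors `input_fp` and `acc_int32`, and the `deq_scale` tensor, are assumed):

>>> quant = Quant(scale=80.0, offset=0.0, sqrt_mode=False, round_mode="Round")
>>> q = quant(input_fp)                # float16/float32 in, int8 out
>>> dequant = Dequant(relu_flag=False)
>>> y = dequant(acc_int32, deq_scale)  # int32 in, float16 out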
| @@ -463,7 +463,7 @@ class MatrixDiag(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init MatrixDiag""" | |||||
| """Initialize MatrixDiag""" | |||||
| def infer_dtype(self, x_dtype, assist_dtype): | def infer_dtype(self, x_dtype, assist_dtype): | ||||
| valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] | valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] | ||||
| @@ -499,7 +499,7 @@ class MatrixDiagPart(PrimitiveWithInfer): | |||||
| - **assist** (Tensor) - An eye tensor with the same type and shape as `x`. | - **assist** (Tensor) - An eye tensor with the same type and shape as `x`. | ||||
| Outputs: | Outputs: | ||||
| Tensor, data type same as input `x`. The shape should be x.shape[:-2] + [min(x.shape[-2:])]. | |||||
| Tensor, data type same as input `x`. The shape is x.shape[:-2] + [min(x.shape[-2:])]. | |||||
| Examples: | Examples: | ||||
| >>> x = Tensor([[[-1, 0], [0, 1]], [[-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32) | >>> x = Tensor([[[-1, 0], [0, 1]], [[-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32) | ||||
| @@ -511,7 +511,7 @@ class MatrixDiagPart(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init MatrixDiagPart""" | |||||
| """Initialize MatrixDiagPart""" | |||||
| def infer_dtype(self, x_dtype, assist_dtype): | def infer_dtype(self, x_dtype, assist_dtype): | ||||
| valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] | valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] | ||||
| @@ -532,7 +532,7 @@ class MatrixDiagPart(PrimitiveWithInfer): | |||||
| class MatrixSetDiag(PrimitiveWithInfer): | class MatrixSetDiag(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Modify the batched diagonal part of a batched tensor. | |||||
| Modifies the batched diagonal part of a batched tensor. | |||||
| Inputs: | Inputs: | ||||
| - **x** (Tensor) - The batched tensor. It can be one of the following data types: | - **x** (Tensor) - The batched tensor. It can be one of the following data types: | ||||
| @@ -554,7 +554,7 @@ class MatrixSetDiag(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init MatrixSetDiag""" | |||||
| """Initialize MatrixSetDiag""" | |||||
| def infer_dtype(self, x_dtype, diagonal_dtype, assist_dtype): | def infer_dtype(self, x_dtype, diagonal_dtype, assist_dtype): | ||||
| valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] | valid_type = [mstype.float16, mstype.float32, mstype.int32, mstype.int8, mstype.uint8] | ||||
| @@ -44,10 +44,10 @@ __all__ = ["MinMaxUpdatePerLayer", | |||||
| class MinMaxUpdatePerLayer(PrimitiveWithInfer): | class MinMaxUpdatePerLayer(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Update min and max per layer. | |||||
| Updates min and max per layer. | |||||
| Args: | Args: | ||||
| ema (bool): Use EMA algorithm update value min and max. Default: False. | |||||
| ema (bool): Uses the EMA algorithm to update the min and max values. Default: False. | |||||
| ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ||||
| Inputs: | Inputs: | ||||
| @@ -56,7 +56,7 @@ class MinMaxUpdatePerLayer(PrimitiveWithInfer): | |||||
| - **max** (Tensor) : Value of the max range of the input data x. | - **max** (Tensor) : Value of the max range of the input data x. | ||||
| Outputs: | Outputs: | ||||
| - Tensor: Simulate quantize tensor of x. | |||||
| - Tensor: Simulated quantized tensor of x. | |||||
| Examples: | Examples: | ||||
| >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) | >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) | ||||
| @@ -68,7 +68,7 @@ class MinMaxUpdatePerLayer(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, ema=False, ema_decay=0.999): | def __init__(self, ema=False, ema_decay=0.999): | ||||
| """init FakeQuantMinMaxPerLayerUpdate OP""" | |||||
| """Initialize FakeQuantMinMaxPerLayerUpdate OP""" | |||||
| if context.get_context('device_target') == "Ascend": | if context.get_context('device_target') == "Ascend": | ||||
| from mindspore.ops._op_impl._custom_op import minmax_update_perlayer | from mindspore.ops._op_impl._custom_op import minmax_update_perlayer | ||||
| if ema and not ema_decay: | if ema and not ema_decay: | ||||
| @@ -101,10 +101,10 @@ class MinMaxUpdatePerLayer(PrimitiveWithInfer): | |||||
| class MinMaxUpdatePerChannel(PrimitiveWithInfer): | class MinMaxUpdatePerChannel(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Update min and max per channel. | |||||
| Updates min and max per channel. | |||||
| Args: | Args: | ||||
| ema (bool): Use EMA algorithm update value min and max. Default: False. | |||||
| ema (bool): Uses the EMA algorithm to update the min and max values. Default: False. | |||||
| ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ||||
| channel_axis (int): Quantization by channel axis. Ascend backend only supports 0 or 1. Default: 1. | channel_axis (int): Quantization by channel axis. Ascend backend only supports 0 or 1. Default: 1. | ||||
| @@ -114,7 +114,7 @@ class MinMaxUpdatePerChannel(PrimitiveWithInfer): | |||||
| - **max** (Tensor) : Value of the max range of the input data x. | - **max** (Tensor) : Value of the max range of the input data x. | ||||
| Outputs: | Outputs: | ||||
| - Tensor: Simulate quantize tensor of x. | |||||
| - Tensor: Simulated quantized tensor of x. | |||||
| Examples: | Examples: | ||||
| >>> x = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) | >>> x = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) | ||||
| @@ -127,7 +127,7 @@ class MinMaxUpdatePerChannel(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, ema=False, ema_decay=0.999, channel_axis=1): | def __init__(self, ema=False, ema_decay=0.999, channel_axis=1): | ||||
| """init FakeQuantPerChannelUpdate OP for Ascend""" | |||||
| """Initialize FakeQuantPerChannelUpdate OP for Ascend""" | |||||
| self.is_ascend = context.get_context('device_target') == "Ascend" | self.is_ascend = context.get_context('device_target') == "Ascend" | ||||
| if self.is_ascend: | if self.is_ascend: | ||||
| from mindspore.ops._op_impl._custom_op import minmax_update_perchannel | from mindspore.ops._op_impl._custom_op import minmax_update_perchannel | ||||
| @@ -169,11 +169,11 @@ class MinMaxUpdatePerChannel(PrimitiveWithInfer): | |||||
| class FakeQuantPerLayer(PrimitiveWithInfer): | class FakeQuantPerLayer(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Simulate the quantize and dequantize operations in training time. | |||||
| Simulates the quantize and dequantize operations in training time. | |||||
| Args: | Args: | ||||
| num_bits (int): Number of bits for quantization-aware training. Default: 8. | num_bits (int): Number of bits for quantization-aware training. Default: 8. | ||||
| ema (bool): Use EMA algorithm update value min and max. Default: False. | |||||
| ema (bool): Uses the EMA algorithm to update the min and max values. Default: False. | |||||
| ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ||||
| quant_delay (int): Quantization delay parameter. Before delay step in training time not update | quant_delay (int): Quantization delay parameter. Before delay step in training time not update | ||||
| simulate quantization aware function. After delay step in training time begin simulate the aware | simulate quantization aware function. After delay step in training time begin simulate the aware | ||||
| @@ -188,7 +188,7 @@ class FakeQuantPerLayer(PrimitiveWithInfer): | |||||
| - **max** (Tensor) : Value of the max range of the input data x. | - **max** (Tensor) : Value of the max range of the input data x. | ||||
| Outputs: | Outputs: | ||||
| - Tensor: Simulate quantize tensor of x. | |||||
| - Tensor: Simulated quantized tensor of x. | |||||
| Examples: | Examples: | ||||
| >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) | >>> input_tensor = Tensor(np.random.rand(3, 16, 5, 5), mstype.float32) | ||||
| @@ -207,7 +207,7 @@ class FakeQuantPerLayer(PrimitiveWithInfer): | |||||
| symmetric=False, | symmetric=False, | ||||
| narrow_range=False, | narrow_range=False, | ||||
| training=True): | training=True): | ||||
| """init FakeQuantPerLayer OP""" | |||||
| """Initialize FakeQuantPerLayer OP""" | |||||
| if context.get_context('device_target') == "Ascend": | if context.get_context('device_target') == "Ascend": | ||||
| from mindspore.ops._op_impl._custom_op import fake_quant_perlayer | from mindspore.ops._op_impl._custom_op import fake_quant_perlayer | ||||
| if num_bits not in self.support_quant_bit: | if num_bits not in self.support_quant_bit: | ||||
| @@ -309,11 +309,11 @@ class FakeQuantPerLayerGrad(PrimitiveWithInfer): | |||||
| class FakeQuantPerChannel(PrimitiveWithInfer): | class FakeQuantPerChannel(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Simulate the quantize and dequantize operations in training time base on per channel. | |||||
| Simulates the quantize and dequantize operations in training time on a per-channel basis. | |||||
| Args: | Args: | ||||
| num_bits (int): Number of bits for quantization. Default: 8. | num_bits (int): Number of bits for quantization. Default: 8. | ||||
| ema (bool): Use EMA algorithm update tensor min and tensor max. Default: False. | |||||
| ema (bool): Uses the EMA algorithm to update the tensor min and max. Default: False. | |||||
| ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ema_decay (int) : EMA algorithm decay parameter. Default: 0.999. | ||||
| quant_delay (int): Quantization delay parameter. Before delay step in training time not | quant_delay (int): Quantization delay parameter. Before delay step in training time not | ||||
| update the weight data to simulate quantize operation. After delay step in training time | update the weight data to simulate quantize operation. After delay step in training time | ||||
| @@ -351,7 +351,7 @@ class FakeQuantPerChannel(PrimitiveWithInfer): | |||||
| narrow_range=False, | narrow_range=False, | ||||
| training=True, | training=True, | ||||
| channel_axis=1): | channel_axis=1): | ||||
| """init FakeQuantPerChannel OP""" | |||||
| """Initialize FakeQuantPerChannel OP""" | |||||
| self.is_ascend = context.get_context('device_target') == "Ascend" | self.is_ascend = context.get_context('device_target') == "Ascend" | ||||
| if self.is_ascend: | if self.is_ascend: | ||||
| from mindspore.ops._op_impl._custom_op import fake_quant_perchannel | from mindspore.ops._op_impl._custom_op import fake_quant_perchannel | ||||
| @@ -426,7 +426,7 @@ class FakeQuantPerChannelGrad(PrimitiveWithInfer): | |||||
| symmetric=False, | symmetric=False, | ||||
| narrow_range=False, | narrow_range=False, | ||||
| channel_axis=1): | channel_axis=1): | ||||
| """init FakeQuantPerChannelGrad Fill""" | |||||
| """Initialize FakeQuantPerChannelGrad Fill""" | |||||
| if context.get_context('device_target') == "Ascend": | if context.get_context('device_target') == "Ascend": | ||||
| from mindspore.ops._op_impl._custom_op import fake_quant_perchannel_grad | from mindspore.ops._op_impl._custom_op import fake_quant_perchannel_grad | ||||
| if num_bits not in self.support_quant_bit: | if num_bits not in self.support_quant_bit: | ||||
| @@ -468,7 +468,7 @@ class BatchNormFold(PrimitiveWithInfer): | |||||
| Batch normalization folded. | Batch normalization folded. | ||||
| Args: | Args: | ||||
| momentum (float): Momentum value should be [0, 1]. Default: 0.9. | |||||
| momentum (float): Momentum value must be in the range [0, 1]. Default: 0.9. | |||||
| epsilon (float): A small float number to avoid dividing by 0. 1e-5 if dtype in | epsilon (float): A small float number to avoid dividing by 0. 1e-5 if dtype in | ||||
| float32 else 1e-3. Default: 1e-5. | float32 else 1e-3. Default: 1e-5. | ||||
| is_training (bool): In training mode set True, else set False. Default: True. | is_training (bool): In training mode set True, else set False. Default: True. | ||||
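The momentum argument follows the usual running-statistics convention; a sketch under that assumption (not taken from this file):

>>> momentum = 0.9
>>> running_mean, batch_mean = 0.0, 1.5  # illustrative values
>>> running_mean = momentum * running_mean + (1 - momentum) * batch_mean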
| @@ -501,7 +501,7 @@ class BatchNormFold(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0): | def __init__(self, momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0): | ||||
| """init batch norm fold layer""" | |||||
| """Initialize batch norm fold layer""" | |||||
| self.momentum = validator.check_number_range('momentum', momentum, 0, 1, Rel.INC_BOTH, self.name) | self.momentum = validator.check_number_range('momentum', momentum, 0, 1, Rel.INC_BOTH, self.name) | ||||
| self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | ||||
| self.is_training = validator.check_value_type('is_training', is_training, (bool,), self.name) | self.is_training = validator.check_value_type('is_training', is_training, (bool,), self.name) | ||||
| @@ -543,7 +543,7 @@ class BatchNormFoldGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, epsilon=1e-5, is_training=True, freeze_bn=0): | def __init__(self, epsilon=1e-5, is_training=True, freeze_bn=0): | ||||
| """init BatchNormGrad layer""" | |||||
| """Initialize BatchNormGrad layer""" | |||||
| self.is_training = validator.check_value_type('is_training', is_training, (bool,), self.name) | self.is_training = validator.check_value_type('is_training', is_training, (bool,), self.name) | ||||
| self.freeze_bn = validator.check_value_type('freeze_bn', freeze_bn, (int,), self.name) | self.freeze_bn = validator.check_value_type('freeze_bn', freeze_bn, (int,), self.name) | ||||
| self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | ||||
| @@ -574,7 +574,7 @@ class BatchNormFoldGrad(PrimitiveWithInfer): | |||||
| class CorrectionMul(PrimitiveWithInfer): | class CorrectionMul(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Scale the weights with a correction factor to the long term statistics | |||||
| Scales the weights with a correction factor to the long term statistics | |||||
| prior to quantization. This ensures that there is no jitter in the quantized weights | prior to quantization. This ensures that there is no jitter in the quantized weights | ||||
| due to batch to batch variation. | due to batch to batch variation. | ||||
| @@ -596,7 +596,7 @@ class CorrectionMul(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, channel_axis=0): | def __init__(self, channel_axis=0): | ||||
| """init correction mul layer""" | |||||
| """Initialize correction mul layer""" | |||||
| if context.get_context('device_target') == "Ascend": | if context.get_context('device_target') == "Ascend": | ||||
| from mindspore.ops._op_impl._custom_op import correction_mul | from mindspore.ops._op_impl._custom_op import correction_mul | ||||
| self.channel_axis = channel_axis | self.channel_axis = channel_axis | ||||
| @@ -630,7 +630,7 @@ class CorrectionMulGrad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, channel_axis=0): | def __init__(self, channel_axis=0): | ||||
| """init correction mul layer""" | |||||
| """Initialize correction mul layer""" | |||||
| if context.get_context('device_target') == "Ascend": | if context.get_context('device_target') == "Ascend": | ||||
| from mindspore.ops._op_impl._custom_op import correction_mul_grad | from mindspore.ops._op_impl._custom_op import correction_mul_grad | ||||
| self.channel_axis = channel_axis | self.channel_axis = channel_axis | ||||
| @@ -670,7 +670,7 @@ class CorrectionMulGradReduce(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, channel_axis=0): | def __init__(self, channel_axis=0): | ||||
| """init correction mul reduce layer""" | |||||
| """Initialize correction mul reduce layer""" | |||||
| if context.get_context('device_target') == "Ascend": | if context.get_context('device_target') == "Ascend": | ||||
| from mindspore.ops._op_impl._custom_op import correction_mul_grad | from mindspore.ops._op_impl._custom_op import correction_mul_grad | ||||
| self.channel_axis = channel_axis | self.channel_axis = channel_axis | ||||
| @@ -686,7 +686,7 @@ class CorrectionMulGradReduce(PrimitiveWithInfer): | |||||
| class BatchNormFold2(PrimitiveWithInfer): | class BatchNormFold2(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Scale the bias with a correction factor to the long term statistics | |||||
| Scales the bias with a correction factor to the long term statistics | |||||
| prior to quantization. This ensures that there is no jitter in the quantized bias | prior to quantization. This ensures that there is no jitter in the quantized bias | ||||
| due to batch to batch variation. | due to batch to batch variation. | ||||
| @@ -720,7 +720,7 @@ class BatchNormFold2(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, freeze_bn=0): | def __init__(self, freeze_bn=0): | ||||
| """init conv2d fold layer""" | |||||
| """Initialize conv2d fold layer""" | |||||
| self.freeze_bn = validator.check_value_type('freeze_bn', freeze_bn, (int,), self.name) | self.freeze_bn = validator.check_value_type('freeze_bn', freeze_bn, (int,), self.name) | ||||
| self.init_prim_io_names(inputs=['x', 'beta', 'gamma', 'batch_std', 'batch_mean', | self.init_prim_io_names(inputs=['x', 'beta', 'gamma', 'batch_std', 'batch_mean', | ||||
| 'running_std', 'running_mean', 'global_step'], | 'running_std', 'running_mean', 'global_step'], | ||||
| @@ -767,7 +767,7 @@ class BatchNormFold2Grad(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, freeze_bn=0): | def __init__(self, freeze_bn=0): | ||||
| """init MulFold layer""" | |||||
| """Initialize MulFold layer""" | |||||
| self.freeze_bn = freeze_bn | self.freeze_bn = freeze_bn | ||||
| self.init_prim_io_names(inputs=['dout', 'x', 'gamma', | self.init_prim_io_names(inputs=['dout', 'x', 'gamma', | ||||
| 'batch_std', 'batch_mean', | 'batch_std', 'batch_mean', | ||||
| @@ -811,7 +811,7 @@ class BatchNormFoldD(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0): | def __init__(self, momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0): | ||||
| """init _BatchNormFold layer""" | |||||
| """Initialize _BatchNormFold layer""" | |||||
| from mindspore.ops._op_impl._custom_op import batchnorm_fold | from mindspore.ops._op_impl._custom_op import batchnorm_fold | ||||
| self.momentum = validator.check_number_range('momentum', momentum, 0, 1, Rel.INC_BOTH, self.name) | self.momentum = validator.check_number_range('momentum', momentum, 0, 1, Rel.INC_BOTH, self.name) | ||||
| self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | ||||
| @@ -840,7 +840,7 @@ class BatchNormFoldGradD(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, epsilon=1e-5, is_training=True, freeze_bn=0): | def __init__(self, epsilon=1e-5, is_training=True, freeze_bn=0): | ||||
| """init _BatchNormFoldGrad layer""" | |||||
| """Initialize _BatchNormFoldGrad layer""" | |||||
| from mindspore.ops._op_impl._custom_op import batchnorm_fold_grad | from mindspore.ops._op_impl._custom_op import batchnorm_fold_grad | ||||
| self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | self.epsilon = validator.check_float_positive('epsilon', epsilon, self.name) | ||||
| self.is_training = validator.check_value_type('is_training', is_training, (bool,), self.name) | self.is_training = validator.check_value_type('is_training', is_training, (bool,), self.name) | ||||
| @@ -867,7 +867,7 @@ class BatchNormFoldGradD(PrimitiveWithInfer): | |||||
| class BatchNormFold2_D(PrimitiveWithInfer): | class BatchNormFold2_D(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Scale the bias with a correction factor to the long term statistics | |||||
| Scales the bias with a correction factor to the long term statistics | |||||
| prior to quantization. This ensures that there is no jitter in the quantized bias | prior to quantization. This ensures that there is no jitter in the quantized bias | ||||
| due to batch to batch variation. | due to batch to batch variation. | ||||
| @@ -889,7 +889,7 @@ class BatchNormFold2_D(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, freeze_bn=0): | def __init__(self, freeze_bn=0): | ||||
| """init conv2d fold layer""" | |||||
| """Initialize conv2d fold layer""" | |||||
| from mindspore.ops._op_impl._custom_op import batchnorm_fold2 | from mindspore.ops._op_impl._custom_op import batchnorm_fold2 | ||||
| self.init_prim_io_names(inputs=['x', 'beta', 'gamma', 'batch_std', 'batch_mean', 'running_std'], | self.init_prim_io_names(inputs=['x', 'beta', 'gamma', 'batch_std', 'batch_mean', 'running_std'], | ||||
| outputs=['y']) | outputs=['y']) | ||||
| @@ -916,7 +916,7 @@ class BatchNormFold2GradD(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, freeze_bn=False): | def __init__(self, freeze_bn=False): | ||||
| """init MulFold layer""" | |||||
| """Initialize MulFold layer""" | |||||
| from mindspore.ops._op_impl._custom_op import batchnorm_fold2_grad | from mindspore.ops._op_impl._custom_op import batchnorm_fold2_grad | ||||
| self.freeze_bn = freeze_bn | self.freeze_bn = freeze_bn | ||||
| self.init_prim_io_names( | self.init_prim_io_names( | ||||
| @@ -954,7 +954,7 @@ class BatchNormFold2GradReduce(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, freeze_bn=False): | def __init__(self, freeze_bn=False): | ||||
| """init MulFold layer""" | |||||
| """Initialize MulFold layer""" | |||||
| from mindspore.ops._op_impl._custom_op import batchnorm_fold2_grad_reduce | from mindspore.ops._op_impl._custom_op import batchnorm_fold2_grad_reduce | ||||
| self.freeze_bn = freeze_bn | self.freeze_bn = freeze_bn | ||||
| self.init_prim_io_names(inputs=['dout', 'x'], | self.init_prim_io_names(inputs=['dout', 'x'], | ||||
| @@ -88,7 +88,7 @@ class CusBatchMatMul(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusBatchMatMul""" | |||||
| """Initialize CusBatchMatMul""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.batch_matmul_impl import CusBatchMatMul | from mindspore.ops._op_impl._custom_op.batch_matmul_impl import CusBatchMatMul | ||||
| @@ -121,7 +121,7 @@ class CusCholeskyTrsm(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusCholeskyTrsm""" | |||||
| """Initialize CusCholeskyTrsm""" | |||||
| self.init_prim_io_names(inputs=['x1'], outputs=['y']) | self.init_prim_io_names(inputs=['x1'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.cholesky_trsm_impl import CusCholeskyTrsm | from mindspore.ops._op_impl._custom_op.cholesky_trsm_impl import CusCholeskyTrsm | ||||
| @@ -140,7 +140,7 @@ class CusCholeskyTrsm(PrimitiveWithInfer): | |||||
| class CusFusedAbsMax1(PrimitiveWithInfer): | class CusFusedAbsMax1(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Compute the abs max of Tensor input. | |||||
| Computes the abs max of Tensor input. | |||||
| The rank of input tensors must be `4` or `2`. | The rank of input tensors must be `4` or `2`. | ||||
| Inputs: | Inputs: | ||||
| @@ -157,7 +157,7 @@ class CusFusedAbsMax1(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, origin_shape=[-1, -1]): | def __init__(self, origin_shape=[-1, -1]): | ||||
| """init CusFusedAbsMax1""" | |||||
| """Initialize CusFusedAbsMax1""" | |||||
| self.init_prim_io_names(inputs=['x1'], outputs=['y']) | self.init_prim_io_names(inputs=['x1'], outputs=['y']) | ||||
| self.origin_shape = origin_shape | self.origin_shape = origin_shape | ||||
| from mindspore.ops._op_impl._custom_op.fused_abs_max1_impl import CusFusedAbsMax1 | from mindspore.ops._op_impl._custom_op.fused_abs_max1_impl import CusFusedAbsMax1 | ||||
| @@ -176,7 +176,7 @@ class CusFusedAbsMax1(PrimitiveWithInfer): | |||||
| class CusImg2Col(PrimitiveWithInfer): | class CusImg2Col(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Img2col the feature map and the result in reorganized in NC1HWC0. | |||||
| Img2cols the feature map and the result is reorganized in NC1HWC0. | |||||
| Args: | Args: | ||||
| - **strides** (listInt) - the stride of the ops. | - **strides** (listInt) - the stride of the ops. | ||||
| @@ -193,7 +193,7 @@ class CusImg2Col(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, ksizes, strides, dilates=(1, 1, 1, 1), mode="NC1HWC0"): | def __init__(self, ksizes, strides, dilates=(1, 1, 1, 1), mode="NC1HWC0"): | ||||
| """init CusImg2Col""" | |||||
| """Initialize CusImg2Col""" | |||||
| self.init_prim_io_names(inputs=['x1'], outputs=['y']) | self.init_prim_io_names(inputs=['x1'], outputs=['y']) | ||||
| self.ksizes = ksizes | self.ksizes = ksizes | ||||
| self.strides = strides | self.strides = strides | ||||
| @@ -239,7 +239,7 @@ class CusMatMulCubeDenseLeft(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusMatMulCubeDenseLeft""" | |||||
| """Initialize CusMatMulCubeDenseLeft""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft | from mindspore.ops._op_impl._custom_op.matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft | ||||
| @@ -274,7 +274,7 @@ class CusMatMulCubeFraczRightMul(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusMatMulCubeFraczRightMul""" | |||||
| """Initialize CusMatMulCubeFraczRightMul""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul | from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul | ||||
| @@ -292,14 +292,14 @@ class CusMatMulCube(PrimitiveWithInfer): | |||||
| The rank of input tensors must be `2`. | The rank of input tensors must be `2`. | ||||
| Args: | Args: | ||||
| transpose_a (bool): If True, `a` is transposed before multiplication. Default: False. | |||||
| transpose_b (bool): If True, `b` is transposed before multiplication. Default: False. | |||||
| transpose_a (bool): If true, `a` is transposed before multiplication. Default: False. | |||||
| transpose_b (bool): If true, `b` is transposed before multiplication. Default: False. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(N, C)`. If | - **input_x** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(N, C)`. If | ||||
| `transpose_a` is True, its shape should be :math:`(N, C)` after transposing. | |||||
| `transpose_a` is True, its shape must be :math:`(N, C)` after transposing. | |||||
| - **input_y** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(C, M)`. If | - **input_y** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(C, M)`. If | ||||
| `transpose_b` is True, its shape should be :math:`(C, M)` after transpose. | |||||
| `transpose_b` is True, its shape must be :math:`(C, M)` after transposing. | |||||
| Outputs: | Outputs: | ||||
| Tensor, the shape of the output tensor is :math:`(N, M)`. | Tensor, the shape of the output tensor is :math:`(N, M)`. | ||||
| @@ -313,7 +313,7 @@ class CusMatMulCube(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, transpose_a=False, transpose_b=False): | def __init__(self, transpose_a=False, transpose_b=False): | ||||
| """init CusMatMulCube""" | |||||
| """Initialize CusMatMulCube""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | ||||
| self.transpose_a = transpose_a | self.transpose_a = transpose_a | ||||
| self.transpose_b = transpose_b | self.transpose_b = transpose_b | ||||
| @@ -355,7 +355,7 @@ class CusMatrixCombine(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusMatrixCombine""" | |||||
| """Initialize CusMatrixCombine""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['y']) | self.init_prim_io_names(inputs=['x'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.matrix_combine_impl import CusMatrixCombine | from mindspore.ops._op_impl._custom_op.matrix_combine_impl import CusMatrixCombine | ||||
| @@ -389,7 +389,7 @@ class CusTranspose02314(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusTranspose02314""" | |||||
| """Initialize CusTranspose02314""" | |||||
| self.init_prim_io_names(inputs=['x1'], outputs=['y']) | self.init_prim_io_names(inputs=['x1'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.transpose02314_impl import CusTranspose02314 | from mindspore.ops._op_impl._custom_op.transpose02314_impl import CusTranspose02314 | ||||
| @@ -435,7 +435,7 @@ class CusMatMulCubeDenseRight(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusMatMulCubeDenseRight""" | |||||
| """Initialize CusMatMulCubeDenseRight""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.matmul_cube_dense_right_impl import CusMatMulCubeDenseRight | from mindspore.ops._op_impl._custom_op.matmul_cube_dense_right_impl import CusMatMulCubeDenseRight | ||||
| @@ -470,7 +470,7 @@ class CusMatMulCubeFraczLeftCast(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init CusMatMulCubeFraczLeftCast""" | |||||
| """Initialize CusMatMulCubeFraczLeftCast""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | ||||
| from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast | from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast | ||||
| @@ -483,7 +483,7 @@ class CusMatMulCubeFraczLeftCast(PrimitiveWithInfer): | |||||
| class Im2Col(PrimitiveWithInfer): | class Im2Col(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| extract image pathes from image. | |||||
| Extracts image patches from the image. | |||||
| The rank of input_x1 must be `4`, and the data_format is "NCHW". | The rank of input_x1 must be `4`, and the data_format is "NCHW". | ||||
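Assuming the usual convolution arithmetic (an assumption; the op's shape inference is authoritative), the number of patch positions per spatial dimension is:

    def patches_per_dim(size, kernel, pad=0, stride=1, dilation=1):
        # Standard convolution output-size formula; illustrative only.
        return (size + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1

    patches_per_dim(28, kernel=3, pad=1)  # 28 positions along one axis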
| @@ -504,7 +504,7 @@ class Im2Col(PrimitiveWithInfer): | |||||
| pad=0, | pad=0, | ||||
| stride=1, | stride=1, | ||||
| dilation=1): | dilation=1): | ||||
| """init Im2Col""" | |||||
| """Initialize Im2Col""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| self.kernel_size = _check_positive_int_or_tuple('kernel_size', kernel_size, self.name) | self.kernel_size = _check_positive_int_or_tuple('kernel_size', kernel_size, self.name) | ||||
| self.add_prim_attr('kernel_size', self.kernel_size) | self.add_prim_attr('kernel_size', self.kernel_size) | ||||
| @@ -564,7 +564,7 @@ class Im2Col(PrimitiveWithInfer): | |||||
| class UpdateThorGradient(PrimitiveWithInfer): | class UpdateThorGradient(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Update Thor Gradient with Approximate Fisher info matrix(for GPU backend). | |||||
| Updates the Thor gradient with the approximate Fisher information matrix (for the GPU backend). | |||||
| The rank of input_x1 must be `3`, which indicates the A matrix. | The rank of input_x1 must be `3`, which indicates the A matrix. | ||||
| The rank of input_x2 must be `2`, which indicates the 1st-order gradient. | The rank of input_x2 must be `2`, which indicates the 1st-order gradient. | ||||
| @@ -593,7 +593,7 @@ class UpdateThorGradient(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, split_dim=0): | def __init__(self, split_dim=0): | ||||
| """init UpdateThorGradient""" | |||||
| """Initialize UpdateThorGradient""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) | ||||
| self.split_dim = split_dim | self.split_dim = split_dim | ||||
| self.add_prim_attr('split_dim', self.split_dim) | self.add_prim_attr('split_dim', self.split_dim) | ||||
| @@ -416,7 +416,7 @@ class _AlltoAll(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, split_count, split_dim, concat_dim, group=GlobalComm.WORLD_COMM_GROUP): | def __init__(self, split_count, split_dim, concat_dim, group=GlobalComm.WORLD_COMM_GROUP): | ||||
| """init AlltoAll""" | |||||
| """Initialize AlltoAll""" | |||||
| validator.check_value_type('group', _get_group(group), (str,), self.name) | validator.check_value_type('group', _get_group(group), (str,), self.name) | ||||
| self.split_count = split_count | self.split_count = split_count | ||||
| self.split_dim = split_dim | self.split_dim = split_dim | ||||
| @@ -520,7 +520,7 @@ class _GetTensorSlice(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init ChunkTensor""" | |||||
| """Initialize ChunkTensor""" | |||||
| def infer_value(self, x, dev_mat, tensor_map): | def infer_value(self, x, dev_mat, tensor_map): | ||||
| from mindspore.parallel._tensor import _load_tensor | from mindspore.parallel._tensor import _load_tensor | ||||
| @@ -27,8 +27,8 @@ class ControlDepend(Primitive): | |||||
| In many cases, we need to control the execution order of operations. ControlDepend is designed for this. | In many cases, we need to control the execution order of operations. ControlDepend is designed for this. | ||||
| ControlDepend will instruct the execution engine to run the operations in a specific order. ControlDepend | ControlDepend will instruct the execution engine to run the operations in a specific order. ControlDepend | ||||
| tells the engine that the destination operations should depend on the source operation which means the source | |||||
| operations should be executed before the destination. | |||||
| tells the engine that the destination operations must depend on the source operations, which means the source | |||||
| operations must be executed before the destination. | |||||
| Note: | Note: | ||||
| This operation does not work in `PYNATIVE_MODE`. | This operation does not work in `PYNATIVE_MODE`. | ||||
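A typical graph-mode usage, sketched in the docstrings' own doctest style (names such as `P.Softmax` follow the library convention; treat this as an illustration, not the canonical example):

    >>> class Net(nn.Cell):
    >>>     def __init__(self):
    >>>         super(Net, self).__init__()
    >>>         self.control_depend = P.ControlDepend()
    >>>         self.softmax = P.Softmax()
    >>>
    >>>     def construct(self, x, y):
    >>>         mul = x * y
    >>>         softmax = self.softmax(x)
    >>>         depend = self.control_depend(mul, softmax)
    >>>         return depend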
| @@ -86,7 +86,7 @@ class GeSwitch(PrimitiveWithInfer): | |||||
| Inputs: | Inputs: | ||||
| - **data** (Union[Tensor, Number]) - The data to be used for switch control. | - **data** (Union[Tensor, Number]) - The data to be used for switch control. | ||||
| - **pred** (Tensor) - It should be a scalar whose type is bool and shape is `()`, It is used as condition for | |||||
| - **pred** (Tensor) - It must be a scalar whose type is bool and shape is `()`. It is used as the condition for | |||||
| switch control. | switch control. | ||||
| Outputs: | Outputs: | ||||
| tuple. Output is tuple(false_output, true_output). The elements in the tuple have the same shape as the input data. | tuple. Output is tuple(false_output, true_output). The elements in the tuple have the same shape as the input data. | ||||
| @@ -142,10 +142,10 @@ class Merge(PrimitiveWithInfer): | |||||
| """ | """ | ||||
| Merges all input data to one. | Merges all input data to one. | ||||
| One and only one of the inputs should be selected as the output | |||||
| One and only one of the inputs must be selected as the output. | |||||
| Inputs: | Inputs: | ||||
| - **inputs** (Union(Tuple, List)) - The data to be merged. All tuple elements should have the same data type. | |||||
| - **inputs** (Union(Tuple, List)) - The data to be merged. All tuple elements must have the same data type. | |||||
| Outputs: | Outputs: | ||||
| tuple. Output is tuple(`data`, `output_index`). The `data` has the same shape as the elements of `inputs`. | tuple. Output is tuple(`data`, `output_index`). The `data` has the same shape as the elements of `inputs`. | ||||
| @@ -22,7 +22,7 @@ from ..primitive import prim_attr_register, PrimitiveWithInfer, Primitive | |||||
| def _check_summary_param(name, value, class_name): | def _check_summary_param(name, value, class_name): | ||||
| """Check the name and value is valid for summary.""" | |||||
| """Checks the name and value is valid for summary.""" | |||||
| n_type = name['dtype'] | n_type = name['dtype'] | ||||
| n_value = name['value'] | n_value = name['value'] | ||||
| validator.check_value_type('name', n_type, [type(mstype.string)], class_name) | validator.check_value_type('name', n_type, [type(mstype.string)], class_name) | ||||
| @@ -42,11 +42,11 @@ SUMMARY_RETURN_VALUE = {'dtype': mstype.int32, 'shape': [1], 'value': None} | |||||
| class ScalarSummary(PrimitiveWithInfer): | class ScalarSummary(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Output a scalar to a protocol buffer through a scalar summary operator. | |||||
| Outputs a scalar to a protocol buffer through a scalar summary operator. | |||||
| Inputs: | Inputs: | ||||
| - **name** (str) - The name of the input variable, it should not be an empty string. | |||||
| - **value** (Tensor) - The value of scalar, and the shape of value should be [] or [1]. | |||||
| - **name** (str) - The name of the input variable, it must not be an empty string. | |||||
| - **value** (Tensor) - The value of scalar, and the shape of value must be [] or [1]. | |||||
| Examples: | Examples: | ||||
| >>> class SummaryDemo(nn.Cell): | >>> class SummaryDemo(nn.Cell): | ||||
| @@ -80,11 +80,11 @@ class ScalarSummary(PrimitiveWithInfer): | |||||
| class ImageSummary(PrimitiveWithInfer): | class ImageSummary(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Output image tensor to protocol buffer through image summary operator. | |||||
| Outputs an image tensor to a protocol buffer through an image summary operator. | |||||
| Inputs: | Inputs: | ||||
| - **name** (str) - The name of the input variable, it should not be an empty string. | |||||
| - **value** (Tensor) - The value of image, the rank of tensor should be 4. | |||||
| - **name** (str) - The name of the input variable, it must not be an empty string. | |||||
| - **value** (Tensor) - The value of image, the rank of tensor must be 4. | |||||
| Examples: | Examples: | ||||
| >>> class Net(nn.Cell): | >>> class Net(nn.Cell): | ||||
| @@ -117,11 +117,11 @@ class ImageSummary(PrimitiveWithInfer): | |||||
| class TensorSummary(PrimitiveWithInfer): | class TensorSummary(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Output a tensor to a protocol buffer through a tensor summary operator. | |||||
| Outputs a tensor to a protocol buffer through a tensor summary operator. | |||||
| Inputs: | Inputs: | ||||
| - **name** (str) - The name of the input variable. | - **name** (str) - The name of the input variable. | ||||
| - **value** (Tensor) - The value of tensor, and the rank of tensor should be greater than 0. | |||||
| - **value** (Tensor) - The value of tensor, and the rank of tensor must be greater than 0. | |||||
| Examples: | Examples: | ||||
| >>> class SummaryDemo(nn.Cell): | >>> class SummaryDemo(nn.Cell): | ||||
| @@ -155,11 +155,11 @@ class TensorSummary(PrimitiveWithInfer): | |||||
| class HistogramSummary(PrimitiveWithInfer): | class HistogramSummary(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Output tensor to protocol buffer through histogram summary operator. | |||||
| Outputs a tensor to a protocol buffer through a histogram summary operator. | |||||
| Inputs: | Inputs: | ||||
| - **name** (str) - The name of the input variable. | - **name** (str) - The name of the input variable. | ||||
| - **value** (Tensor) - The value of tensor, and the rank of tensor should be greater than 0. | |||||
| - **value** (Tensor) - The value of tensor, and the rank of tensor must be greater than 0. | |||||
| Examples: | Examples: | ||||
| >>> class SummaryDemo(nn.Cell): | >>> class SummaryDemo(nn.Cell): | ||||
| @@ -193,7 +193,7 @@ class HistogramSummary(PrimitiveWithInfer): | |||||
| class InsertGradientOf(PrimitiveWithInfer): | class InsertGradientOf(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Attach callback to graph node that will be invoked on the node's gradient. | |||||
| Attaches a callback to a graph node; the callback will be invoked on the node's gradient. | |||||
| Args: | Args: | ||||
| f (Function): MindSpore's Function. Callback function. | f (Function): MindSpore's Function. Callback function. | ||||
| @@ -252,7 +252,7 @@ class HookBackward(PrimitiveWithInfer): | |||||
| is only supported in Pynative Mode. | is only supported in Pynative Mode. | ||||
| Note: | Note: | ||||
| The hook function should be defined like `hook_fn(grad) -> Tensor or None`, | |||||
| The hook function must be defined like `hook_fn(grad) -> Tensor or None`, | |||||
| where grad is the gradient passed to the primitive and gradient may be | where grad is the gradient passed to the primitive and gradient may be | ||||
| modified and passed to the next primitive. The difference between a hook function and | modified and passed to the next primitive. The difference between a hook function and | ||||
| callback of InsertGradientOf is that a hook function is executed in the python | callback of InsertGradientOf is that a hook function is executed in the python | ||||
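Following the `hook_fn(grad) -> Tensor or None` signature stated above, a minimal hook might look like this (the scaling is purely illustrative):

    def hook_fn(grad):
        # Inspect the incoming gradient; return a Tensor to replace it,
        # or None to leave it unchanged.
        print("grad:", grad)
        return grad * 2  # illustrative modification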
| @@ -305,7 +305,7 @@ class HookBackward(PrimitiveWithInfer): | |||||
| class Print(PrimitiveWithInfer): | class Print(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Output tensor or string to stdout. | |||||
| Outputs tensor or string to stdout. | |||||
| Note: | Note: | ||||
| In pynative mode, please use python print function. | In pynative mode, please use python print function. | ||||
| @@ -344,7 +344,7 @@ class Print(PrimitiveWithInfer): | |||||
| class Debug(Primitive): | class Debug(Primitive): | ||||
| """ | """ | ||||
| Print tensor value. | |||||
| Prints tensor value. | |||||
| Inputs: | Inputs: | ||||
| - **value** (Tensor) - The value of tensor. | - **value** (Tensor) - The value of tensor. | ||||
| @@ -395,7 +395,7 @@ class Assert(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, summarize=3): | def __init__(self, summarize=3): | ||||
| """init Assert""" | |||||
| """Initialize Assert""" | |||||
| self.summarize = validator.check_value_type("summarize", summarize, [int], self.name) | self.summarize = validator.check_value_type("summarize", summarize, [int], self.name) | ||||
| def infer_shape(self, condition, inputs): | def infer_shape(self, condition, inputs): | ||||
| @@ -26,7 +26,7 @@ class CropAndResize(PrimitiveWithInfer): | |||||
| Extracts crops from the input image tensor and resizes them. | Extracts crops from the input image tensor and resizes them. | ||||
| Note: | Note: | ||||
| In case that the output shape depends on crop_size, the crop_size should be constant. | |||||
| When the output shape depends on crop_size, the crop_size must be constant. | |||||
| Args: | Args: | ||||
| method (str): An optional string that specifies the sampling method for resizing. | method (str): An optional string that specifies the sampling method for resizing. | ||||
| @@ -79,7 +79,7 @@ class CropAndResize(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, method="bilinear", extrapolation_value=0.0): | def __init__(self, method="bilinear", extrapolation_value=0.0): | ||||
| """init CropAndResize""" | |||||
| """Initialize CropAndResize""" | |||||
| self.init_prim_io_names(inputs=['x', 'boxes', 'box_index', 'crop_size'], outputs=['y']) | self.init_prim_io_names(inputs=['x', 'boxes', 'box_index', 'crop_size'], outputs=['y']) | ||||
| validator.check_value_type("method", method, [str], self.name) | validator.check_value_type("method", method, [str], self.name) | ||||
| validator.check_string("method", method, ["bilinear", "nearest", "bilinear_v2"], self.name) | validator.check_string("method", method, ["bilinear", "nearest", "bilinear_v2"], self.name) | ||||
| @@ -70,7 +70,7 @@ class _BinaryOp(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init _BinaryOp""" | |||||
| """Initialize _BinaryOp""" | |||||
| self.init_prim_io_names(inputs=['x', 'y'], outputs=['output']) | self.init_prim_io_names(inputs=['x', 'y'], outputs=['output']) | ||||
| def infer_shape(self, x_shape, y_shape): | def infer_shape(self, x_shape, y_shape): | ||||
| @@ -99,7 +99,7 @@ class _BitwiseBinaryOp(_MathBinaryOp): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init _BitwiseBinaryOp""" | |||||
| """Initialize _BitwiseBinaryOp""" | |||||
| self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) | ||||
| @staticmethod | @staticmethod | ||||
| @@ -166,7 +166,7 @@ class AssignAdd(PrimitiveWithInfer): | |||||
| Inputs: | Inputs: | ||||
| - **variable** (Parameter) - The `Parameter`. | - **variable** (Parameter) - The `Parameter`. | ||||
| - **value** (Union[numbers.Number, Tensor]) - The value to be added to the `variable`. | - **value** (Union[numbers.Number, Tensor]) - The value to be added to the `variable`. | ||||
| It should have the same shape as `variable` if it is a Tensor. | |||||
| It must have the same shape as `variable` if it is a Tensor. | |||||
| Examples: | Examples: | ||||
| >>> class Net(Cell): | >>> class Net(Cell): | ||||
| @@ -190,7 +190,7 @@ class AssignAdd(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init AssignAdd""" | |||||
| """Initialize AssignAdd""" | |||||
| self.init_prim_io_names(inputs=['ref', 'value'], outputs=['output']) | self.init_prim_io_names(inputs=['ref', 'value'], outputs=['output']) | ||||
| def infer_shape(self, variable, value): | def infer_shape(self, variable, value): | ||||
| @@ -216,7 +216,7 @@ class AssignSub(PrimitiveWithInfer): | |||||
| Inputs: | Inputs: | ||||
| - **variable** (Parameter) - The `Parameter`. | - **variable** (Parameter) - The `Parameter`. | ||||
| - **value** (Union[numbers.Number, Tensor]) - The value to be subtracted from the `variable`. | - **value** (Union[numbers.Number, Tensor]) - The value to be subtracted from the `variable`. | ||||
| It should have the same shape as `variable` if it is a Tensor. | |||||
| It must have the same shape as `variable` if it is a Tensor. | |||||
| Examples: | Examples: | ||||
| >>> class Net(Cell): | >>> class Net(Cell): | ||||
| @@ -241,7 +241,7 @@ class AssignSub(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init AssignSub""" | |||||
| """Initialize AssignSub""" | |||||
| def infer_shape(self, variable, value): | def infer_shape(self, variable, value): | ||||
| return value | return value | ||||
| @@ -257,8 +257,8 @@ class _Reduce(PrimitiveWithInfer): | |||||
| Definition of base class of reduction class operators. | Definition of base class of reduction class operators. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. | |||||
| """ | """ | ||||
| __mindspore_signature__ = ( | __mindspore_signature__ = ( | ||||
| @@ -268,7 +268,7 @@ class _Reduce(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, keep_dims=False): | def __init__(self, keep_dims=False): | ||||
| """init Reduce""" | |||||
| """Initialize Reduce""" | |||||
| validator.check_value_type('keep_dims', keep_dims, [bool], self.name) | validator.check_value_type('keep_dims', keep_dims, [bool], self.name) | ||||
| self.init_prim_io_names(inputs=['input_x', 'axis'], outputs=['y']) | self.init_prim_io_names(inputs=['input_x', 'axis'], outputs=['y']) | ||||
| self.add_prim_attr("io_format", "ND") | self.add_prim_attr("io_format", "ND") | ||||
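The `keep_dims` flag described above mirrors NumPy's `keepdims`; for illustration:

    import numpy as np

    x = np.ones((2, 3))
    np.sum(x, axis=1).shape                 # (2,)    keep_dims=False
    np.sum(x, axis=1, keepdims=True).shape  # (2, 1)  keep_dims=True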
| @@ -320,8 +320,8 @@ class ReduceMean(_Reduce): | |||||
| The dtype of the tensor to be reduced is number. | The dtype of the tensor to be reduced is number. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. Default: False. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. Default: False. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor[Number]) - The input tensor. | - **input_x** (Tensor[Number]) - The input tensor. | ||||
| @@ -352,8 +352,8 @@ class ReduceSum(_Reduce): | |||||
| The dtype of the tensor to be reduced is number. | The dtype of the tensor to be reduced is number. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. Default: False. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. Default: False. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor[Number]) - The input tensor. | - **input_x** (Tensor[Number]) - The input tensor. | ||||
| @@ -378,7 +378,7 @@ class ReduceSum(_Reduce): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, keep_dims=False): | def __init__(self, keep_dims=False): | ||||
| """init ReduceSum""" | |||||
| """Initialize ReduceSum""" | |||||
| super(ReduceSum, self).__init__(keep_dims) | super(ReduceSum, self).__init__(keep_dims) | ||||
| self.__setattr_flag__ = True | self.__setattr_flag__ = True | ||||
| @@ -390,8 +390,8 @@ class ReduceAll(_Reduce): | |||||
| The dtype of the tensor to be reduced is bool. | The dtype of the tensor to be reduced is bool. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. | |||||
| Default : False, don't keep these reduced dimensions. | Default : False, don't keep these reduced dimensions. | ||||
| Inputs: | Inputs: | ||||
| @@ -426,8 +426,8 @@ class ReduceAny(_Reduce): | |||||
| The dtype of the tensor to be reduced is bool. | The dtype of the tensor to be reduced is bool. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. | |||||
| Default : False, don't keep these reduced dimensions. | Default : False, don't keep these reduced dimensions. | ||||
| Inputs: | Inputs: | ||||
| @@ -462,8 +462,8 @@ class ReduceMax(_Reduce): | |||||
| The dtype of the tensor to be reduced is number. | The dtype of the tensor to be reduced is number. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. | |||||
| Default : False, don't keep these reduced dimensions. | Default : False, don't keep these reduced dimensions. | ||||
| Inputs: | Inputs: | ||||
| @@ -501,8 +501,8 @@ class ReduceMin(_Reduce): | |||||
| The dtype of the tensor to be reduced is number. | The dtype of the tensor to be reduced is number. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. | |||||
| Default : False, don't keep these reduced dimensions. | Default : False, don't keep these reduced dimensions. | ||||
| Inputs: | Inputs: | ||||
| @@ -534,8 +534,8 @@ class ReduceProd(_Reduce): | |||||
| The dtype of the tensor to be reduced is number. | The dtype of the tensor to be reduced is number. | ||||
| Args: | Args: | ||||
| keep_dims (bool): If True, keep these reduced dimensions and the length is 1. | |||||
| If False, don't keep these dimensions. | |||||
| keep_dims (bool): If true, keep these reduced dimensions and the length is 1. | |||||
| If false, don't keep these dimensions. | |||||
| Default : False, don't keep these reduced dimensions. | Default : False, don't keep these reduced dimensions. | ||||
| Inputs: | Inputs: | ||||
| @@ -565,8 +565,8 @@ class CumProd(PrimitiveWithInfer): | |||||
| Compute the cumulative product of the tensor x along axis. | Compute the cumulative product of the tensor x along axis. | ||||
| Args: | Args: | ||||
| exclusive (bool): If True, perform exclusive cumulative product. Default: False. | |||||
| reverse (bool): If True, reverse the result along axis. Default: False | |||||
| exclusive (bool): If true, perform exclusive cumulative product. Default: False. | |||||
| reverse (bool): If true, reverse the result along the axis. Default: False. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor[Number]) - The input tensor. | - **input_x** (Tensor[Number]) - The input tensor. | ||||
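The `exclusive` and `reverse` flags follow the usual cumulative-op semantics (assumed here to match the common definition); in NumPy terms:

    import numpy as np

    x = np.array([1., 2., 3., 4.])
    np.cumprod(x)                               # [ 1.  2.  6. 24.]  default
    np.cumprod(np.concatenate(([1.], x[:-1])))  # [1. 1. 2. 6.]      exclusive=True
    np.cumprod(x[::-1])[::-1]                   # [24. 24. 12.  4.]  reverse=True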
| @@ -616,14 +616,14 @@ class MatMul(PrimitiveWithInfer): | |||||
| The rank of input tensors must be `2`. | The rank of input tensors must be `2`. | ||||
| Args: | Args: | ||||
| transpose_a (bool): If True, `a` is transposed before multiplication. Default: False. | |||||
| transpose_b (bool): If True, `b` is transposed before multiplication. Default: False. | |||||
| transpose_a (bool): If true, `a` is transposed before multiplication. Default: False. | |||||
| transpose_b (bool): If true, `b` is transposed before multiplication. Default: False. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(N, C)`. If | - **input_x** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(N, C)`. If | ||||
| `transpose_a` is True, its shape should be :math:`(N, C)` after transposing. | |||||
| `transpose_a` is True, its shape must be :math:`(N, C)` after transposing. | |||||
| - **input_y** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(C, M)`. If | - **input_y** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(C, M)`. If | ||||
| `transpose_b` is True, its shape should be :math:`(C, M)` after transpose. | |||||
| `transpose_b` is True, its shape must be :math:`(C, M)` after transposing. | |||||
| Outputs: | Outputs: | ||||
| Tensor, the shape of the output tensor is :math:`(N, M)`. | Tensor, the shape of the output tensor is :math:`(N, M)`. | ||||
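The transpose flags are equivalent to transposing the corresponding operand before the product; a NumPy sketch:

    import numpy as np

    def matmul(a, b, transpose_a=False, transpose_b=False):
        # Optionally transpose each operand, then multiply.
        a = a.T if transpose_a else a
        b = b.T if transpose_b else b
        return a @ b

    matmul(np.ones((3, 4)), np.ones((5, 4)), transpose_b=True).shape  # (3, 5)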
| @@ -690,17 +690,17 @@ class BatchMatMul(MatMul): | |||||
| The two input tensors must have the same rank, and the rank must not be less than `3`. | The two input tensors must have the same rank, and the rank must not be less than `3`. | ||||
| Args: | Args: | ||||
| transpose_a (bool): If True, the last two dimensions of `a` is transposed before multiplication. | |||||
| transpose_a (bool): If true, the last two dimensions of `a` are transposed before multiplication. | |||||
| Default: False. | Default: False. | ||||
| transpose_b (bool): If True, the last two dimensions of `b` is transposed before multiplication. | |||||
| transpose_b (bool): If true, the last two dimensions of `b` are transposed before multiplication. | |||||
| Default: False. | Default: False. | ||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(*B, N, C)`, | - **input_x** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(*B, N, C)`, | ||||
| where :math:`*B` represents the batch size which can be multidimensional, :math:`N` and :math:`C` are the | where :math:`*B` represents the batch size which can be multidimensional, :math:`N` and :math:`C` are the | ||||
| size of the last two dimensions. If `transpose_a` is True, its shape should be :math:`(*B, C, N)`. | |||||
| size of the last two dimensions. If `transpose_a` is True, its shape must be :math:`(*B, C, N)`. | |||||
| - **input_y** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(*B, C, M)`. If | - **input_y** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(*B, C, M)`. If | ||||
| `transpose_b` is True, its shape should be :math:`(*B, M, C)`. | |||||
| `transpose_b` is True, its shape must be :math:`(*B, M, C)`. | |||||
| Outputs: | Outputs: | ||||
| Tensor, the shape of the output tensor is :math:`(*B, N, M)`. | Tensor, the shape of the output tensor is :math:`(*B, N, M)`. | ||||
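In NumPy terms, the batched product multiplies over the last two axes while the leading batch dimensions are carried along:

    import numpy as np

    a = np.ones((16, 2, 3, 4))  # (*B, N, C) with *B = (16, 2)
    b = np.ones((16, 2, 4, 5))  # (*B, C, M)
    np.matmul(a, b).shape       # (16, 2, 3, 5) = (*B, N, M)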
| @@ -735,8 +735,8 @@ class CumSum(PrimitiveWithInfer): | |||||
| Computes the cumulative sum of input tensor along axis. | Computes the cumulative sum of input tensor along axis. | ||||
| Args: | Args: | ||||
| exclusive (bool): If True, perform exclusive mode. Default: False. | |||||
| reverse (bool): If True, perform inverse cumulative sum. Default: False. | |||||
| exclusive (bool): If true, perform exclusive mode. Default: False. | |||||
| reverse (bool): If true, perform inverse cumulative sum. Default: False. | |||||
| Inputs: | Inputs: | ||||
| - **input** (Tensor) - The input tensor to accumulate. | - **input** (Tensor) - The input tensor to accumulate. | ||||
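The flags mirror those of CumProd above; a NumPy illustration of both:

    import numpy as np

    x = np.array([1., 2., 3., 4.])
    np.cumsum(x)                               # [ 1.  3.  6. 10.]  default
    np.concatenate(([0.], np.cumsum(x)[:-1]))  # [0. 1. 3. 6.]      exclusive=True
    np.cumsum(x[::-1])[::-1]                   # [10.  9.  7.  4.]  reverse=True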
| @@ -758,7 +758,7 @@ class CumSum(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, exclusive=False, reverse=False): | def __init__(self, exclusive=False, reverse=False): | ||||
| """init cumsum""" | |||||
| """Initialize cumsum""" | |||||
| cls_name = self.name | cls_name = self.name | ||||
| validator.check_value_type('exclusive', exclusive, [bool], cls_name) | validator.check_value_type('exclusive', exclusive, [bool], cls_name) | ||||
| validator.check_value_type('reverse', reverse, [bool], cls_name) | validator.check_value_type('reverse', reverse, [bool], cls_name) | ||||
| @@ -781,7 +781,7 @@ class AddN(PrimitiveWithInfer): | |||||
| """ | """ | ||||
| Computes addition of all input tensors element-wise. | Computes addition of all input tensors element-wise. | ||||
| All input tensors should have the same shape. | |||||
| All input tensors must have the same shape. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Union(tuple[Tensor], list[Tensor])) - The input tuple or list | - **input_x** (Union(tuple[Tensor], list[Tensor])) - The input tuple or list | ||||
| @@ -932,7 +932,7 @@ class Neg(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Neg""" | |||||
| """Initialize Neg""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['y']) | self.init_prim_io_names(inputs=['x'], outputs=['y']) | ||||
| def infer_shape(self, input_x): | def infer_shape(self, input_x): | ||||
| @@ -980,7 +980,7 @@ class InplaceAdd(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, indices): | def __init__(self, indices): | ||||
| """init InplaceAdd""" | |||||
| """Initialize InplaceAdd""" | |||||
| self.init_prim_io_names(inputs=['x', 'v'], outputs=['y']) | self.init_prim_io_names(inputs=['x', 'v'], outputs=['y']) | ||||
| self.indices = indices | self.indices = indices | ||||
| validator.check_value_type('indices', indices, [tuple, int], self.name) | validator.check_value_type('indices', indices, [tuple, int], self.name) | ||||
| @@ -1038,7 +1038,7 @@ class InplaceSub(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, indices): | def __init__(self, indices): | ||||
| """init InplaceSub""" | |||||
| """Initialize InplaceSub""" | |||||
| self.init_prim_io_names(inputs=['x', 'v'], outputs=['y']) | self.init_prim_io_names(inputs=['x', 'v'], outputs=['y']) | ||||
| self.indices = indices | self.indices = indices | ||||
| validator.check_value_type('indices', indices, [tuple, int], self.name) | validator.check_value_type('indices', indices, [tuple, int], self.name) | ||||
| @@ -1198,7 +1198,7 @@ class Square(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Square""" | |||||
| """Initialize Square""" | |||||
| self.init_prim_io_names(inputs=['input_x'], outputs=['output']) | self.init_prim_io_names(inputs=['input_x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -1222,7 +1222,7 @@ class Rsqrt(PrimitiveWithInfer): | |||||
| Computes reciprocal of square root of input tensor element-wise. | Computes reciprocal of square root of input tensor element-wise. | ||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) - The input of Rsqrt. Each element should be a non-negative number. | |||||
| - **input_x** (Tensor) - The input of Rsqrt. Each element must be a non-negative number. | |||||
| Outputs: | Outputs: | ||||
| Tensor, has the same type and shape as `input_x`. | Tensor, has the same type and shape as `input_x`. | ||||
| @@ -1236,7 +1236,7 @@ class Rsqrt(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Rsqrt""" | |||||
| """Initialize Rsqrt""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -1274,7 +1274,7 @@ class Sqrt(PrimitiveWithCheck): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Sqrt""" | |||||
| """Initialize Sqrt""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| def check_dtype(self, x_type): | def check_dtype(self, x_type): | ||||
| @@ -1308,7 +1308,7 @@ class Reciprocal(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Reciprocal""" | |||||
| """Initialize Reciprocal""" | |||||
| if context.get_context("device_target") == "GPU": | if context.get_context("device_target") == "GPU": | ||||
| self.target = "GPU" | self.target = "GPU" | ||||
| else: | else: | ||||
| @@ -1395,7 +1395,7 @@ class Exp(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Exp""" | |||||
| """Initialize Exp""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['y']) | self.init_prim_io_names(inputs=['x'], outputs=['y']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -1433,7 +1433,7 @@ class Expm1(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Exp""" | |||||
| """Initialize Exp""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['y']) | self.init_prim_io_names(inputs=['x'], outputs=['y']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -1576,7 +1576,7 @@ class Erf(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Erf""" | |||||
| """Initialize Erf""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['y']) | self.init_prim_io_names(inputs=['x'], outputs=['y']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -1606,7 +1606,7 @@ class Erfc(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Erfc""" | |||||
| """Initialize Erfc""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['y']) | self.init_prim_io_names(inputs=['x'], outputs=['y']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -1750,7 +1750,7 @@ class Div(_MathBinaryOp): | |||||
| a bool or a tensor whose data type is number or bool. | a bool or a tensor whose data type is number or bool. | ||||
| - **input_y** (Union[Tensor, Number, bool]) - When the first input is a tensor, the second input | - **input_y** (Union[Tensor, Number, bool]) - When the first input is a tensor, the second input | ||||
| could be a number, a bool, or a tensor whose data type is number or bool. When the first input | could be a number, a bool, or a tensor whose data type is number or bool. When the first input | ||||
| is a number or a bool, the second input should be a tensor whose data type is number or bool. | |||||
| is a number or a bool, the second input must be a tensor whose data type is number or bool. | |||||
| Outputs: | Outputs: | ||||
| Tensor, the shape is the same as the one after broadcasting, | Tensor, the shape is the same as the one after broadcasting, | ||||
| @@ -1923,7 +1923,7 @@ class Mod(_MathBinaryOp): | |||||
| - **input_x** (Union[Tensor, Number]) - The first input is a number or a tensor whose data type is number. | - **input_x** (Union[Tensor, Number]) - The first input is a number or a tensor whose data type is number. | ||||
| - **input_y** (Union[Tensor, Number]) - When the first input is a tensor, the second input | - **input_y** (Union[Tensor, Number]) - When the first input is a tensor, the second input | ||||
| could be a number or a tensor whose data type is number. When the first input is a number, | could be a number or a tensor whose data type is number. When the first input is a number, | ||||
| the second input should be a tensor whose data type is number. | |||||
| the second input must be a tensor whose data type is number. | |||||
| Outputs: | Outputs: | ||||
| Tensor, the shape is the same as the one after broadcasting, | Tensor, the shape is the same as the one after broadcasting, | ||||
| @@ -2121,7 +2121,7 @@ class Acosh(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Acosh""" | |||||
| """Initialize Acosh""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -2150,7 +2150,7 @@ class Cosh(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Cosh""" | |||||
| """Initialize Cosh""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -2179,7 +2179,7 @@ class Asinh(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Asinh""" | |||||
| """Initialize Asinh""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -2208,7 +2208,7 @@ class Sinh(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Sinh""" | |||||
| """Initialize Sinh""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -2297,7 +2297,7 @@ class ApproximateEqual(_LogicBinaryOp): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, tolerance=1e-05): | def __init__(self, tolerance=1e-05): | ||||
| """Init ApproximateEqual""" | |||||
| """Initialize ApproximateEqual""" | |||||
| validator.check_value_type("tolerance", tolerance, [float], self.name) | validator.check_value_type("tolerance", tolerance, [float], self.name) | ||||
| def infer_shape(self, x_shape, y_shape): | def infer_shape(self, x_shape, y_shape): | ||||
| @@ -2315,7 +2315,7 @@ class EqualCount(PrimitiveWithInfer): | |||||
| """ | """ | ||||
| Computes the number of equal elements in the two tensors. | Computes the number of equal elements in the two tensors. | ||||
| The two input tensors should have the same data type and shape. | |||||
| The two input tensors must have the same data type and shape. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) - The first input tensor. | - **input_x** (Tensor) - The first input tensor. | ||||
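The described semantics reduce to an element-wise comparison followed by a count; a NumPy equivalent:

    import numpy as np

    x = np.array([1, 2, 3])
    y = np.array([1, 4, 3])
    np.array([(x == y).sum()])  # [2]: the count of equal positions, as a one-element array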
| @@ -2334,7 +2334,7 @@ class EqualCount(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init EqualCount""" | |||||
| """Initialize EqualCount""" | |||||
| self.init_prim_io_names(inputs=['x', 'y'], outputs=['output']) | self.init_prim_io_names(inputs=['x', 'y'], outputs=['output']) | ||||
| def infer_shape(self, x_shape, y_shape): | def infer_shape(self, x_shape, y_shape): | ||||
| @@ -2550,7 +2550,7 @@ class LogicalNot(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init LogicalNot""" | |||||
| """Initialize LogicalNot""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -2568,9 +2568,9 @@ class LogicalAnd(_LogicBinaryOp): | |||||
| Inputs of `input_x` and `input_y` comply with the implicit type conversion rules to make the data types consistent. | Inputs of `input_x` and `input_y` comply with the implicit type conversion rules to make the data types consistent. | ||||
| The inputs must be two tensors or one tensor and one bool. | The inputs must be two tensors or one tensor and one bool. | ||||
| When the inputs are two tensors, the shapes of them could be broadcast, | When the inputs are two tensors, the shapes of them could be broadcast, | ||||
| and the data types of them should be bool. | |||||
| and the data types of them must be bool. | |||||
| When the inputs are one tensor and one bool, the bool object could only be a constant, | When the inputs are one tensor and one bool, the bool object could only be a constant, | ||||
| and the data type of the tensor should be bool. | |||||
| and the data type of the tensor must be bool. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Union[Tensor, bool]) - The first input is a bool or a tensor whose data type is bool. | - **input_x** (Union[Tensor, bool]) - The first input is a bool or a tensor whose data type is bool. | ||||
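The broadcasting behaviour described above matches NumPy's `logical_and`; for illustration:

    import numpy as np

    x = np.array([True, False, True])
    y = np.array([True, True, False])
    np.logical_and(x, y)     # [ True False False]
    np.logical_and(x, True)  # broadcast with a constant bool: [ True False  True]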
| @@ -2599,9 +2599,9 @@ class LogicalOr(_LogicBinaryOp): | |||||
| Inputs of `input_x` and `input_y` comply with the implicit type conversion rules to make the data types consistent. | Inputs of `input_x` and `input_y` comply with the implicit type conversion rules to make the data types consistent. | ||||
| The inputs must be two tensors or one tensor and one bool. | The inputs must be two tensors or one tensor and one bool. | ||||
| When the inputs are two tensors, the shapes of them could be broadcast, | When the inputs are two tensors, the shapes of them could be broadcast, | ||||
| and the data types of them should be bool. | |||||
| and the data types of them must be bool. | |||||
| When the inputs are one tensor and one bool, the bool object could only be a constant, | When the inputs are one tensor and one bool, the bool object could only be a constant, | ||||
| and the data type of the tensor should be bool. | |||||
| and the data type of the tensor must be bool. | |||||
| Inputs: | Inputs: | ||||
| - **input_x** (Union[Tensor, bool]) - The first input is a bool or a tensor whose data type is bool. | - **input_x** (Union[Tensor, bool]) - The first input is a bool or a tensor whose data type is bool. | ||||
| @@ -2641,7 +2641,7 @@ class IsNan(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init IsNan""" | |||||
| """Initialize IsNan""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -2669,7 +2669,7 @@ class IsInf(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init IsInf""" | |||||
| """Initialize IsInf""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -2698,7 +2698,7 @@ class IsFinite(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init IsFinite""" | |||||
| """Initialize IsFinite""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -2729,7 +2729,7 @@ class FloatStatus(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init FloatStatus""" | |||||
| """Initialize FloatStatus""" | |||||
| self.init_prim_io_names(inputs=['x'], outputs=['output']) | self.init_prim_io_names(inputs=['x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -2760,7 +2760,7 @@ class NPUAllocFloatStatus(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init NPUAllocFloatStatus""" | |||||
| """Initialize NPUAllocFloatStatus""" | |||||
| self.add_prim_attr("_side_effect_flag", True) | self.add_prim_attr("_side_effect_flag", True) | ||||
| def infer_shape(self): | def infer_shape(self): | ||||
| @@ -2795,7 +2795,7 @@ class NPUGetFloatStatus(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init NPUGetFloatStatus""" | |||||
| """Initialize NPUGetFloatStatus""" | |||||
| self.add_prim_attr("_side_effect_flag", True) | self.add_prim_attr("_side_effect_flag", True) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -2838,7 +2838,7 @@ class NPUClearFloatStatus(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init NPUClearFloatStatus""" | |||||
| """Initialize NPUClearFloatStatus""" | |||||
| self.add_prim_attr("_side_effect_flag", True) | self.add_prim_attr("_side_effect_flag", True) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -2870,7 +2870,7 @@ class Cos(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Cos""" | |||||
| """Initialize Cos""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -2898,7 +2898,7 @@ class ACos(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init ACos""" | |||||
| """Initialize ACos""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -2926,7 +2926,7 @@ class Sin(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """Init Sin.""" | |||||
| """Initialize Sin.""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -2955,7 +2955,7 @@ class Asin(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Asin""" | |||||
| """Initialize Asin""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
| @@ -3006,7 +3006,7 @@ class NMSWithMask(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, iou_threshold=0.5): | def __init__(self, iou_threshold=0.5): | ||||
| """Init NMSWithMask""" | |||||
| """Initialize NMSWithMask""" | |||||
| validator.check_value_type("iou_threshold", iou_threshold, [float], self.name) | validator.check_value_type("iou_threshold", iou_threshold, [float], self.name) | ||||
| self.init_prim_io_names(inputs=['bboxes'], outputs=['selected_boxes', 'selected_idx', 'selected_mask']) | self.init_prim_io_names(inputs=['bboxes'], outputs=['selected_boxes', 'selected_idx', 'selected_mask']) | ||||
| self.is_ge = context.get_context("enable_ge") | self.is_ge = context.get_context("enable_ge") | ||||
| @@ -3043,7 +3043,7 @@ class Abs(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Abs""" | |||||
| """Initialize Abs""" | |||||
| self.init_prim_io_names(inputs=['input_x'], outputs=['output']) | self.init_prim_io_names(inputs=['input_x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -3115,7 +3115,7 @@ class Round(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Round""" | |||||
| """Initialize Round""" | |||||
| self.init_prim_io_names(inputs=['input_x'], outputs=['output']) | self.init_prim_io_names(inputs=['input_x'], outputs=['output']) | ||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| @@ -3131,7 +3131,7 @@ class Tan(PrimitiveWithInfer): | |||||
| Computes tangent of `input_x` element-wise. | Computes tangent of `input_x` element-wise. | ||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. Data type should be | |||||
| - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. Data type must be | |||||
| float16, float32 or int32. | float16, float32 or int32. | ||||
| Outputs: | Outputs: | ||||
| @@ -3145,7 +3145,7 @@ class Tan(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Tan""" | |||||
| """Initialize Tan""" | |||||
| def infer_shape(self, x_shape): | def infer_shape(self, x_shape): | ||||
| return x_shape | return x_shape | ||||
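A short sketch of calling Tan on a float32 input, per the dtype constraint noted above (float16, float32 or int32); the expected values in the comment are approximate:
>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> tan = P.Tan()
>>> tan(Tensor(np.array([-1.0, 0.0, 1.0]), mindspore.float32))
>>> # elementwise tangent, roughly [-1.5574, 0.0, 1.5574]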
| @@ -3268,7 +3268,7 @@ class SquareSumAll(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init SquareSumAll""" | |||||
| """Initialize SquareSumAll""" | |||||
| def infer_shape(self, x_shape, y_shape): | def infer_shape(self, x_shape, y_shape): | ||||
| validator.check("x1_shape", x_shape, "x2_shape", y_shape, Rel.EQ, self.name) | validator.check("x1_shape", x_shape, "x2_shape", y_shape, Rel.EQ, self.name) | ||||
| @@ -3366,7 +3366,7 @@ class BesselI0e(PrimitiveWithInfer): | |||||
| - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. | - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. | ||||
| Outputs: | Outputs: | ||||
| Tensor, has the same shape as `input_x`. Data type should be float16 or float32. | |||||
| Tensor, has the same shape as `input_x`. Data type must be float16 or float32. | |||||
| Examples: | Examples: | ||||
| >>> bessel_i0e = P.BesselI0e() | >>> bessel_i0e = P.BesselI0e() | ||||
| @@ -3377,7 +3377,7 @@ class BesselI0e(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init BesselI0e""" | |||||
| """Initialize BesselI0e""" | |||||
| def infer_shape(self, x): | def infer_shape(self, x): | ||||
| return x | return x | ||||
| @@ -3395,7 +3395,7 @@ class BesselI1e(PrimitiveWithInfer): | |||||
| - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. | - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. | ||||
| Outputs: | Outputs: | ||||
| Tensor, has the same shape as `input_x`. Data type should be float16 or float32. | |||||
| Tensor, has the same shape as `input_x`. Data type must be float16 or float32. | |||||
| Examples: | Examples: | ||||
| >>> bessel_i1e = P.BesselI1e() | >>> bessel_i1e = P.BesselI1e() | ||||
| @@ -3406,7 +3406,7 @@ class BesselI1e(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init BesselI1e""" | |||||
| """Initialize BesselI1e""" | |||||
| def infer_shape(self, x): | def infer_shape(self, x): | ||||
| return x | return x | ||||
| @@ -3494,7 +3494,7 @@ class Eps(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Eps""" | |||||
| """Initialize Eps""" | |||||
| self.init_prim_io_names(inputs=['input_x'], outputs=['y']) | self.init_prim_io_names(inputs=['input_x'], outputs=['y']) | ||||
| def __infer__(self, input_x): | def __infer__(self, input_x): | ||||
| @@ -23,7 +23,7 @@ from ..primitive import Primitive, PrimitiveWithCheck, PrimitiveWithInfer, prim_ | |||||
| class Assign(PrimitiveWithCheck): | class Assign(PrimitiveWithCheck): | ||||
| """ | """ | ||||
| Assign `Parameter` with a value. | |||||
| Assigns a value to the `Parameter`. | |||||
| Inputs of `variable` and `value` comply with the implicit type conversion rules to make the data types consistent. | Inputs of `variable` and `value` comply with the implicit type conversion rules to make the data types consistent. | ||||
| If they have different data types, lower priority data type will be converted to | If they have different data types, lower priority data type will be converted to | ||||
| @@ -67,7 +67,7 @@ class Assign(PrimitiveWithCheck): | |||||
| class BoundingBoxEncode(PrimitiveWithInfer): | class BoundingBoxEncode(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Encode bounding boxes locations. | |||||
| Encodes bounding box locations. | |||||
| Args: | Args: | ||||
| means (tuple): Means for encoding bounding boxes calculation. Default: (0.0, 0.0, 0.0, 0.0). | means (tuple): Means for encoding bounding boxes calculation. Default: (0.0, 0.0, 0.0, 0.0). | ||||
| @@ -118,7 +118,7 @@ class BoundingBoxEncode(PrimitiveWithInfer): | |||||
| class BoundingBoxDecode(PrimitiveWithInfer): | class BoundingBoxDecode(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Decode bounding boxes locations. | |||||
| Decodes bounding box locations. | |||||
| Args: | Args: | ||||
| means (tuple): The means of deltas calculation. Default: (0.0, 0.0, 0.0, 0.0). | means (tuple): The means of deltas calculation. Default: (0.0, 0.0, 0.0, 0.0). | ||||
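As a rough illustration of the encode/decode pair, a sketch assuming the default means/stds signature shown above; the box coordinates, `max_shape` and `wh_ratio_clip` values are illustrative:
>>> import mindspore
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> anchor = Tensor([[2.0, 2.0, 2.0, 3.0], [2.0, 2.0, 2.0, 3.0]], mindspore.float32)
>>> gt = Tensor([[1.0, 2.0, 1.0, 4.0], [1.0, 2.0, 1.0, 4.0]], mindspore.float32)
>>> encode = P.BoundingBoxEncode(means=(0.0, 0.0, 0.0, 0.0), stds=(1.0, 1.0, 1.0, 1.0))
>>> deltas = encode(anchor, gt)      # regression targets for each anchor
>>> decode = P.BoundingBoxDecode(means=(0.0, 0.0, 0.0, 0.0), stds=(1.0, 1.0, 1.0, 1.0),
...                              max_shape=(768, 1280), wh_ratio_clip=0.016)
>>> boxes = decode(anchor, deltas)   # should roughly recover `gt`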
| @@ -175,14 +175,14 @@ class BoundingBoxDecode(PrimitiveWithInfer): | |||||
| class CheckValid(PrimitiveWithInfer): | class CheckValid(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Check bounding box. | |||||
| Checks bounding box. | |||||
| Check whether the bounding box cross data and data border are valid. | |||||
| Checks whether the bounding boxes are valid, i.e. whether they stay within the borders of the data. | |||||
| Inputs: | Inputs: | ||||
| - **bboxes** (Tensor) - Bounding boxes tensor with shape (N, 4). Data type should be float16 or float32. | |||||
| - **bboxes** (Tensor) - Bounding boxes tensor with shape (N, 4). Data type must be float16 or float32. | |||||
| - **img_metas** (Tensor) - Raw image size information with the format of (height, width, ratio). | - **img_metas** (Tensor) - Raw image size information with the format of (height, width, ratio). | ||||
| Data type should be float16 or float32. | |||||
| Data type must be float16 or float32. | |||||
| Outputs: | Outputs: | ||||
| Tensor, the validated tensor. | Tensor, the validated tensor. | ||||
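A hedged sketch of CheckValid; the (height, width, ratio) values are illustrative, and the output is expected to be a boolean mask with one entry per box:
>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> bboxes = Tensor(np.linspace(0, 6, 12).reshape(3, 4), mindspore.float16)
>>> img_metas = Tensor(np.array([2, 1, 1]), mindspore.float16)
>>> P.CheckValid()(bboxes, img_metas)   # bool mask over the 3 boxes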
| @@ -228,9 +228,9 @@ class CheckValid(PrimitiveWithInfer): | |||||
| class IOU(PrimitiveWithInfer): | class IOU(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Calculate intersection over union for boxes. | |||||
| Calculates intersection over union for boxes. | |||||
| Compute the intersection over union (IOU) or the intersection over foreground (IOF) based on the ground-truth and | |||||
| Computes the intersection over union (IOU) or the intersection over foreground (IOF) based on the ground-truth and | |||||
| predicted regions. | predicted regions. | ||||
| .. math:: | .. math:: | ||||
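The elided formula is the usual one, IOU = area(intersection) / area(union) per anchor/ground-truth pair; a sketch with made-up float16 boxes, assuming the default 'iou' mode:
>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> iou = P.IOU()   # mode defaults to 'iou'; 'iof' computes intersection over foreground
>>> anchor_boxes = Tensor(np.random.randint(1, 5, [3, 4]).astype(np.float16))
>>> gt_boxes = Tensor(np.random.randint(1, 5, [3, 4]).astype(np.float16))
>>> overlaps = iou(anchor_boxes, gt_boxes)   # pairwise overlap matrix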
| @@ -288,7 +288,7 @@ class IOU(PrimitiveWithInfer): | |||||
| class MakeRefKey(Primitive): | class MakeRefKey(Primitive): | ||||
| """ | """ | ||||
| Make a RefKey instance by string. RefKey stores the name of Parameter, can be passed through the functions, | |||||
| Makes a RefKey instance by string. RefKey stores the name of a Parameter, can be passed through functions, | |||||
| and used as an Assign target. | and used as an Assign target. | ||||
| Args: | Args: | ||||
| @@ -328,7 +328,7 @@ class MakeRefKey(Primitive): | |||||
| class Partial(Primitive): | class Partial(Primitive): | ||||
| """ | """ | ||||
| Make a partial function instance, used for pynative mode. | |||||
| Makes a partial function instance, used for pynative mode. | |||||
| Inputs: | Inputs: | ||||
| - **args** (Union[FunctionType, Tensor]) - The function and bind arguments. | - **args** (Union[FunctionType, Tensor]) - The function and bind arguments. | ||||
| @@ -390,7 +390,7 @@ class CheckBprop(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, prim_to_check=""): | def __init__(self, prim_to_check=""): | ||||
| """init CheckBprop""" | |||||
| """Initialize CheckBprop""" | |||||
| self.prim_to_check = prim_to_check | self.prim_to_check = prim_to_check | ||||
| def infer_shape(self, xshapes, yshapes): | def infer_shape(self, xshapes, yshapes): | ||||
| @@ -437,7 +437,7 @@ class CheckBprop(PrimitiveWithInfer): | |||||
| class ConfusionMatrix(PrimitiveWithInfer): | class ConfusionMatrix(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Calculate the confusion matrix from labels and predictions. | |||||
| Calculates the confusion matrix from labels and predictions. | |||||
| Args: | Args: | ||||
| num_classes (int): The number of classes. | num_classes (int): The number of classes. | ||||
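A sketch of the op with four classes; the labels/predictions values are illustrative:
>>> import mindspore
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> confusion_matrix = P.ConfusionMatrix(num_classes=4)
>>> labels = Tensor([0, 1, 1, 3], mindspore.int32)
>>> predictions = Tensor([1, 2, 1, 3], mindspore.int32)
>>> confusion_matrix(labels, predictions)   # 4x4 matrix of counts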
| @@ -484,10 +484,10 @@ class ConfusionMatrix(PrimitiveWithInfer): | |||||
| class PopulationCount(PrimitiveWithInfer): | class PopulationCount(PrimitiveWithInfer): | ||||
| r""" | r""" | ||||
| Calculate population count. | |||||
| Calculates population count. | |||||
| Inputs: | Inputs: | ||||
| - **input** (Tensor) - The data type should be int16 or uint16. | |||||
| - **input** (Tensor) - The data type must be int16 or uint16. | |||||
| Outputs: | Outputs: | ||||
| Tensor, with the same shape as the input. | Tensor, with the same shape as the input. | ||||
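PopulationCount returns, per element, the number of set bits in the binary representation; a small sketch:
>>> import mindspore
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> pop_count = P.PopulationCount()
>>> pop_count(Tensor([0, 1, 3], mindspore.int16))   # -> [0, 1, 2]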
| @@ -512,7 +512,7 @@ class PopulationCount(PrimitiveWithInfer): | |||||
| class Push(PrimitiveWithInfer): | class Push(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Pushing the inputs of the corresponding optimizer to parameter server. | |||||
| Pushes the inputs of the corresponding optimizer to the parameter server. | |||||
| Args: | Args: | ||||
| optim_type (string): The optimizer type. Default: 'ApplyMomentum'. | optim_type (string): The optimizer type. Default: 'ApplyMomentum'. | ||||
| @@ -529,7 +529,7 @@ class Push(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, optim_type='ApplyMomentum', only_shape_indices=None): | def __init__(self, optim_type='ApplyMomentum', only_shape_indices=None): | ||||
| """init Push""" | |||||
| """Initialize Push""" | |||||
| self.add_prim_attr("primitive_target", "CPU") | self.add_prim_attr("primitive_target", "CPU") | ||||
| self.add_prim_attr("_side_effect", True) | self.add_prim_attr("_side_effect", True) | ||||
| self.init_prim_io_names(inputs=['optim_inputs', 'optim_input_shapes'], outputs=['key']) | self.init_prim_io_names(inputs=['optim_inputs', 'optim_input_shapes'], outputs=['key']) | ||||
| @@ -542,7 +542,7 @@ class Push(PrimitiveWithInfer): | |||||
| class Pull(PrimitiveWithInfer): | class Pull(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Pulling weight from parameter server. | |||||
| Pulls the weight from the parameter server. | |||||
| Inputs: | Inputs: | ||||
| - **key** (Tensor) - The key of the weight. | - **key** (Tensor) - The key of the weight. | ||||
| @@ -554,7 +554,7 @@ class Pull(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init Pull""" | |||||
| """Initialize Pull""" | |||||
| self.add_prim_attr("primitive_target", "CPU") | self.add_prim_attr("primitive_target", "CPU") | ||||
| self.init_prim_io_names(inputs=['key', 'weight'], outputs=['output']) | self.init_prim_io_names(inputs=['key', 'weight'], outputs=['output']) | ||||
| @@ -566,7 +566,7 @@ class Pull(PrimitiveWithInfer): | |||||
| class identity(Primitive): | class identity(Primitive): | ||||
| """ | """ | ||||
| Make a identify primitive, used for pynative mode. | |||||
| Makes an identity primitive, used for pynative mode. | |||||
| Inputs: | Inputs: | ||||
| - **x** (Any) - identity input value. | - **x** (Any) - identity input value. | ||||
| @@ -27,8 +27,8 @@ class StandardNormal(PrimitiveWithInfer): | |||||
| Generates random numbers according to the standard Normal (or Gaussian) random number distribution. | Generates random numbers according to the standard Normal (or Gaussian) random number distribution. | ||||
| Args: | Args: | ||||
| seed (int): Random seed. Must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2. Must be non-negative. Default: 0. | |||||
| seed (int): Random seed, must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2, must be non-negative. Default: 0. | |||||
| Inputs: | Inputs: | ||||
| - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | ||||
| @@ -44,7 +44,7 @@ class StandardNormal(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, seed=0, seed2=0): | def __init__(self, seed=0, seed2=0): | ||||
| """Init StandardNormal""" | |||||
| """Initialize StandardNormal""" | |||||
| self.init_prim_io_names(inputs=['shape'], outputs=['output']) | self.init_prim_io_names(inputs=['shape'], outputs=['output']) | ||||
| validator.check_integer("seed", seed, 0, Rel.GE, self.name) | validator.check_integer("seed", seed, 0, Rel.GE, self.name) | ||||
| validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | ||||
| @@ -89,7 +89,7 @@ class StandardLaplace(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, seed=0, seed2=0): | def __init__(self, seed=0, seed2=0): | ||||
| """Init StandardLaplace""" | |||||
| """Initialize StandardLaplace""" | |||||
| self.init_prim_io_names(inputs=['shape'], outputs=['output']) | self.init_prim_io_names(inputs=['shape'], outputs=['output']) | ||||
| validator.check_value_type('seed', seed, [int], self.name) | validator.check_value_type('seed', seed, [int], self.name) | ||||
| validator.check_value_type('seed2', seed2, [int], self.name) | validator.check_value_type('seed2', seed2, [int], self.name) | ||||
| @@ -117,18 +117,18 @@ class Gamma(PrimitiveWithInfer): | |||||
| \text{P}(x|α,β) = \frac{\exp(-x/β)}{{β^α}\cdot{\Gamma(α)}}\cdot{x^{α-1}}, | \text{P}(x|α,β) = \frac{\exp(-x/β)}{{β^α}\cdot{\Gamma(α)}}\cdot{x^{α-1}}, | ||||
| Args: | Args: | ||||
| seed (int): Random seed. Must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2. Must be non-negative. Default: 0. | |||||
| seed (int): Random seed, must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2, must be non-negative. Default: 0. | |||||
| Inputs: | Inputs: | ||||
| - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | ||||
| - **alpha** (Tensor) - The α distribution parameter. It should be greater than 0. | |||||
| - **alpha** (Tensor) - The α distribution parameter. It must be greater than 0. | |||||
| It is also known as the shape parameter with float32 data type. | It is also known as the shape parameter with float32 data type. | ||||
| - **beta** (Tensor) - The β distribution parameter. It should be greater than 0. | |||||
| - **beta** (Tensor) - The β distribution parameter. It must be greater than 0. | |||||
| It is also known as the scale parameter with float32 data type. | It is also known as the scale parameter with float32 data type. | ||||
| Outputs: | Outputs: | ||||
| Tensor. The shape should be the broadcasted shape of Input "shape" and shapes of alpha and beta. | |||||
| Tensor. The shape is the broadcasted shape of the input `shape` and the shapes of `alpha` and `beta`. | |||||
| The dtype is float32. | The dtype is float32. | ||||
| Examples: | Examples: | ||||
| @@ -141,7 +141,7 @@ class Gamma(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, seed=0, seed2=0): | def __init__(self, seed=0, seed2=0): | ||||
| """Init Gamma""" | |||||
| """Initialize Gamma""" | |||||
| self.init_prim_io_names(inputs=['shape', 'alpha', 'beta'], outputs=['output']) | self.init_prim_io_names(inputs=['shape', 'alpha', 'beta'], outputs=['output']) | ||||
| validator.check_integer("seed", seed, 0, Rel.GE, self.name) | validator.check_integer("seed", seed, 0, Rel.GE, self.name) | ||||
| validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | ||||
| @@ -172,16 +172,16 @@ class Poisson(PrimitiveWithInfer): | |||||
| \text{P}(i|μ) = \frac{\exp(-μ)μ^{i}}{i!}, | \text{P}(i|μ) = \frac{\exp(-μ)μ^{i}}{i!}, | ||||
| Args: | Args: | ||||
| seed (int): Random seed. Must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2. Must be non-negative. Default: 0. | |||||
| seed (int): Random seed, must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2, must be non-negative. Default: 0. | |||||
| Inputs: | Inputs: | ||||
| - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | ||||
| - **mean** (Tensor) - μ parameter the distribution was constructed with. The parameter defines mean number | - **mean** (Tensor) - μ parameter the distribution was constructed with. The parameter defines mean number | ||||
| of occurrences of the event. It should be greater than 0. With float32 data type. | |||||
| of occurrences of the event. It must be greater than 0, with float32 data type. | |||||
| Outputs: | Outputs: | ||||
| Tensor. Its shape should be the broadcasted shape of `shape` and the shape of `mean`. | |||||
| Tensor. Its shape is the broadcasted shape of `shape` and the shape of `mean`. | |||||
| The dtype is int32. | The dtype is int32. | ||||
| Examples: | Examples: | ||||
| @@ -193,7 +193,7 @@ class Poisson(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, seed=0, seed2=0): | def __init__(self, seed=0, seed2=0): | ||||
| """Init Poisson""" | |||||
| """Initialize Poisson""" | |||||
| self.init_prim_io_names(inputs=['shape', 'mean'], outputs=['output']) | self.init_prim_io_names(inputs=['shape', 'mean'], outputs=['output']) | ||||
| validator.check_integer("seed", seed, 0, Rel.GE, self.name) | validator.check_integer("seed", seed, 0, Rel.GE, self.name) | ||||
| validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | ||||
| @@ -223,11 +223,11 @@ class UniformInt(PrimitiveWithInfer): | |||||
| \text{P}(i|a,b) = \frac{1}{b-a+1}, | \text{P}(i|a,b) = \frac{1}{b-a+1}, | ||||
| Note: | Note: | ||||
| The number in tensor minval should be strictly less than maxval at any position after broadcasting. | |||||
| The number in tensor minval must be strictly less than maxval at any position after broadcasting. | |||||
| Args: | Args: | ||||
| seed (int): Random seed. Must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2. Must be non-negative. Default: 0. | |||||
| seed (int): Random seed, must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2, must be non-negative. Default: 0. | |||||
| Inputs: | Inputs: | ||||
| - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | ||||
| @@ -249,7 +249,7 @@ class UniformInt(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, seed=0, seed2=0): | def __init__(self, seed=0, seed2=0): | ||||
| """Init UniformInt""" | |||||
| """Initialize UniformInt""" | |||||
| self.init_prim_io_names(inputs=['shape', 'minval', 'maxval'], outputs=['output']) | self.init_prim_io_names(inputs=['shape', 'minval', 'maxval'], outputs=['output']) | ||||
| validator.check_integer("seed", seed, 0, Rel.GE, self.name) | validator.check_integer("seed", seed, 0, Rel.GE, self.name) | ||||
| validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | ||||
| @@ -279,8 +279,8 @@ class UniformReal(PrimitiveWithInfer): | |||||
| Produces random floating-point values, uniformly distributed on the interval [0, 1). | Produces random floating-point values, uniformly distributed on the interval [0, 1). | ||||
| Args: | Args: | ||||
| seed (int): Random seed. Must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2. Must be non-negative. Default: 0. | |||||
| seed (int): Random seed, must be non-negative. Default: 0. | |||||
| seed2 (int): Random seed2, must be non-negative. Default: 0. | |||||
| Inputs: | Inputs: | ||||
| - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. | ||||
| @@ -296,7 +296,7 @@ class UniformReal(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, seed=0, seed2=0): | def __init__(self, seed=0, seed2=0): | ||||
| """Init UniformReal""" | |||||
| """Initialize UniformReal""" | |||||
| self.init_prim_io_names(inputs=['shape'], outputs=['output']) | self.init_prim_io_names(inputs=['shape'], outputs=['output']) | ||||
| validator.check_integer("seed", seed, 0, Rel.GE, self.name) | validator.check_integer("seed", seed, 0, Rel.GE, self.name) | ||||
| validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | validator.check_integer("seed2", seed2, 0, Rel.GE, self.name) | ||||
| @@ -325,13 +325,13 @@ class RandomChoiceWithMask(PrimitiveWithInfer): | |||||
| sample, while the mask tensor denotes which elements in the index tensor are valid. | sample, while the mask tensor denotes which elements in the index tensor are valid. | ||||
| Args: | Args: | ||||
| count (int): Number of items expected to get and the number should be greater than 0. Default: 256. | |||||
| count (int): Number of items expected to get; it must be greater than 0. Default: 256. | |||||
| seed (int): Random seed. Default: 0. | seed (int): Random seed. Default: 0. | ||||
| seed2 (int): Random seed2. Default: 0. | seed2 (int): Random seed2. Default: 0. | ||||
| Inputs: | Inputs: | ||||
| - **input_x** (Tensor[bool]) - The input tensor. | - **input_x** (Tensor[bool]) - The input tensor. | ||||
| The input tensor rank should be greater than or equal to 1 and less than or equal to 5. | |||||
| The input tensor rank must be greater than or equal to 1 and less than or equal to 5. | |||||
| Outputs: | Outputs: | ||||
| Two tensors, the first one is the index tensor and the other one is the mask tensor. | Two tensors, the first one is the index tensor and the other one is the mask tensor. | ||||
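A sketch of the index/mask pair described above; for a rank-2 boolean input and the default count of 256, the index tensor has shape (256, 2) and the mask has shape (256,):
>>> import numpy as np
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> rnd_choice_mask = P.RandomChoiceWithMask(count=256, seed=1)
>>> input_x = Tensor(np.ones([240000, 4]).astype(np.bool_))
>>> index, mask = rnd_choice_mask(input_x)   # index: (256, 2), mask: (256,)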
| @@ -347,7 +347,7 @@ class RandomChoiceWithMask(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, count=256, seed=0, seed2=0): | def __init__(self, count=256, seed=0, seed2=0): | ||||
| """Init RandomChoiceWithMask""" | |||||
| """Initialize RandomChoiceWithMask""" | |||||
| validator.check_value_type("count", count, [int], self.name) | validator.check_value_type("count", count, [int], self.name) | ||||
| validator.check_integer("count", count, 0, Rel.GT, self.name) | validator.check_integer("count", count, 0, Rel.GT, self.name) | ||||
| validator.check_value_type('seed', seed, [int], self.name) | validator.check_value_type('seed', seed, [int], self.name) | ||||
| @@ -368,7 +368,7 @@ class RandomCategorical(PrimitiveWithInfer): | |||||
| Generates random samples from a given categorical distribution tensor. | Generates random samples from a given categorical distribution tensor. | ||||
| Args: | Args: | ||||
| dtype (mindspore.dtype): The type of output. Its value should be one of mindspore.int16, | |||||
| dtype (mindspore.dtype): The type of output. Its value must be one of mindspore.int16, | |||||
| mindspore.int32 and mindspore.int64. Default: mindspore.int64. | mindspore.int32 and mindspore.int64. Default: mindspore.int64. | ||||
| Inputs: | Inputs: | ||||
| @@ -395,7 +395,7 @@ class RandomCategorical(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self, dtype=mstype.int64): | def __init__(self, dtype=mstype.int64): | ||||
| """Init RandomCategorical""" | |||||
| """Initialize RandomCategorical""" | |||||
| self.dtype = dtype | self.dtype = dtype | ||||
| valid_values = (mstype.int32, mstype.int16, mstype.int64) | valid_values = (mstype.int32, mstype.int16, mstype.int64) | ||||
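Since the Inputs block is elided here, this sketch assumes the usual (logits, num_sample, seed) call order; treat it as illustrative:
>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> random_categorical = P.RandomCategorical(mindspore.int64)
>>> logits = Tensor(np.random.random((10, 5)).astype(np.float32))
>>> samples = random_categorical(logits, 8, 1)   # 8 sampled class indices per row, seed 1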
| @@ -24,7 +24,7 @@ from ..primitive import PrimitiveWithInfer, prim_attr_register | |||||
| class SparseToDense(PrimitiveWithInfer): | class SparseToDense(PrimitiveWithInfer): | ||||
| """ | """ | ||||
| Convert a sparse representation into a dense tensor. | |||||
| Converts a sparse representation into a dense tensor. | |||||
| Inputs: | Inputs: | ||||
| - **indices** (Tensor) - The indices of sparse representation. | - **indices** (Tensor) - The indices of sparse representation. | ||||
| @@ -43,7 +43,7 @@ class SparseToDense(PrimitiveWithInfer): | |||||
| @prim_attr_register | @prim_attr_register | ||||
| def __init__(self): | def __init__(self): | ||||
| """init index_select""" | |||||
| """Initialize index_select""" | |||||
| self.init_prim_io_names(inputs=['indices', 'values', 'dense_shape'], outputs=['output']) | self.init_prim_io_names(inputs=['indices', 'values', 'dense_shape'], outputs=['output']) | ||||
| def __infer__(self, indices, values, dense_shape): | def __infer__(self, indices, values, dense_shape): | ||||
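Finally, a sketch of SparseToDense assuming the (indices, values, dense_shape) order from the init above; the coordinates and values are illustrative:
>>> import mindspore
>>> from mindspore import Tensor
>>> from mindspore.ops import operations as P
>>> indices = Tensor([[0, 1], [1, 2]])
>>> values = Tensor([1.0, 2.0], mindspore.float32)
>>> sparse_to_dense = P.SparseToDense()
>>> out = sparse_to_dense(indices, values, (3, 4))   # 3x4 dense tensor, zeros elsewhere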