!4121 Third round of the enhancement of API comments

Merge pull request !4121 from Simson/enhancement-API
5 years ago · 2c2fe9bed9
--- a/mindspore/nn/cell.py
+++ b/mindspore/nn/cell.py
@@ -33,7 +33,7 @@ from ..common.tensor import Tensor

 class Cell:
    """
    Base class for all neural network.
    Base class for all neural networks.

    A 'Cell' could be a single neural network cell, such as conv2d, relu, batch_norm, etc. or a composition of
    cells used to construct a network.
@@ -42,8 +42,8 @@ class Cell:
        In general, the autograd algorithm will automatically generate the implementation of the gradient function,
        but if bprop method is implemented, the gradient function
        will be replaced by the bprop. The bprop implementation will receive a Tensor `dout` containing the gradient
        of the loss w.r.t. the output, and a Tensor `out` containing the forward result. The bprop need to compute the
        gradient of the loss w.r.t. the inputs, gradient of the loss w.r.t. Parameter variables is not supported
        of the loss w.r.t. the output, and a Tensor `out` containing the forward result. The bprop needs to compute the
        gradient of the loss w.r.t. the inputs; the gradient of the loss w.r.t. Parameter variables is not supported
        currently.

    Args:
@@ -138,7 +138,7 @@ class Cell:
        """
        Update all child cells' self.param_prefix.

        After invoked, can get all the cell's children's name prefix by '_param_prefix'.
        After being invoked, the name prefix of each child cell can be obtained via '_param_prefix'.
        """
        cells_name = self.cells_and_names()

@@ -147,9 +147,9 @@ class Cell:

    def update_cell_type(self, cell_type):
        """
        Update current cell type mainly identify if quantization aware training network.
        Update the current cell type, mainly to identify whether it is a quantization aware training network.

        After invoked, can set the cell type to 'cell_type'.
        After being invoked, the cell type is set to 'cell_type'.
        """
        self.cell_type = cell_type

@@ -346,7 +346,7 @@ class Cell:
        Please refer to the usage in source code of `mindspore.common._Executor.compile`.

        Args:
            params (dict): The parameters dictionary used for init data graph.
            params (dict): The parameters dictionary used for initializing the data graph.
        """
        if params is None:
            params = self.parameters_dict()
@@ -499,7 +499,7 @@ class Cell:
        """
        Adds a child cell to the current cell.

        Inserts a subcell with given name to current cell.
        Inserts a subcell with a given name to the current cell.

        Args:
            child_name (str): Name of the child cell.
@@ -534,7 +534,7 @@ class Cell:

    def init_parameters_data(self, auto_parallel_mode=False):
        """
        Init all parameters' data and replace the original saved parameters in cell.
        Initialize all parameters and replace the original saved parameters in cell.

        Notes:
            trainable_params() and other similar interfaces may return different parameter instance after
@@ -655,7 +655,7 @@ class Cell:
        Yields parameters of this cell. If `expand` is True, yield parameters of this cell and all subcells.

        Args:
            expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, yields only parameters
            expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, only yields parameters
                           that are direct members of this cell. Default: True.

        Examples:
@@ -682,7 +682,7 @@ class Cell:

        Args:
            name_prefix (str): Namespace. Default: ''.
            expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, yields only parameters
            expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, only yields parameters
                           that are direct members of this cell. Default: True.

        Examples:
@@ -772,7 +772,7 @@ class Cell:
        return self._scope

    def generate_scope(self):
        """Generate the scope for every cell object in the network."""
        """Generate the scope for each cell object in the network."""
        for name, cell in self._children_scope_recursive():
            cell._set_scope(name)

@@ -819,14 +819,14 @@ class Cell:
        `mindspore.train.amp.build_train_network`.

        Note:
            Call multiple times will overwrite the previous.
            Calling this method multiple times will overwrite the previous setting.

        Args:
            dst_type (:class:`mindspore.dtype`): Transfer Cell to Run with dst_type.
                dst_type can be `mindspore.dtype.float16` or `mindspore.dtype.float32`.

        Raises:
            ValueError: If dst_type is not float32 or float16.
            ValueError: If dst_type is neither float32 nor float16.
        """
        if dst_type not in (mstype.float16, mstype.float32):
            raise ValueError("dst_type should inside float32 or float16.")
@@ -871,8 +871,8 @@ class Cell:
        Set the cell to auto parallel mode.

        Note:
            If a cell needs to use auto parallel or semi auto parallel mode for training, evaluation or prediction,
            this interface needs to be called for the cell.
            If a cell needs to use the auto parallel or semi auto parallel mode for training, evaluation or prediction,
            this interface needs to be called for the cell.
        """
        self._auto_parallel_mode = True
        self.add_flags(auto_parallel=True)
@@ -890,9 +890,9 @@ class Cell:
        Set the cell backward hook function. Note that this function is only supported in Pynative Mode.

        Note:
            fn should be defined as following code shows, `cell_name` is the name of registered cell,
            `grad_input` is gradient passed to the cell, `grad_output` is the gradient computed and pass to
            next cell or primitve, which may be modified and return.
            fn should be defined as the following code. `cell_name` is the name of the registered cell.
            `grad_input` is the gradient passed to the cell. `grad_output` is the gradient computed and passed to the
            next cell or primitive, which may be modified and returned.
            >>> hook_fn(cell_name, grad_input, grad_output) -> Tensor or None

        Args:
@@ -907,7 +907,7 @@ class Cell:
        Set whether the trainable parameter is updated by parameter server.

        Note:
            This only works when running task in parameter server mode.
            This only works when the task is running in parameter server mode.

        Args:
            recurse (bool): Whether to set the trainable parameters of subcells. Default: True.
--- a/mindspore/nn/layer/basic.py
+++ b/mindspore/nn/layer/basic.py
@@ -172,7 +172,7 @@ class Dense(Cell):
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
            same as input x. The values of str refer to the function `initializer`. Default: 'zeros'.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
        activation (str): activate function applied to the output of the fully connected layer, eg. 'relu'.
        activation (str): Activation function applied to the output of the fully connected layer, e.g. 'ReLU'.
            Default: None.

    Raises:
--- a/mindspore/nn/layer/container.py
+++ b/mindspore/nn/layer/container.py
@@ -236,7 +236,7 @@ class CellList(_CellListBase, Cell):
        Appends cells from a Python iterable to the end of the list.

        Raises:
            TypeError: If the cells is not a list of subcells.
            TypeError: If the cells are not a list of subcells.
        """
        if not isinstance(cells, list):
            raise TypeError('Cells {} should be list of subcells'.format(cells))
--- a/mindspore/nn/layer/conv.py
+++ b/mindspore/nn/layer/conv.py
@@ -111,11 +111,11 @@ class Conv2d(_Conv):

        out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,

    where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
    where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
    of kernel and it has shape :math:`(\text{ks_h}, \text{ks_w})`, where :math:`\text{ks_h}` and
    :math:`\text{ks_w}` are height and width of the convolution kernel. The full kernel has shape
    :math:`\text{ks_w}` are the height and width of the convolution kernel. The full kernel has shape
    :math:`(C_{out}, C_{in} // \text{group}, \text{ks_h}, \text{ks_w})`, where group is the group number
    to split the input in the channel dimension.

@@ -132,7 +132,7 @@ class Conv2d(_Conv):
        in_channels (int): The number of input channel :math:`C_{in}`.
        out_channels (int): The number of output channel :math:`C_{out}`.
        kernel_size (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the height
            and width of the 2D convolution window. Single int means the value if for both height and width of
            and width of the 2D convolution window. Single int means the value is for both the height and the width of
            the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
            width of the kernel.
        stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
@@ -141,13 +141,13 @@ class Conv2d(_Conv):
        pad_mode (str): Specifies padding mode. The optional values are
            "same", "valid", "pad". Default: "same".

            - same: Adopts the way of completion. Output height and width will be the same as the input.
              Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible. Otherwise, the
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible. Otherwise, the
              last extra padding will be done from the bottom and the right side. If this mode is set, `padding`
              must be 0.

            - valid: Adopts the way of discarding. The possibly largest height and width of output will be return
            - valid: Adopts the way of discarding. The largest possible height and width of output will be returned
              without padding. Extra pixels will be discarded. If this mode is set, `padding`
              must be 0.

@@ -155,9 +155,9 @@ class Conv2d(_Conv):
              Tensor borders. `padding` should be greater than or equal to 0.

        padding (Union[int, tuple[int]]): Implicit paddings on both sides of the input. If `padding` is one integer,
                    the padding of top, bottom, left and right is same, equal to padding. If `padding` is tuple with
                    four integer, the padding of top, bottom, left and right equal to padding[0], padding[1],
                    padding[2], padding[3] with corresponding. Default: 0.
                    the padding of top, bottom, left and right is the same, equal to padding. If `padding` is a tuple
                    with four integers, the padding of top, bottom, left and right will be equal to padding[0],
                    padding[1], padding[2], and padding[3] accordingly. Default: 0.
        dilation (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the dilation rate
                                      to use for dilated convolution. If set to be :math:`k > 1`, there will
                                      be :math:`k - 1` pixels skipped for each sampling location. Its value should
@@ -167,7 +167,7 @@ class Conv2d(_Conv):
            divisible by the number of groups. Default: 1.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
            It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified,
            It can be a Tensor, a string, an Initializer or a number. When a string is specified,
            values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
            as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
            and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
@@ -274,10 +274,10 @@ class Conv1d(_Conv):

        out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,

    where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
    where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
    of kernel and it has shape :math:`(\text{ks_w})`, where :math:`\text{ks_w}` are width of the convolution kernel.
    of kernel and it has shape :math:`(\text{ks_w})`, where :math:`\text{ks_w}` is the width of the convolution kernel.
    The full kernel has shape :math:`(C_{out}, C_{in} // \text{group}, \text{ks_w})`, where group is the group number
    to split the input in the channel dimension.

@@ -285,8 +285,8 @@ class Conv1d(_Conv):
    :math:`\left \lfloor{1 + \frac{W_{in} + 2 \times \text{padding} - \text{ks_w} -
    (\text{ks_w} - 1) \times (\text{dilation} - 1) }{\text{stride}}} \right \rfloor` respectively.

    The first introduction can be found in paper `Gradient Based Learning Applied to Document Recognition
    <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.
    The convolution layer was first introduced in the paper `Gradient Based Learning Applied to Document
    Recognition <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@@ -298,13 +298,13 @@ class Conv1d(_Conv):
        pad_mode (str): Specifies padding mode. The optional values are
            "same", "valid", "pad". Default: "same".

            - same: Adopts the way of completion. Output width will be the same as the input.
              Total number of padding will be calculated for horizontal
            - same: Adopts the way of completion. The output width will be the same as the input.
              The total number of padding will be calculated in the horizontal
              direction and evenly distributed to left and right if possible. Otherwise, the
              last extra padding will be done from the bottom and the right side. If this mode is set, `padding`
              must be 0.

            - valid: Adopts the way of discarding. The possibly largest width of output will be return
            - valid: Adopts the way of discarding. The largest possible width of the output will be returned
              without padding. Extra pixels will be discarded. If this mode is set, `padding`
              must be 0.

@@ -320,8 +320,8 @@ class Conv1d(_Conv):
        group (int): Split filter into groups, `in_channels` and `out_channels` should be
            divisible by the number of groups. Default: 1.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
            It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified,
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): An initializer for the convolution kernel.
            It can be a Tensor, a string, an Initializer or a number. When a string is specified,
            values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
            as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
            and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
@@ -443,8 +443,8 @@ class Conv2dTranspose(_Conv):
    r"""
    2D transposed convolution layer.

    Compute a 2D transposed convolution, which is also know as a deconvolution
    (although it is not actual deconvolution).
    Compute a 2D transposed convolution, which is also known as a deconvolution
    (although it is not an actual deconvolution).

    Input is typically of shape :math:`(N, C, H, W)`, where :math:`N` is batch size and :math:`C` is channel number.

@@ -452,7 +452,7 @@ class Conv2dTranspose(_Conv):
        in_channels (int): The number of channels in the input space.
        out_channels (int): The number of channels in the output space.
        kernel_size (Union[int, tuple]): int or tuple with 2 integers, which specifies the height
            and width of the 2D convolution window. Single int means the value is for both height and width of
            and width of the 2D convolution window. Single int means the value is for both the height and the width of
            the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
            width of the kernel.
        stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
@@ -467,19 +467,19 @@ class Conv2dTranspose(_Conv):

            - valid: Adopted the way of discarding.
        padding (Union[int, tuple[int]]): Implicit paddings on both sides of the input. If `padding` is one integer,
                    the padding of top, bottom, left and right is same, equal to padding. If `padding` is tuple with
                    four integer, the padding of top, bottom, left and right equal to padding[0], padding[1],
                    padding[2], padding[3] with corresponding. Default: 0.
        dilation (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the dilation rate
                    the padding of top, bottom, left and right is the same, equal to padding. If `padding` is a tuple
                    with four integers, the padding of top, bottom, left and right will be equal to padding[0],
                    padding[1], padding[2], and padding[3] accordingly. Default: 0.
        dilation (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the dilation rate
                                      to use for dilated convolution. If set to be :math:`k > 1`, there will
                                      be :math:`k - 1` pixels skipped for each sampling location. Its value should
                                      be greater or equal to 1 and bounded by the height and width of the
                                      be greater than or equal to 1 and bounded by the height and width of the
                                      input. Default: 1.
        group (int): Split filter into groups, `in_channels` and `out_channels` should be
            divisible by the number of groups. This is not support for Davinci devices when group > 1. Default: 1.
            divisible by the number of groups. This is not supported on Davinci devices when group > 1. Default: 1.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
            It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified,
            It can be a Tensor, a string, an Initializer or a number. When a string is specified,
            values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
            as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
            and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
@@ -614,8 +614,8 @@ class Conv1dTranspose(_Conv):
    r"""
    1D transposed convolution layer.

    Compute a 1D transposed convolution, which is also know as a deconvolution
    (although it is not actual deconvolution).
    Compute a 1D transposed convolution, which is also known as a deconvolution
    (although it is not an actual deconvolution).

    Input is typically of shape :math:`(N, C, W)`, where :math:`N` is batch size and :math:`C` is channel number.

@@ -805,11 +805,11 @@ class DepthwiseConv2d(Cell):

        out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,

    where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
    where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
    of kernel and it has shape :math:`(\text{ks_h}, \text{ks_w})`, where :math:`\text{ks_h}` and
    :math:`\text{ks_w}` are height and width of the convolution kernel. The full kernel has shape
    :math:`\text{ks_w}` are the height and width of the convolution kernel. The full kernel has shape
    :math:`(C_{out}, C_{in} // \text{group}, \text{ks_h}, \text{ks_w})`, where group is the group number
    to split the input in the channel dimension.

@@ -826,7 +826,7 @@ class DepthwiseConv2d(Cell):
        in_channels (int): The number of input channel :math:`C_{in}`.
        out_channels (int): The number of output channel :math:`C_{out}`.
        kernel_size (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the height
            and width of the 2D convolution window. Single int means the value if for both height and width of
            and width of the 2D convolution window. Single int means the value is for both the height and the width of
            the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
            width of the kernel.
        stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
@@ -835,13 +835,13 @@ class DepthwiseConv2d(Cell):
        pad_mode (str): Specifies padding mode. The optional values are
            "same", "valid", "pad". Default: "same".

            - same: Adopts the way of completion. Output height and width will be the same as the input.
              Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible. Otherwise, the
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible. Otherwise, the
              last extra padding will be done from the bottom and the right side. If this mode is set, `padding`
              must be 0.

            - valid: Adopts the way of discarding. The possibly largest height and width of output will be return
            - valid: Adopts the way of discarding. The largest possible height and width of output will be returned
              without padding. Extra pixels will be discarded. If this mode is set, `padding`
              must be 0.

--- a/mindspore/nn/layer/normalization.py
+++ b/mindspore/nn/layer/normalization.py
@@ -248,7 +248,7 @@ class BatchNorm1d(_BatchNorm):
        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
        momentum (float): A floating hyperparameter of the momentum for the
            running_mean and running_var computation. Default: 0.9.
        affine (bool): A bool value when set to True, gamma and beta can be learnable. Default: True.
        affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
@@ -262,9 +262,9 @@ class BatchNorm1d(_BatchNorm):
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false,
            use the mean value and variance value of specified value. If None, training process will use the mean and
            variance of current batch data and track the running mean and variance, eval process will use the running
            mean and variance. Default: None.
            use the mean value and variance value of specified value. If None, the training process will use the mean
            and variance of current batch data and track the running mean and variance, and the evaluation process
            will use the running mean and variance. Default: None.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in})`.
@@ -324,7 +324,7 @@ class BatchNorm2d(_BatchNorm):
        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
        momentum (float): A floating hyperparameter of the momentum for the
            running_mean and running_var computation. Default: 0.9.
        affine (bool): A bool value when set to True, gamma and beta can be learnable. Default: True.
        affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
@@ -338,9 +338,9 @@ class BatchNorm2d(_BatchNorm):
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false,
            use the mean value and variance value of specified value. If None, training process will use the mean and
            variance of current batch data and track the running mean and variance, eval process will use the running
            mean and variance. Default: None.
            use the mean value and variance value of specified value. If None, the training process will use the mean
            and variance of current batch data and track the running mean and variance, and the evaluation process
            will use the running mean and variance. Default: None.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
--- a/mindspore/nn/layer/pooling.py
+++ b/mindspore/nn/layer/pooling.py
@@ -84,16 +84,16 @@ class MaxPool2d(_PoolNd):
        stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        pad_mode (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        pad_mode (str): The optional values for pad mode are "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The largest possible height and width of output
              will be returned without padding. Extra pixels will be discarded.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@@ -158,23 +158,23 @@ class AvgPool2d(_PoolNd):
        pad_mode for training only supports "same" and "valid".

    Args:
        kernel_size (Union[int, tuple[int]]): The size of kernel used to take the average value,
            is an int number that represents height and width are both kernel_size,
        kernel_size (Union[int, tuple[int]]): The size of kernel used to take the average value.
            The data type of kernel_size should be an int whose value is used for both the height and width,
            or a tuple of two int numbers that represent height and width respectively.
            Default: 1.
        stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        pad_mode (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        pad_mode (str): The optional values for pad mode are "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The largest possible height and width of output
              will be returned without padding. Extra pixels will be discarded.


    Inputs:
@@ -238,16 +238,16 @@ class AvgPool1d(_PoolNd):
        kernel_size (int): The size of kernel window used to take the average value, Default: 1.
        stride (int): The distance of kernel moving, an int number that represents
            the width of movement is strides, Default: 1.
        pad_mode (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        pad_mode (str): The optional value for pad mode, is "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The possible largest height and width of output
              will be returned without padding. Extra pixels will be discarded.


    Inputs:
--- a/mindspore/nn/layer/quant.py
+++ b/mindspore/nn/layer/quant.py
@@ -56,27 +56,27 @@ class Conv2dBnAct(Cell):
    r"""
    A combination of convolution, Batchnorm, activation layer.

    For a more Detailed overview of Conv2d op.
    This part is a more detailed overview of Conv2d op.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
        out_channels (int): The number of output channel :math:`C_{out}`.
        kernel_size (Union[int, tuple]): The data type is int or tuple with 2 integers. Specifies the height
            and width of the 2D convolution window. Single int means the value if for both height and width of
            and width of the 2D convolution window. Single int means the value is for both height and width of
            the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
            width of the kernel.
        stride (int): Specifies stride for all spatial dimensions with the same value. Value of stride should be
            greater or equal to 1 but bounded by the height and width of the input. Default: 1.
        stride (int): Specifies stride for all spatial dimensions with the same value. The value of stride should be
            greater than or equal to 1 and lower than any one of the height and width of the input. Default: 1.
        pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
        padding (int): Implicit paddings on both sides of the input. Default: 0.
        dilation (int): Specifying the dilation rate to use for dilated convolution. If set to be :math:`k > 1`,
            there will be :math:`k - 1` pixels skipped for each sampling location. Its value should be greater
            or equal to 1 and bounded by the height and width of the input. Default: 1.
            there will be :math:`k - 1` pixels skipped for each sampling location. Its value should be greater than
            or equal to 1 and lower than any one of the height and width of the input. Default: 1.
        group (int): Split filter into groups, `in_channels` and `out_channels` should be
            divisible by the number of groups. Default: 1.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
            It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified,
            It can be a Tensor, a string, an Initializer or a number. When a string is specified,
            values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
            as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
            and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
@@ -160,9 +160,9 @@ class Conv2dBnAct(Cell):

 class DenseBnAct(Cell):
    r"""
    A combination of Dense, Batchnorm, activation layer.
    A combination of Dense, Batchnorm, and the activation layer.

    For a more Detailed overview of Dense op.
    This part is a more detailed overview of Dense op.

    Args:
        in_channels (int): The number of channels in the input space.
@@ -172,11 +172,11 @@ class DenseBnAct(Cell):
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
            same as input x. The values of str refer to the function `initializer`. Default: 'zeros'.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
        activation (Cell): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None.
        has_bn (bool): Specifies to used batchnorm or not. Default: False.
        activation (Cell): The regularization function applied to the output of the layer, e.g. 'ReLU'. Default: None.
        has_bn (bool): Specifies to use batchnorm or not. Default: False.
        activation (string): Specifies activation type. The optional values are as follows:
            'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid',
            'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None.
            'Softmax', 'LogSoftmax', 'ReLU', 'ReLU6', 'Tanh', 'GELU', 'Sigmoid',
            'PReLU', 'LeakyReLU', 'h-Swish', and 'h-Sigmoid'. Default: None.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, in\_channels)`.
@@ -292,10 +292,10 @@ class FakeQuantWithMinMax(Cell):
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
        channel_axis (int): Quantization by channel axis. Default: 1.
        num_channels (int): Declares the min and max channel size. Default: 1.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - The input of FakeQuantWithMinMax.
@@ -398,7 +398,7 @@ class Conv2dBnFoldQuant(Cell):
    r"""
    2D convolution with BatchNormal op folded layer.

    For a more Detailed overview of Conv2d op.
    This part is a more detailed overview of Conv2d op.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@@ -419,13 +419,13 @@ class Conv2dBnFoldQuant(Cell):
            mean vector. Default: 'zeros'.
        var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the
            variance vector. Default: 'ones'.
        fake (bool): Conv2dBnFoldQuant Cell add FakeQuantWithMinMax op or not. Default: True.
        fake (bool): Whether Conv2dBnFoldQuant Cell adds FakeQuantWithMinMax op. Default: True.
        per_channel (bool): FakeQuantWithMinMax Parameters. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        freeze_bn (int): Quantization freeze BatchNormal op according by global step. Default: 100000.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.
        freeze_bn (int): Freezes the BatchNormal op in quantization according to the global step. Default: 100000.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@@ -585,7 +585,7 @@ class Conv2dBnWithoutFoldQuant(Cell):
    r"""
    2D convolution + batchnorm without fold with fake quant op layer.

    For a more Detailed overview of Conv2d op.
    This part is a more detailed overview of Conv2d op.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@@ -605,10 +605,10 @@ class Conv2dBnWithoutFoldQuant(Cell):
            Default: 'normal'.
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Default: 'zeros'.
        per_channel (bool): FakeQuantWithMinMax Parameters. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@@ -711,7 +711,7 @@ class Conv2dQuant(Cell):
    r"""
    2D convolution with fake quant op layer.

    For a more Detailed overview of Conv2d op.
    This part is a more detailed overview of Conv2d op.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@@ -728,10 +728,10 @@ class Conv2dQuant(Cell):
            Default: 'normal'.
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Default: 'zeros'.
        per_channel (bool): FakeQuantWithMinMax Parameters. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@@ -826,7 +826,7 @@ class DenseQuant(Cell):
    r"""
    The fully connected layer with fake quant op.

    For a more Detailed overview of Dense op.
    This part is a more detailed overview of Dense op.

    Args:
        in_channels (int): The dimension of the input space.
@@ -836,12 +836,12 @@ class DenseQuant(Cell):
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
            same as input x. The values of str refer to the function `initializer`. Default: 'zeros'.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
        activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None.
        activation (str): The regularization function applied to the output of the layer, e.g. 'relu'. Default: None.
        per_channel (bool): FakeQuantWithMinMax Parameters. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@@ -906,7 +906,7 @@ class DenseQuant(Cell):
                                                     quant_delay=quant_delay)

    def construct(self, x):
        """Use operators to construct to Dense layer."""
        """Use operators to construct the Dense layer."""
        output = self.fake_quant_weight(self.weight)
        output = self.matmul(x, output)
        if self.has_bias:
@@ -942,16 +942,16 @@ class ActQuant(_QuantActivation):

    Add Fake Quant OP after activation. It is not recommended to use this cell for Fake Quant Op.
    It will clamp the max range of the activation, and ReLU6 does the same operation.
    For a more Detailed overview of ReLU6 op.
    This part is a more detailed overview of ReLU6 op.

    Args:
        activation (Cell): Activation cell class.
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - The input of ReLU6Quant.
@@ -997,16 +997,16 @@ class LeakyReLUQuant(_QuantActivation):
    r"""
    LeakyReLUQuant activation function. Add Fake Quant OP after LeakyReLU OP.

    For a more Detailed overview of HSwish op.
    This part is a more detailed overview of LeakyReLU op.

    Args:
        activation (Cell): Activation cell class.
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - The input of LeakyReLUQuant.
@@ -1067,16 +1067,16 @@ class HSwishQuant(_QuantActivation):
    r"""
    HSwishQuant activation function. Add Fake Quant OP after HSwish OP.

    For a more Detailed overview of HSwish op.
    This part is a more detailed overview of HSwish op.

    Args:
        activation (Cell): Activation cell class.
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - The input of HSwishQuant.
@@ -1136,16 +1136,16 @@ class HSigmoidQuant(_QuantActivation):
    r"""
    HSigmoidQuant activation function. Add Fake Quant OP before and after HSigmoid OP.

    For a more Detailed overview of HSigmoid op.
    This part is a more detailed overview of HSigmoid op.

    Args:
        activation (Cell): Activation cell class.
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - The input of HSigmoidQuant.
@@ -1205,15 +1205,15 @@ class TensorAddQuant(Cell):
    r"""
    Add Fake Quant OP after TensorAdd OP.

    For a more Detailed overview of TensorAdd op.
    This part is a more detailed overview of TensorAdd op.

    Args:
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - The input of TensorAddQuant.
@@ -1257,15 +1257,15 @@ class MulQuant(Cell):
    r"""
    Add Fake Quant OP after Mul OP.

    For a more Detailed overview of Mul op.
    This part is a more detailed overview of Mul op.

    Args:
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        per_channel (bool):  Quantization granularity based on layer or on channel. Default: False.
        num_bits (int): Quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): Quantization algorithm use symmetric or not. Default: False.
        narrow_range (bool): Quantization algorithm use narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according by global step. Default: 0.
        num_bits (int): The quantization number bit, support 4 and 8bit. Default: 8.
        symmetric (bool): The quantization algorithm is symmetric or not. Default: False.
        narrow_range (bool): The quantization algorithm uses narrow range or not. Default: False.
        quant_delay (int): Quantization delay parameters according to the global step. Default: 0.

    Inputs:
        - **x** (Tensor) - The input of MulQuant.
@@ -1317,7 +1317,7 @@ class QuantBlock(Cell):
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
            same as input x. The values of str refer to the function `initializer`. Default: 'zeros'.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
        activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None.
        activation (str): The regularization function applied to the output of the layer, e.g. 'relu'. Default: None.
        batchnorm (bool): Specifies whether to use batchnorm or not. Default: None.
        activation (string): Specifies activation type. The optional values are as following:
            'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid',
--- a/mindspore/nn/loss/loss.py
+++ b/mindspore/nn/loss/loss.py
@@ -353,17 +353,18 @@ class CosineEmbeddingLoss(_Loss):

    Args:
        margin (float): Should be in [-1.0, 1.0]. Default 0.0.
        reduction (str): Specifies which reduction to apply to the output. It should be one of
          "none", "mean", "sum", meaning no reduction, reduce mean or sum on output, respectively. Default "mean".
        reduction (str): Specifies which reduction to be applied to the output. It should be one of
          "none", "mean", and "sum", meaning no reduction, reduce mean and sum on output, respectively. Default "mean".

    Inputs:
        - **input_x1** (Tensor) - Input tensor.
        - **input_x2** (Tensor) - Its shape and data type should be the same as `input_x1`'s shape and data type.
        - **y** (Tensor) - Contains value 1 or -1. Suppose `input_x1` shape is
          :math:`(x_1, x_2, x_3,..., x_R)`, then `target` shape should be :math:`(x_1, x_3, x_4, ..., x_R)`.
        - **y** (Tensor) - Contains value 1 or -1. Suppose the shape of `input_x1` is
          :math:`(x_1, x_2, x_3, ..., x_R)`, then the shape of `y` should be :math:`(x_1, x_3, x_4, ..., x_R)`.

    Outputs:
        - **loss** (Tensor) - If `reduction` is "none", its shape is the same as `y`'s shape, loss value otherwise.
        - **loss** (Tensor) - If `reduction` is "none", its shape is the same as `y`'s shape, otherwise a scalar value
          will be returned.

    Examples:
        >>> x1 = Tensor(np.array([[0.3, 0.8], [0.4, 0.3]]), mindspore.float32)
--- a/mindspore/nn/metrics/accuracy.py
+++ b/mindspore/nn/metrics/accuracy.py
@@ -21,9 +21,9 @@ class Accuracy(EvaluationBase):
    r"""
    Calculates the accuracy for classification and multilabel data.

    The accuracy class creates two local variables, correct number and total number that are used to compute the
    The accuracy class creates two local variables, the correct number and the total number that are used to compute the
    frequency with which predictions match labels. This frequency is ultimately returned as the accuracy: an
    idempotent operation that simply divides correct number by total number.
    idempotent operation that simply divides the correct number by the total number.

    .. math::
        \text{accuracy} =\frac{\text{true_positive} + \text{true_negative}}
@@ -58,17 +58,17 @@ class Accuracy(EvaluationBase):

        Args:
            inputs: Input `y_pred` and `y`. `y_pred` and `y` are a `Tensor`, a list or an array.
                For 'classification' evaluation type, `y_pred` is in most cases (not strictly) a list
                For the 'classification' evaluation type, `y_pred` is in most cases (not strictly) a list
                of floating numbers in range :math:`[0, 1]`
                and the shape is :math:`(N, C)`, where :math:`N` is the number of cases and :math:`C`
                is the number of categories. Shape of `y` can be :math:`(N, C)` with values 0 and 1 if one-hot
                encoding is used or the shape is :math:`(N,)` with integer values if index of category is used.
                For 'multilabel' evaluation type, `y_pred` and `y` can only be one-hot encoding with
                values 0 or 1. Indices with 1 indicate positive category. The shape of `y_pred` and `y`
                values 0 or 1. Indices with 1 indicate the positive category. The shape of `y_pred` and `y`
                are both :math:`(N, C)`.

        Raises:
            ValueError: If the number of the input is not 2.
            ValueError: If the number of the inputs is not 2.
        """
        if len(inputs) != 2:
            raise ValueError('Accuracy need 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
--- a/mindspore/nn/optim/adam.py
+++ b/mindspore/nn/optim/adam.py
@@ -132,7 +132,7 @@ def _check_param_value(beta1, beta2, eps, prim_name):

 class Adam(Optimizer):
    r"""
    Updates gradients by Adaptive Moment Estimation (Adam) algorithm.
    Updates gradients by the Adaptive Moment Estimation (Adam) algorithm.

    The Adam algorithm is proposed in `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_.

@@ -157,9 +157,9 @@ class Adam(Optimizer):
        weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
        on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive.

        To improve parameter groups performance, the customized order of parameters can be supported.
        To improve parameter groups performance, the customized order of parameters is supported.

        The sparse strategy is applied while the SparseGatherV2 operator being used for forward network.
        The sparse strategy is applied while the SparseGatherV2 operator is used for forward network.
        The sparse feature is under continuous development. The sparse
        behavior is currently performed on the CPU.

@@ -170,36 +170,36 @@ class Adam(Optimizer):

            - params: Required. The value should be a list of `Parameter`.

            - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
            - lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used.
              If not, the `learning_rate` in the API will be used.

            - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
            - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay
              will be used. If not, the `weight_decay` in the API will be used.

            - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
              the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
              in the value of 'order_params' should be in one of group parameters.
            - order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and
              the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
              which are in the 'order_params' should be in one of the group parameters.

        learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or graph for the learning rate.
            When the learning_rate is a Iterable or a Tensor with dimension of 1, use dynamic learning rate, then
            When the learning_rate is an Iterable or a Tensor with dimension of 1, use the dynamic learning rate, then
            the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
            use dynamic learning rate, the i-th learning rate will be calculated during the process of training
            according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor with
            dimension of 0, use fixed learning rate. Other cases are not supported. The float learning rate should be
            equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
            Default: 1e-3.
        beta1 (float): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). Default:
                       0.9.
        beta2 (float): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). Default:
                       0.999.
        beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
                       Default: 0.9.
        beta2 (float): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
                       Default: 0.999.
        eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default:
                     1e-8.
        use_locking (bool): Whether to enable a lock to protect updating variable tensors.
            If True, updating of the var, m, and v tensors will be protected by a lock.
            If False, the result is unpredictable. Default: False.
        use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients.
            If True, updates the gradients using NAG.
            If False, updates the gradients without using NAG. Default: False.
            If True, update the gradients using NAG.
            If False, update the gradients without using NAG. Default: False.
        weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0.
        loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0.

@@ -278,11 +278,11 @@ class Adam(Optimizer):

 class AdamWeightDecay(Optimizer):
    """
    Implements Adam algorithm weight decay fix.
    Implements the Adam algorithm with weight decay fix.

    Note:
        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
        weight decay is posigive. When not separating parameter groups, the `weight_decay` in the API will be applied
        weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
        on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive.

        To improve parameter groups performance, the customized order of parameters can be supported.
@@ -294,27 +294,27 @@ class AdamWeightDecay(Optimizer):

            - params: Required. The value should be a list of `Parameter`.

            - lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
            - lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used.
              If not, the `learning_rate` in the API will be used.

            - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
            - weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay
              will be used. If not, the `weight_decay` in the API will be used.

            - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
              the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
              in the value of 'order_params' should be in one of group parameters.
            - order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and
              the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
              which are in the 'order_params' should be in one of the group parameters.

        learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or graph for the learning rate.
            When the learning_rate is a Iterable or a Tensor with dimension of 1, use dynamic learning rate, then
            When the learning_rate is an Iterable or a Tensor with dimension of 1, use the dynamic learning rate, then
            the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
            use dynamic learning rate, the i-th learning rate will be calculated during the process of training
            according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor with
            dimension of 0, use fixed learning rate. Other cases are not supported. The float learning rate should be
            equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
            Default: 1e-3.
        beta1 (float): The exponential decay rate for the 1st moment estimates. Default: 0.9.
        beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9.
            Should be in range (0.0, 1.0).
        beta2 (float): The exponential decay rate for the 2nd moment estimates. Default: 0.999.
        beta2 (float): The exponential decay rate for the 2nd moment estimations. Default: 0.999.
            Should be in range (0.0, 1.0).
        eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6.
            Should be greater than 0.
--- a/mindspore/nn/wrap/cell_wrapper.py
+++ b/mindspore/nn/wrap/cell_wrapper.py
@@ -201,8 +201,8 @@ class DataWrapper(Cell):

    Args:
        network (Cell): The training network for dataset.
        dataset_types (list): The type of dataset. The list contains describes the types of the inputs.
        dataset_shapes (list): The shapes of dataset. The list contains multiple sublists that describes
        dataset_types (list): The type of dataset. The list contains the types of the inputs.
        dataset_shapes (list): The shapes of dataset. The list contains multiple sublists that describe
            the shape of the inputs.
        queue_name (str): The identification of dataset channel which specifies the dataset channel to supply
            data for the network.
--- a/mindspore/ops/operations/_grad_ops.py
+++ b/mindspore/ops/operations/_grad_ops.py
@@ -663,16 +663,16 @@ class MaxPoolGradGrad(_PoolGrad):
        strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        padding (str): The optional value for pad mode, is "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The largest possible height and width of output
              will be returned without padding. Extra pixels will be discarded.

    Inputs:
        - **origin_input** (Tensor) - Tensor with data format "NCHW", data type should be float16.
@@ -736,16 +736,16 @@ class MaxPoolGradGradWithArgmax(_PoolGrad):
        strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        padding (str): The optional value for pad mode, is "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The largest possible height and width of output
              will be returned without padding. Extra pixels will be discarded.

    Inputs:
        - **x** (Tensor) - Tensor with data format "NCHW", data type should be float16.
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -756,11 +756,11 @@ class Conv2D(PrimitiveWithInfer):

        out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,

    where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
    where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
    of kernel and it has shape :math:`(\text{ks_h}, \text{ks_w})`, where :math:`\text{ks_h}` and
    :math:`\text{ks_w}` are height and width of the convolution kernel. The full kernel has shape
    :math:`\text{ks_w}` are the height and width of the convolution kernel. The full kernel has shape
    :math:`(C_{out}, C_{in} // \text{group}, \text{ks_h}, \text{ks_w})`, where group is the group number
    to split the input in the channel dimension.

@@ -1029,7 +1029,7 @@ class _Pool(PrimitiveWithInfer):
           of two `int` for height and width. Default: 1.
        strides (Union[int, tuple[int]]): The stride of the window, that should be
            a tuple of two `int` for height and width. Default: 1.
        padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        padding (str): The optional value for pad mode, is "same" or "valid", not case sensitive.
            Default: "valid".
    """

@@ -1104,16 +1104,16 @@ class MaxPool(_Pool):
        strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        padding (str): The optional value for pad mode, is "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The largest possible height and width of output
              will be returned without padding. Extra pixels will be discarded.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@@ -1151,16 +1151,16 @@ class MaxPoolWithArgmax(_Pool):
        strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        padding (str): The optional value for pad mode, is "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The largest possible height and width of output
              will be returned without padding. Extra pixels will be discarded.


    Inputs:
@@ -1233,16 +1233,16 @@ class AvgPool(_Pool):
        strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
        padding (str): The optional value for pad mode, is "same" or "valid", not case sensitive.
            Default: "valid".

            - same: Adopts the way of completion. Output height and width will be the same as
              the input. Total number of padding will be calculated for horizontal and vertical
              direction and evenly distributed to top and bottom, left and right if possible.
            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible.
              Otherwise, the last extra padding will be done from the bottom and the right side.

            - valid: Adopts the way of discarding. The possibly largest height and width of output
              will be return without padding. Extra pixels will be discarded.
            - valid: Adopts the way of discarding. The largest possible height and width of output
              will be returned without padding. Extra pixels will be discarded.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.