From 5bd1a198ad0b8ac24727e45708cffe5973e29a24 Mon Sep 17 00:00:00 2001 From: lihongkang <[lihongkang1@huawei.com]> Date: Fri, 8 Jan 2021 18:44:19 +0800 Subject: [PATCH] fix bugs --- mindspore/nn/layer/activation.py | 46 +++++++++++--- mindspore/nn/layer/basic.py | 20 +++--- mindspore/nn/loss/loss.py | 26 ++++---- mindspore/nn/optim/momentum.py | 14 ++--- mindspore/nn/optim/rmsprop.py | 14 ++--- mindspore/nn/optim/sgd.py | 14 ++--- mindspore/ops/composite/random_ops.py | 6 +- mindspore/ops/operations/array_ops.py | 17 ++--- mindspore/ops/operations/math_ops.py | 24 +++++++ mindspore/ops/operations/nn_ops.py | 90 +++++++++++++++++---------- 10 files changed, 179 insertions(+), 92 deletions(-) diff --git a/mindspore/nn/layer/activation.py b/mindspore/nn/layer/activation.py index 67d7726338..3e8f7704a2 100644 --- a/mindspore/nn/layer/activation.py +++ b/mindspore/nn/layer/activation.py @@ -96,7 +96,11 @@ class LogSoftmax(Cell): The input is transformed by the Softmax function and then by the log function to lie in range[-inf,0). Logsoftmax is defined as: - :math:`\text{logsoftmax}(x_i) = \log \left(\frac{\exp(x_i)}{\sum_{j=0}^{n-1} \exp(x_j)}\right)`, + + .. math:: + + \text{logsoftmax}(x_i) = \log \left(\frac{\exp(x_i)}{\sum_{j=0}^{n-1} \exp(x_j)}\right), + where :math:`x_{i}` is the :math:`i`-th slice in the given dimension of the input Tensor. Args: @@ -177,8 +181,13 @@ class ReLU(Cell): r""" Rectified Linear Unit activation function. - Applies the rectified linear unit function element-wise. It returns - element-wise :math:`\max(0, x)`, specially, the neurons with the negative output + Applies the rectified linear unit function element-wise. + + .. math:: + + \text{ReLU}(x) = (x)^+ = \max(0, x), + + It returns element-wise :math:`\max(0, x)`, specially, the neurons with the negative output will be suppressed and the active neurons will stay the same. The picture about ReLU looks like this `ReLU `_. @@ -453,8 +478,13 @@ class PReLU(Cell): Applies the PReLU function element-wise. - PReLU is defined as: :math:`prelu(x_i)= \max(0, x_i) + w * \min(0, x_i)`, where :math:`x_i` - is an element of an channel of the input. + PReLU is defined as: + + .. math:: + + prelu(x_i)= \max(0, x_i) + w * \min(0, x_i), + + where :math:`x_i` is an element of an channel of the input. Here :math:`w` is a learnable parameter with a default initial value 0.25. Parameter :math:`w` has dimensionality of the argument channel. If called without argument diff --git a/mindspore/nn/layer/basic.py b/mindspore/nn/layer/basic.py index d643ecb664..b70f93bd9a 100644 --- a/mindspore/nn/layer/basic.py +++ b/mindspore/nn/layer/basic.py @@ -98,18 +98,18 @@ class Dropout(Cell): Randomly set some elements of the input tensor to zero with probability :math:`1 - keep\_prob` during training using samples from a Bernoulli distribution. - Note: - Each channel will be zeroed out independently on every construct call. + The outputs are scaled by a factor of :math:`\frac{1}{keep\_prob}` during training so + that the output layer remains at a similar scale. During inference, this + layer returns the same tensor as the input. - The outputs are scaled by a factor of :math:`\frac{1}{keep\_prob}` during training so - that the output layer remains at a similar scale. During inference, this - layer returns the same tensor as the input. + This technique is proposed in paper `Dropout: A Simple Way to Prevent Neural Networks from Overfitting + `_ and proved to be effective to reduce + over-fitting and prevents neurons from co-adaptation. 
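(For intuition, a NumPy-only sketch of the 1 / keep_prob rescaling described above; this illustrates the documented behaviour and is not the layer's implementation.)

import numpy as np

def dropout_train(x, keep_prob=0.9, seed=0):
    # Zero each element with probability 1 - keep_prob, then rescale the
    # survivors by 1 / keep_prob so the expected value of the output matches x.
    rng = np.random.default_rng(seed)
    mask = (rng.random(x.shape) < keep_prob).astype(x.dtype)
    return x * mask / keep_prob

x = np.ones((2, 4), dtype=np.float32)
print(dropout_train(x))  # training behaviour: some zeros, survivors scaled by 1 / keep_prob
# At inference time the layer is the identity, so the output is simply x.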
See more details in `Improving neural networks by + preventing co-adaptation of feature detectors + `_. - This technique is proposed in paper `Dropout: A Simple Way to Prevent Neural Networks from Overfitting - `_ and proved to be effective to reduce - over-fitting and prevents neurons from co-adaptation. See more details in `Improving neural networks by - preventing co-adaptation of feature detectors - `_. + Note: + Each channel will be zeroed out independently on every construct call. Args: keep_prob (float): The keep rate, greater than 0 and less equal than 1. E.g. rate=0.9, diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py index 3f32d4ca71..f1c0c0a740 100644 --- a/mindspore/nn/loss/loss.py +++ b/mindspore/nn/loss/loss.py @@ -512,23 +512,23 @@ class BCELoss(_Loss): r""" BCELoss creates a criterion to measure the binary cross entropy between the true labels and predicted labels. - Note: - Set the predicted labels as :math:`x`, true labels as :math:`y`, the output loss as :math:`\ell(x, y)`. - Let, + Set the predicted labels as :math:`x`, true labels as :math:`y`, the output loss as :math:`\ell(x, y)`. + Let, - .. math:: - L = \{l_1,\dots,l_N\}^\top, \quad - l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] + .. math:: + L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] - Then, + Then, - .. math:: - \ell(x, y) = \begin{cases} - L, & \text{if reduction} = \text{`none';}\\ - \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ - \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} - \end{cases} + .. math:: + \ell(x, y) = \begin{cases} + L, & \text{if reduction} = \text{`none';}\\ + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + Note: Note that the predicted labels should always be the output of sigmoid and the true labels should be numbers between 0 and 1. diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py index 8994331ad7..8b2e2e45b2 100755 --- a/mindspore/nn/optim/momentum.py +++ b/mindspore/nn/optim/momentum.py @@ -45,13 +45,6 @@ class Momentum(Optimizer): Refer to the paper on the importance of initialization and momentum in deep learning for more details. - Note: - When separating parameter groups, the weight decay in each group will be applied on the parameters if the - weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied - on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. - - To improve parameter groups performance, the customized order of parameters can be supported. - .. math:: v_{t} = v_{t-1} \ast u + gradients @@ -67,6 +60,13 @@ class Momentum(Optimizer): Here: where grad, lr, p, v and u denote the gradients, learning_rate, params, moments, and momentum respectively. + Note: + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied + on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. + + To improve parameter groups performance, the customized order of parameters can be supported. 
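For reference, a NumPy sketch of the update equations above (weight decay, loss scaling and parameter groups are ignored here; this illustrates the math, not the optimizer's implementation):

import numpy as np

def momentum_step(param, moment, grad, lr=0.01, u=0.9, use_nesterov=False):
    # v_t = v_{t-1} * u + gradients
    moment = moment * u + grad
    if use_nesterov:
        # p_t = p_{t-1} - (grad * lr + v_t * u * lr)
        param = param - (grad * lr + moment * u * lr)
    else:
        # p_t = p_{t-1} - lr * v_t
        param = param - lr * moment
    return param, moment

p = np.array([1.0, -2.0], dtype=np.float32)
v = np.zeros_like(p)
g = np.array([0.5, 0.5], dtype=np.float32)
p, v = momentum_step(p, v, g)
print(p, v)  # one step of plain (non-Nesterov) momentum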
+ Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params", diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py index cac34e5e35..fe6e6fd9bc 100644 --- a/mindspore/nn/optim/rmsprop.py +++ b/mindspore/nn/optim/rmsprop.py @@ -46,13 +46,13 @@ class RMSProp(Optimizer): The equation is as follows: - .. math:: + .. math:: s_{t} = \\rho s_{t-1} + (1 - \\rho)(\\nabla Q_{i}(w))^2 - .. math:: + .. math:: m_{t} = \\beta m_{t-1} + \\frac{\\eta} {\\sqrt{s_{t} + \\epsilon}} \\nabla Q_{i}(w) - .. math:: + .. math:: w = w - m_{t} The first equation calculates moving average of the squared gradient for @@ -60,16 +60,16 @@ class RMSProp(Optimizer): if centered is True: - .. math:: + .. math:: g_{t} = \\rho g_{t-1} + (1 - \\rho)\\nabla Q_{i}(w) - .. math:: + .. math:: s_{t} = \\rho s_{t-1} + (1 - \\rho)(\\nabla Q_{i}(w))^2 - .. math:: + .. math:: m_{t} = \\beta m_{t-1} + \\frac{\\eta} {\\sqrt{s_{t} - g_{t}^2 + \\epsilon}} \\nabla Q_{i}(w) - .. math:: + .. math:: w = w - m_{t} where :math:`w` represents `params`, which will be updated. diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py index 81d0e39676..48f1d7508f 100755 --- a/mindspore/nn/optim/sgd.py +++ b/mindspore/nn/optim/sgd.py @@ -39,13 +39,6 @@ class SGD(Optimizer): Nesterov momentum is based on the formula from paper `On the importance of initialization and momentum in deep learning `_. - Note: - When separating parameter groups, the weight decay in each group will be applied on the parameters if the - weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied - on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. - - To improve parameter groups performance, the customized order of parameters can be supported. - .. math:: v_{t+1} = u \ast v_{t} + gradient \ast (1-dampening) @@ -63,6 +56,13 @@ class SGD(Optimizer): Here : where p, v and u denote the parameters, accum, and momentum respectively. + Note: + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied + on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. + + To improve parameter groups performance, the customized order of parameters can be supported. + Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params", diff --git a/mindspore/ops/composite/random_ops.py b/mindspore/ops/composite/random_ops.py index 4d050dfb43..f8bfbad5b0 100644 --- a/mindspore/ops/composite/random_ops.py +++ b/mindspore/ops/composite/random_ops.py @@ -211,9 +211,13 @@ def gamma(shape, alpha, beta, seed=None): def poisson(shape, mean, seed=None): - """ + r""" Generates random numbers according to the Poisson random number distribution. + .. math:: + + \text{P}(i|μ) = \frac{\exp(-μ)μ^{i}}{i!} + Args: shape (tuple): The shape of random tensor to be generated. mean (Tensor): The mean μ distribution parameter. It should be greater than 0 with float32 data type. 
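For intuition about the distribution used by poisson() above, a small Python/NumPy check of the mass function P(i|μ) = exp(-μ) μ^i / i! (illustration only; it does not call the MindSpore op):

import numpy as np
from math import exp, factorial

def poisson_pmf(i, mu):
    # P(i | mu) = exp(-mu) * mu**i / i!
    return exp(-mu) * mu ** i / factorial(i)

mu = 5.0
print([round(poisson_pmf(i, mu), 4) for i in range(10)])

# Sampling from the same distribution with NumPy; the sample mean should be close to mu.
samples = np.random.default_rng(1).poisson(lam=mu, size=(4, 16))
print(samples.mean())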
diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py index 47924b891f..da97966a65 100644 --- a/mindspore/ops/operations/array_ops.py +++ b/mindspore/ops/operations/array_ops.py @@ -4104,21 +4104,22 @@ class BroadcastTo(PrimitiveWithInfer): When input shape is broadcast to target shape, it starts with the trailing dimensions. - Raises: - ValueError: Given a shape tuple, if it has several -1; or if the -1 is in an invalid position - such as one that does not have a opposing dimension in an input tensor; or if the target and - input shapes are incompatiable. - Args: shape (tuple): The target shape to broadcast. Can be fully specified, or have -1 in one position where it will be substituted by the input tensor's shape in that position, see example. Inputs: - - **input_x** (Tensor) - The input tensor. + - **input_x** (Tensor) - The input tensor. The data type should be one of the following types: float16, float32, + int32, int8, uint8. Outputs: Tensor, with the given `shape` and the same data type as `input_x`. + Raises: + ValueError: Given a shape tuple, if it has several -1; or if the -1 is in an invalid position + such as one that does not have a opposing dimension in an input tensor; or if the target and + input shapes are incompatiable. + Supported Platforms: ``Ascend`` ``GPU`` @@ -4402,7 +4403,9 @@ class ReverseSequence(PrimitiveWithInfer): class EditDistance(PrimitiveWithInfer): """ - Computes the Levebshtein Edit Distance. It is used to measure the similarity of two sequences. + Computes the Levebshtein Edit Distance. It is used to measure the similarity of two sequences. The inputs are + variable-length sequences provided by SparseTensors (hypothesis_indices, hypothesis_values, hypothesis_shape) + and (truth_indices, truth_values, truth_shape). Args: normalize (bool): If true, edit distances are normalized by length of truth. Default: True. diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py index 12bd70bcf5..deb505b5c1 100644 --- a/mindspore/ops/operations/math_ops.py +++ b/mindspore/ops/operations/math_ops.py @@ -840,6 +840,10 @@ class CumSum(PrimitiveWithInfer): """ Computes the cumulative sum of input tensor along axis. + .. math:: + + y_i = x_1 + x_2 + x_3 + ... + x_i + Args: exclusive (bool): If true, perform exclusive mode. Default: False. reverse (bool): If true, perform inverse cumulative sum. Default: False. @@ -2248,6 +2252,10 @@ class Ceil(PrimitiveWithInfer): """ Rounds a tensor up to the closest integer element-wise. + .. math:: + + out_i = [input_i] = [input_i] + 1 + Inputs: - **input_x** (Tensor) - The input tensor. It's element data type must be float16 or float32. @@ -2357,6 +2365,10 @@ class Acosh(PrimitiveWithInfer): """ Computes inverse hyperbolic cosine of the input element-wise. + .. math:: + + out_i = cosh^{-1}(input_i) + Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. @@ -2423,6 +2435,10 @@ class Asinh(PrimitiveWithInfer): """ Computes inverse hyperbolic sine of the input element-wise. + .. math:: + + out_i = sinh^{-1}(input_i) + Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. @@ -3241,6 +3257,10 @@ class ACos(PrimitiveWithInfer): """ Computes arccosine of input tensors element-wise. + .. math:: + + out_i = cos^{-1}(input_i) + Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. 
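A quick NumPy cross-check of the element-wise formulas added above for ACos, Acosh and Asinh; this only illustrates the math, it does not exercise the operators themselves:

import numpy as np

x = np.array([0.74, 0.04, 0.30, 0.56], dtype=np.float32)
print(np.arccos(x))     # ACos: out_i = cos^{-1}(input_i), defined for inputs in [-1, 1]

y = np.array([1.0, 1.5, 3.0, 100.0], dtype=np.float32)
print(np.arccosh(y))    # Acosh: out_i = cosh^{-1}(input_i), defined for inputs >= 1
print(np.arcsinh(y))    # Asinh: out_i = sinh^{-1}(input_i), defined on the whole real line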
@@ -3405,6 +3425,10 @@ class Abs(PrimitiveWithInfer): """ Returns absolute value of a tensor element-wise. + .. math:: + + out_i = |input_i| + Inputs: - **input_x** (Tensor) - The input tensor. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index 0f336974f4..4337eeef01 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -260,7 +260,8 @@ class Softsign(PrimitiveWithInfer): The function is shown as follows: .. math:: - \text{output} = \frac{\text{input_x}}{1 + \left| \text{input_x} \right|}, + + \text{SoftSign}(x) = \frac{x}{ 1 + |x|} Inputs: - **input_x** (Tensor) - The input tensor whose data type must be float16 or float32. @@ -332,6 +333,10 @@ class ReLU6(PrimitiveWithInfer): r""" Computes ReLU (Rectified Linear Unit) upper bounded by 6 of input tensors element-wise. + .. math:: + + \text{ReLU6}(x) = \min(\max(0,x), 6) + It returns :math:`\min(\max(0,x), 6)` element-wise. Inputs: @@ -437,15 +442,12 @@ class Elu(PrimitiveWithInfer): r""" Computes exponential linear: - if x < 0: - .. math:: - \text{x} = \alpha * (\exp(\text{x}) - 1) - if x >= 0: - - .. math:: - \text{x} = \text{x} + \text{x} = \begin{cases} + \alpha * (\exp(\text{x}) - 1), & \text{if x} < \text{0;}\\ + \text{x}, & \text{if x} >= \text{0.} + \end{cases} The data type of input tensor must be float. @@ -1569,8 +1571,11 @@ class MaxPoolWithArgmax(_Pool): It has the same data type as `input`. - **mask** (Tensor) - Max values' index represented by the mask. Data type is int32. + Raises: + TypeError: If the input data type is not float16 or float32. + Supported Platforms: - ``Ascend`` + ``Ascend`` ``GPU`` Examples: >>> input_tensor = Tensor(np.arange(1 * 3 * 3 * 4).reshape((1, 3, 3, 4)), mindspore.float32) @@ -2357,8 +2362,8 @@ class SGD(PrimitiveWithCheck): """ Computes the stochastic gradient descent. Momentum is optional. - Nesterov momentum is based on the formula from On the importance of - initialization and momentum in deep learning. + Nesterov momentum is based on the formula from paper 'On the importance of + initialization and momentum in deep learning '_. Note: For details, please refer to `nn.SGD` source code. @@ -3005,7 +3010,7 @@ class Gelu(PrimitiveWithInfer): class FastGelu(PrimitiveWithInfer): r""" - fast Gaussian Error Linear Units activation function. + Fast Gaussian Error Linear Units activation function. FastGelu is defined as follows: @@ -3181,7 +3186,8 @@ class LSTM(PrimitiveWithInfer): """ Performs the Long Short-Term Memory (LSTM) on the input. - For detailed information, please refer to `nn.LSTM`. + For detailed information, please refer to `nn.LSTM + `_. Supported Platforms: ``GPU`` ``CPU`` @@ -3289,14 +3295,13 @@ class SigmoidCrossEntropyWithLogits(PrimitiveWithInfer): r""" Uses the given logits to compute sigmoid cross entropy. - Note: - Sets input logits as `X`, input label as `Y`, output as `loss`. Then, + Sets input logits as `X`, input label as `Y`, output as `loss`. Then, - .. math:: - p_{ij} = sigmoid(X_{ij}) = \frac{1}{1 + e^{-X_{ij}}} + .. math:: + p_{ij} = sigmoid(X_{ij}) = \frac{1}{1 + e^{-X_{ij}}} - .. math:: - loss_{ij} = -[Y_{ij} * ln(p_{ij}) + (1 - Y_{ij})ln(1 - p_{ij})] + .. math:: + loss_{ij} = -[Y_{ij} * ln(p_{ij}) + (1 - Y_{ij})ln(1 - p_{ij})] Inputs: - **logits** (Tensor) - Input logits. @@ -4376,22 +4381,21 @@ class BinaryCrossEntropy(PrimitiveWithInfer): r""" Computes the binary cross entropy between the target and the output. 
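(For intuition, a NumPy sketch of the per-element term spelled out in the formulas below; the predictions are assumed to already be probabilities, i.e. sigmoid outputs:)

import numpy as np

x = np.array([0.9, 0.2, 0.7], dtype=np.float32)   # predicted probabilities in (0, 1)
y = np.array([1.0, 0.0, 1.0], dtype=np.float32)   # true labels
w = np.ones_like(x)                               # per-element weight w_n

l = -w * (y * np.log(x) + (1 - y) * np.log(1 - x))
print(l)          # reduction = 'none'
print(l.mean())   # reduction = 'mean'
print(l.sum())    # reduction = 'sum'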
- Note: - Sets input as :math:`x`, input label as :math:`y`, output as :math:`\ell(x, y)`. - Let, + Sets input as :math:`x`, input label as :math:`y`, output as :math:`\ell(x, y)`. + Let, - .. math:: - L = \{l_1,\dots,l_N\}^\top, \quad - l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] + .. math:: + L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] - Then, + Then, - .. math:: - \ell(x, y) = \begin{cases} - L, & \text{if reduction} = \text{'none';}\\ - \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\ - \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.} - \end{cases} + .. math:: + \ell(x, y) = \begin{cases} + L, & \text{if reduction} = \text{'none';}\\ + \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.} + \end{cases} Args: reduction (str): Specifies the reduction to be applied to the output. @@ -6568,6 +6572,21 @@ class DynamicGRUV2(PrimitiveWithInfer): r""" Applies a single-layer gated recurrent unit (GRU) to an input sequence. + .. math:: + + \begin{array}{ll} + r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ + z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ + n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ + h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} + \end{array} + + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input + at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer + at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`, + :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively. + :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. + Args: direction (str): A string identifying the direction in the op. Default: 'UNIDIRECTIONAL'. Only 'UNIDIRECTIONAL' is currently supported. @@ -6619,6 +6638,8 @@ class DynamicGRUV2(PrimitiveWithInfer): - **hidden_new** (Tensor) - A Tensor of shape :math:`(\text{num_step}, \text{batch_size}, \text{hidden_size})`. Has the same data type with input `bias_type`. + A note about the bias_type: + - If `bias_input` and `bias_hidden` both are `None`, `bias_type` is date type of `init_h`. - If `bias_input` is not `None`, `bias_type` is the date type of `bias_input`. - If `bias_input` is `None` and `bias_hidden` is not `None, `bias_type` is the date type of `bias_hidden`. @@ -6772,6 +6793,11 @@ class LRN(PrimitiveWithInfer): r""" Local Response Normalization. + .. math:: + + b_{c} = a_{c}\left(k + \frac{\alpha}{n} + \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta} + Args: depth_radius (int): Half-width of the 1-D normalization window with the shape of 0-D. bias (float): An offset (usually positive to avoid dividing by 0).
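For reference, a naive NumPy rendering of the normalization formula above, with n taken as the full window width 2 * depth_radius + 1 over an NCHW input; it is meant to clarify the math and does not claim to reproduce the operator's exact defaults or edge handling:

import numpy as np

def lrn_reference(a, depth_radius=2, bias=1.0, alpha=1e-4, beta=0.75):
    # b_c = a_c * (k + alpha / n * sum_{c'} a_{c'}^2) ** (-beta), NCHW layout,
    # summing over channels c' in [max(0, c - depth_radius), min(C - 1, c + depth_radius)].
    channels = a.shape[1]
    n = 2 * depth_radius + 1
    out = np.empty_like(a)
    for c in range(channels):
        lo = max(0, c - depth_radius)
        hi = min(channels - 1, c + depth_radius)
        sq_sum = np.sum(a[:, lo:hi + 1] ** 2, axis=1)
        out[:, c] = a[:, c] * (bias + alpha / n * sq_sum) ** (-beta)
    return out

x = np.random.default_rng(0).standard_normal((1, 8, 4, 4)).astype(np.float32)
print(lrn_reference(x).shape)   # (1, 8, 4, 4)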