From 5bd1a198ad0b8ac24727e45708cffe5973e29a24 Mon Sep 17 00:00:00 2001 From: lihongkang <[lihongkang1@huawei.com]> Date: Fri, 8 Jan 2021 18:44:19 +0800 Subject: [PATCH] fix bugs --- mindspore/nn/layer/activation.py | 46 +++++++++++--- mindspore/nn/layer/basic.py | 20 +++--- mindspore/nn/loss/loss.py | 26 ++++---- mindspore/nn/optim/momentum.py | 14 ++--- mindspore/nn/optim/rmsprop.py | 14 ++--- mindspore/nn/optim/sgd.py | 14 ++--- mindspore/ops/composite/random_ops.py | 6 +- mindspore/ops/operations/array_ops.py | 17 ++--- mindspore/ops/operations/math_ops.py | 24 +++++++ mindspore/ops/operations/nn_ops.py | 90 +++++++++++++++++---------- 10 files changed, 179 insertions(+), 92 deletions(-) diff --git a/mindspore/nn/layer/activation.py b/mindspore/nn/layer/activation.py index 67d7726338..3e8f7704a2 100644 --- a/mindspore/nn/layer/activation.py +++ b/mindspore/nn/layer/activation.py @@ -96,7 +96,11 @@ class LogSoftmax(Cell): The input is transformed by the Softmax function and then by the log function to lie in range[-inf,0). Logsoftmax is defined as: - :math:`\text{logsoftmax}(x_i) = \log \left(\frac{\exp(x_i)}{\sum_{j=0}^{n-1} \exp(x_j)}\right)`, + + .. math:: + + \text{logsoftmax}(x_i) = \log \left(\frac{\exp(x_i)}{\sum_{j=0}^{n-1} \exp(x_j)}\right), + where :math:`x_{i}` is the :math:`i`-th slice in the given dimension of the input Tensor. Args: @@ -177,8 +181,13 @@ class ReLU(Cell): r""" Rectified Linear Unit activation function. - Applies the rectified linear unit function element-wise. It returns - element-wise :math:`\max(0, x)`, specially, the neurons with the negative output + Applies the rectified linear unit function element-wise. + + .. math:: + + \text{ReLU}(x) = (x)^+ = \max(0, x), + + It returns element-wise :math:`\max(0, x)`, specially, the neurons with the negative output will be suppressed and the active neurons will stay the same. The picture about ReLU looks like this `ReLU `_. @@ -453,8 +478,13 @@ class PReLU(Cell): Applies the PReLU function element-wise. - PReLU is defined as: :math:`prelu(x_i)= \max(0, x_i) + w * \min(0, x_i)`, where :math:`x_i` - is an element of an channel of the input. + PReLU is defined as: + + .. math:: + + prelu(x_i)= \max(0, x_i) + w * \min(0, x_i), + + where :math:`x_i` is an element of an channel of the input. Here :math:`w` is a learnable parameter with a default initial value 0.25. Parameter :math:`w` has dimensionality of the argument channel. If called without argument diff --git a/mindspore/nn/layer/basic.py b/mindspore/nn/layer/basic.py index d643ecb664..b70f93bd9a 100644 --- a/mindspore/nn/layer/basic.py +++ b/mindspore/nn/layer/basic.py @@ -98,18 +98,18 @@ class Dropout(Cell): Randomly set some elements of the input tensor to zero with probability :math:`1 - keep\_prob` during training using samples from a Bernoulli distribution. - Note: - Each channel will be zeroed out independently on every construct call. + The outputs are scaled by a factor of :math:`\frac{1}{keep\_prob}` during training so + that the output layer remains at a similar scale. During inference, this + layer returns the same tensor as the input. - The outputs are scaled by a factor of :math:`\frac{1}{keep\_prob}` during training so - that the output layer remains at a similar scale. During inference, this - layer returns the same tensor as the input. + This technique is proposed in paper `Dropout: A Simple Way to Prevent Neural Networks from Overfitting + `_ and proved to be effective to reduce + over-fitting and prevents neurons from co-adaptation. 
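(For intuition, a NumPy-only sketch of the 1 / keep_prob rescaling described above; this illustrates the documented behaviour and is not the layer's implementation.)

import numpy as np

def dropout_train(x, keep_prob=0.9, seed=0):
    # Zero each element with probability 1 - keep_prob, then rescale the
    # survivors by 1 / keep_prob so the expected value of the output matches x.
    rng = np.random.default_rng(seed)
    mask = (rng.random(x.shape) < keep_prob).astype(x.dtype)
    return x * mask / keep_prob

x = np.ones((2, 4), dtype=np.float32)
print(dropout_train(x))  # training behaviour: some zeros, survivors scaled by 1 / keep_prob
# At inference time the layer is the identity, so the output is simply x.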
See more details in `Improving neural networks by + preventing co-adaptation of feature detectors + `_. - This technique is proposed in paper `Dropout: A Simple Way to Prevent Neural Networks from Overfitting - `_ and proved to be effective to reduce - over-fitting and prevents neurons from co-adaptation. See more details in `Improving neural networks by - preventing co-adaptation of feature detectors - `_. + Note: + Each channel will be zeroed out independently on every construct call. Args: keep_prob (float): The keep rate, greater than 0 and less equal than 1. E.g. rate=0.9, diff --git a/mindspore/nn/loss/loss.py b/mindspore/nn/loss/loss.py index 3f32d4ca71..f1c0c0a740 100644 --- a/mindspore/nn/loss/loss.py +++ b/mindspore/nn/loss/loss.py @@ -512,23 +512,23 @@ class BCELoss(_Loss): r""" BCELoss creates a criterion to measure the binary cross entropy between the true labels and predicted labels. - Note: - Set the predicted labels as :math:`x`, true labels as :math:`y`, the output loss as :math:`\ell(x, y)`. - Let, + Set the predicted labels as :math:`x`, true labels as :math:`y`, the output loss as :math:`\ell(x, y)`. + Let, - .. math:: - L = \{l_1,\dots,l_N\}^\top, \quad - l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] + .. math:: + L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] - Then, + Then, - .. math:: - \ell(x, y) = \begin{cases} - L, & \text{if reduction} = \text{`none';}\\ - \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ - \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} - \end{cases} + .. math:: + \ell(x, y) = \begin{cases} + L, & \text{if reduction} = \text{`none';}\\ + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + Note: Note that the predicted labels should always be the output of sigmoid and the true labels should be numbers between 0 and 1. diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py index 8994331ad7..8b2e2e45b2 100755 --- a/mindspore/nn/optim/momentum.py +++ b/mindspore/nn/optim/momentum.py @@ -45,13 +45,6 @@ class Momentum(Optimizer): Refer to the paper on the importance of initialization and momentum in deep learning for more details. - Note: - When separating parameter groups, the weight decay in each group will be applied on the parameters if the - weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied - on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. - - To improve parameter groups performance, the customized order of parameters can be supported. - .. math:: v_{t} = v_{t-1} \ast u + gradients @@ -67,6 +60,13 @@ class Momentum(Optimizer): Here: where grad, lr, p, v and u denote the gradients, learning_rate, params, moments, and momentum respectively. + Note: + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied + on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. + + To improve parameter groups performance, the customized order of parameters can be supported. 
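For reference, a NumPy sketch of the update equations above (weight decay, loss scaling and parameter groups are ignored here; this illustrates the math, not the optimizer's implementation):

import numpy as np

def momentum_step(param, moment, grad, lr=0.01, u=0.9, use_nesterov=False):
    # v_t = v_{t-1} * u + gradients
    moment = moment * u + grad
    if use_nesterov:
        # p_t = p_{t-1} - (grad * lr + v_t * u * lr)
        param = param - (grad * lr + moment * u * lr)
    else:
        # p_t = p_{t-1} - lr * v_t
        param = param - lr * moment
    return param, moment

p = np.array([1.0, -2.0], dtype=np.float32)
v = np.zeros_like(p)
g = np.array([0.5, 0.5], dtype=np.float32)
p, v = momentum_step(p, v, g)
print(p, v)  # one step of plain (non-Nesterov) momentum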
+ Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params", diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py index cac34e5e35..fe6e6fd9bc 100644 --- a/mindspore/nn/optim/rmsprop.py +++ b/mindspore/nn/optim/rmsprop.py @@ -46,13 +46,13 @@ class RMSProp(Optimizer): The equation is as follows: - .. math:: + .. math:: s_{t} = \\rho s_{t-1} + (1 - \\rho)(\\nabla Q_{i}(w))^2 - .. math:: + .. math:: m_{t} = \\beta m_{t-1} + \\frac{\\eta} {\\sqrt{s_{t} + \\epsilon}} \\nabla Q_{i}(w) - .. math:: + .. math:: w = w - m_{t} The first equation calculates moving average of the squared gradient for @@ -60,16 +60,16 @@ class RMSProp(Optimizer): if centered is True: - .. math:: + .. math:: g_{t} = \\rho g_{t-1} + (1 - \\rho)\\nabla Q_{i}(w) - .. math:: + .. math:: s_{t} = \\rho s_{t-1} + (1 - \\rho)(\\nabla Q_{i}(w))^2 - .. math:: + .. math:: m_{t} = \\beta m_{t-1} + \\frac{\\eta} {\\sqrt{s_{t} - g_{t}^2 + \\epsilon}} \\nabla Q_{i}(w) - .. math:: + .. math:: w = w - m_{t} where :math:`w` represents `params`, which will be updated. diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py index 81d0e39676..48f1d7508f 100755 --- a/mindspore/nn/optim/sgd.py +++ b/mindspore/nn/optim/sgd.py @@ -39,13 +39,6 @@ class SGD(Optimizer): Nesterov momentum is based on the formula from paper `On the importance of initialization and momentum in deep learning `_. - Note: - When separating parameter groups, the weight decay in each group will be applied on the parameters if the - weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied - on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. - - To improve parameter groups performance, the customized order of parameters can be supported. - .. math:: v_{t+1} = u \ast v_{t} + gradient \ast (1-dampening) @@ -63,6 +56,13 @@ class SGD(Optimizer): Here : where p, v and u denote the parameters, accum, and momentum respectively. + Note: + When separating parameter groups, the weight decay in each group will be applied on the parameters if the + weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied + on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive. + + To improve parameter groups performance, the customized order of parameters can be supported. + Args: params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated, the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params", diff --git a/mindspore/ops/composite/random_ops.py b/mindspore/ops/composite/random_ops.py index 4d050dfb43..f8bfbad5b0 100644 --- a/mindspore/ops/composite/random_ops.py +++ b/mindspore/ops/composite/random_ops.py @@ -211,9 +211,13 @@ def gamma(shape, alpha, beta, seed=None): def poisson(shape, mean, seed=None): - """ + r""" Generates random numbers according to the Poisson random number distribution. + .. math:: + + \text{P}(i|μ) = \frac{\exp(-μ)μ^{i}}{i!} + Args: shape (tuple): The shape of random tensor to be generated. mean (Tensor): The mean μ distribution parameter. It should be greater than 0 with float32 data type. 
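For intuition about the distribution used by poisson() above, a small Python/NumPy check of the mass function P(i|μ) = exp(-μ) μ^i / i! (illustration only; it does not call the MindSpore op):

import numpy as np
from math import exp, factorial

def poisson_pmf(i, mu):
    # P(i | mu) = exp(-mu) * mu**i / i!
    return exp(-mu) * mu ** i / factorial(i)

mu = 5.0
print([round(poisson_pmf(i, mu), 4) for i in range(10)])

# Sampling from the same distribution with NumPy; the sample mean should be close to mu.
samples = np.random.default_rng(1).poisson(lam=mu, size=(4, 16))
print(samples.mean())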
diff --git a/mindspore/ops/operations/array_ops.py b/mindspore/ops/operations/array_ops.py index 47924b891f..da97966a65 100644 --- a/mindspore/ops/operations/array_ops.py +++ b/mindspore/ops/operations/array_ops.py @@ -4104,21 +4104,22 @@ class BroadcastTo(PrimitiveWithInfer): When input shape is broadcast to target shape, it starts with the trailing dimensions. - Raises: - ValueError: Given a shape tuple, if it has several -1; or if the -1 is in an invalid position - such as one that does not have a opposing dimension in an input tensor; or if the target and - input shapes are incompatiable. - Args: shape (tuple): The target shape to broadcast. Can be fully specified, or have -1 in one position where it will be substituted by the input tensor's shape in that position, see example. Inputs: - - **input_x** (Tensor) - The input tensor. + - **input_x** (Tensor) - The input tensor. The data type should be one of the following types: float16, float32, + int32, int8, uint8. Outputs: Tensor, with the given `shape` and the same data type as `input_x`. + Raises: + ValueError: Given a shape tuple, if it has several -1; or if the -1 is in an invalid position + such as one that does not have a opposing dimension in an input tensor; or if the target and + input shapes are incompatiable. + Supported Platforms: ``Ascend`` ``GPU`` @@ -4402,7 +4403,9 @@ class ReverseSequence(PrimitiveWithInfer): class EditDistance(PrimitiveWithInfer): """ - Computes the Levebshtein Edit Distance. It is used to measure the similarity of two sequences. + Computes the Levebshtein Edit Distance. It is used to measure the similarity of two sequences. The inputs are + variable-length sequences provided by SparseTensors (hypothesis_indices, hypothesis_values, hypothesis_shape) + and (truth_indices, truth_values, truth_shape). Args: normalize (bool): If true, edit distances are normalized by length of truth. Default: True. diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py index 12bd70bcf5..deb505b5c1 100644 --- a/mindspore/ops/operations/math_ops.py +++ b/mindspore/ops/operations/math_ops.py @@ -840,6 +840,10 @@ class CumSum(PrimitiveWithInfer): """ Computes the cumulative sum of input tensor along axis. + .. math:: + + y_i = x_1 + x_2 + x_3 + ... + x_i + Args: exclusive (bool): If true, perform exclusive mode. Default: False. reverse (bool): If true, perform inverse cumulative sum. Default: False. @@ -2248,6 +2252,10 @@ class Ceil(PrimitiveWithInfer): """ Rounds a tensor up to the closest integer element-wise. + .. math:: + + out_i = [input_i] = [input_i] + 1 + Inputs: - **input_x** (Tensor) - The input tensor. It's element data type must be float16 or float32. @@ -2357,6 +2365,10 @@ class Acosh(PrimitiveWithInfer): """ Computes inverse hyperbolic cosine of the input element-wise. + .. math:: + + out_i = cosh^{-1}(input_i) + Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. @@ -2423,6 +2435,10 @@ class Asinh(PrimitiveWithInfer): """ Computes inverse hyperbolic sine of the input element-wise. + .. math:: + + out_i = sinh^{-1}(input_i) + Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. @@ -3241,6 +3257,10 @@ class ACos(PrimitiveWithInfer): """ Computes arccosine of input tensors element-wise. + .. math:: + + out_i = cos^{-1}(input_i) + Inputs: - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. 
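A quick NumPy cross-check of the element-wise formulas added above for ACos, Acosh and Asinh; this only illustrates the math, it does not exercise the operators themselves:

import numpy as np

x = np.array([0.74, 0.04, 0.30, 0.56], dtype=np.float32)
print(np.arccos(x))     # ACos: out_i = cos^{-1}(input_i), defined for inputs in [-1, 1]

y = np.array([1.0, 1.5, 3.0, 100.0], dtype=np.float32)
print(np.arccosh(y))    # Acosh: out_i = cosh^{-1}(input_i), defined for inputs >= 1
print(np.arcsinh(y))    # Asinh: out_i = sinh^{-1}(input_i), defined on the whole real line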
@@ -3405,6 +3425,10 @@ class Abs(PrimitiveWithInfer): """ Returns absolute value of a tensor element-wise. + .. math:: + + out_i = |input_i| + Inputs: - **input_x** (Tensor) - The input tensor. The shape of tensor is :math:`(x_1, x_2, ..., x_R)`. diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index 0f336974f4..4337eeef01 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -260,7 +260,8 @@ class Softsign(PrimitiveWithInfer): The function is shown as follows: .. math:: - \text{output} = \frac{\text{input_x}}{1 + \left| \text{input_x} \right|}, + + \text{SoftSign}(x) = \frac{x}{ 1 + |x|} Inputs: - **input_x** (Tensor) - The input tensor whose data type must be float16 or float32. @@ -332,6 +333,10 @@ class ReLU6(PrimitiveWithInfer): r""" Computes ReLU (Rectified Linear Unit) upper bounded by 6 of input tensors element-wise. + .. math:: + + \text{ReLU6}(x) = \min(\max(0,x), 6) + It returns :math:`\min(\max(0,x), 6)` element-wise. Inputs: @@ -437,15 +442,12 @@ class Elu(PrimitiveWithInfer): r""" Computes exponential linear: - if x < 0: - .. math:: - \text{x} = \alpha * (\exp(\text{x}) - 1) - if x >= 0: - - .. math:: - \text{x} = \text{x} + \text{x} = \begin{cases} + \alpha * (\exp(\text{x}) - 1), & \text{if x} < \text{0;}\\ + \text{x}, & \text{if x} >= \text{0.} + \end{cases} The data type of input tensor must be float. @@ -1569,8 +1571,11 @@ class MaxPoolWithArgmax(_Pool): It has the same data type as `input`. - **mask** (Tensor) - Max values' index represented by the mask. Data type is int32. + Raises: + TypeError: If the input data type is not float16 or float32. + Supported Platforms: - ``Ascend`` + ``Ascend`` ``GPU`` Examples: >>> input_tensor = Tensor(np.arange(1 * 3 * 3 * 4).reshape((1, 3, 3, 4)), mindspore.float32) @@ -2357,8 +2362,8 @@ class SGD(PrimitiveWithCheck): """ Computes the stochastic gradient descent. Momentum is optional. - Nesterov momentum is based on the formula from On the importance of - initialization and momentum in deep learning. + Nesterov momentum is based on the formula from paper 'On the importance of + initialization and momentum in deep learning '_. Note: For details, please refer to `nn.SGD` source code. @@ -3005,7 +3010,7 @@ class Gelu(PrimitiveWithInfer): class FastGelu(PrimitiveWithInfer): r""" - fast Gaussian Error Linear Units activation function. + Fast Gaussian Error Linear Units activation function. FastGelu is defined as follows: @@ -3181,7 +3186,8 @@ class LSTM(PrimitiveWithInfer): """ Performs the Long Short-Term Memory (LSTM) on the input. - For detailed information, please refer to `nn.LSTM`. + For detailed information, please refer to `nn.LSTM + `_. Supported Platforms: ``GPU`` ``CPU`` @@ -3289,14 +3295,13 @@ class SigmoidCrossEntropyWithLogits(PrimitiveWithInfer): r""" Uses the given logits to compute sigmoid cross entropy. - Note: - Sets input logits as `X`, input label as `Y`, output as `loss`. Then, + Sets input logits as `X`, input label as `Y`, output as `loss`. Then, - .. math:: - p_{ij} = sigmoid(X_{ij}) = \frac{1}{1 + e^{-X_{ij}}} + .. math:: + p_{ij} = sigmoid(X_{ij}) = \frac{1}{1 + e^{-X_{ij}}} - .. math:: - loss_{ij} = -[Y_{ij} * ln(p_{ij}) + (1 - Y_{ij})ln(1 - p_{ij})] + .. math:: + loss_{ij} = -[Y_{ij} * ln(p_{ij}) + (1 - Y_{ij})ln(1 - p_{ij})] Inputs: - **logits** (Tensor) - Input logits. @@ -4376,22 +4381,21 @@ class BinaryCrossEntropy(PrimitiveWithInfer): r""" Computes the binary cross entropy between the target and the output. 
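(For intuition, a NumPy sketch of the per-element term spelled out in the formulas below; the predictions are assumed to already be probabilities, i.e. sigmoid outputs:)

import numpy as np

x = np.array([0.9, 0.2, 0.7], dtype=np.float32)   # predicted probabilities in (0, 1)
y = np.array([1.0, 0.0, 1.0], dtype=np.float32)   # true labels
w = np.ones_like(x)                               # per-element weight w_n

l = -w * (y * np.log(x) + (1 - y) * np.log(1 - x))
print(l)          # reduction = 'none'
print(l.mean())   # reduction = 'mean'
print(l.sum())    # reduction = 'sum'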
- Note: - Sets input as :math:`x`, input label as :math:`y`, output as :math:`\ell(x, y)`. - Let, + Sets input as :math:`x`, input label as :math:`y`, output as :math:`\ell(x, y)`. + Let, - .. math:: - L = \{l_1,\dots,l_N\}^\top, \quad - l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] + .. math:: + L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right] - Then, + Then, - .. math:: - \ell(x, y) = \begin{cases} - L, & \text{if reduction} = \text{'none';}\\ - \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\ - \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.} - \end{cases} + .. math:: + \ell(x, y) = \begin{cases} + L, & \text{if reduction} = \text{'none';}\\ + \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.} + \end{cases} Args: reduction (str): Specifies the reduction to be applied to the output. @@ -6568,6 +6572,21 @@ class DynamicGRUV2(PrimitiveWithInfer): r""" Applies a single-layer gated recurrent unit (GRU) to an input sequence. + .. math:: + + \begin{array}{ll} + r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ + z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ + n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ + h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} + \end{array} + + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input + at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer + at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`, + :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively. + :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. + Args: direction (str): A string identifying the direction in the op. Default: 'UNIDIRECTIONAL'. Only 'UNIDIRECTIONAL' is currently supported. @@ -6619,6 +6638,8 @@ class DynamicGRUV2(PrimitiveWithInfer): - **hidden_new** (Tensor) - A Tensor of shape :math:`(\text{num_step}, \text{batch_size}, \text{hidden_size})`. Has the same data type with input `bias_type`. + A note about the bias_type: + - If `bias_input` and `bias_hidden` both are `None`, `bias_type` is date type of `init_h`. - If `bias_input` is not `None`, `bias_type` is the date type of `bias_input`. - If `bias_input` is `None` and `bias_hidden` is not `None, `bias_type` is the date type of `bias_hidden`. @@ -6772,6 +6793,11 @@ class LRN(PrimitiveWithInfer): r""" Local Response Normalization. + .. math:: + + b_{c} = a_{c}\left(k + \frac{\alpha}{n} + \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta} + Args: depth_radius (int): Half-width of the 1-D normalization window with the shape of 0-D. bias (float): An offset (usually positive to avoid dividing by 0).
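For reference, a naive NumPy rendering of the normalization formula above, with n taken as the full window width 2 * depth_radius + 1 over an NCHW input; it is meant to clarify the math and does not claim to reproduce the operator's exact defaults or edge handling:

import numpy as np

def lrn_reference(a, depth_radius=2, bias=1.0, alpha=1e-4, beta=0.75):
    # b_c = a_c * (k + alpha / n * sum_{c'} a_{c'}^2) ** (-beta), NCHW layout,
    # summing over channels c' in [max(0, c - depth_radius), min(C - 1, c + depth_radius)].
    channels = a.shape[1]
    n = 2 * depth_radius + 1
    out = np.empty_like(a)
    for c in range(channels):
        lo = max(0, c - depth_radius)
        hi = min(channels - 1, c + depth_radius)
        sq_sum = np.sum(a[:, lo:hi + 1] ** 2, axis=1)
        out[:, c] = a[:, c] * (bias + alpha / n * sq_sum) ** (-beta)
    return out

x = np.random.default_rng(0).standard_normal((1, 8, 4, 4)).astype(np.float32)
print(lrn_reference(x).shape)   # (1, 8, 4, 4)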