From 140495e2322b57bd405c98a1d3add2fab6f50924 Mon Sep 17 00:00:00 2001
From: panfei
Date: Mon, 19 Apr 2021 15:05:44 +0800
Subject: [PATCH] modify formula

---
 mindspore/nn/optim/ada_grad.py               |  4 +-
 mindspore/nn/optim/momentum.py               |  6 +-
 mindspore/nn/optim/rmsprop.py                | 22 +++---
 mindspore/ops/operations/nn_ops.py           | 82 ++++++++++----------
 model_zoo/official/cv/resnet/src/momentum.py |  6 +-
 5 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/mindspore/nn/optim/ada_grad.py b/mindspore/nn/optim/ada_grad.py
index a029502ac2..a12f470bd3 100644
--- a/mindspore/nn/optim/ada_grad.py
+++ b/mindspore/nn/optim/ada_grad.py
@@ -46,8 +46,8 @@ class Adagrad(Optimizer):
     .. math::
         \begin{array}{ll} \\
-            h_{t} = h_{t-1} + g\\
-            w_{t} = w_{t-1} - lr*\frac{1}{\sqrt{h_{t}}}*g
+            h_{t+1} = h_{t} + g*g\\
+            w_{t+1} = w_{t} - lr*\frac{1}{\sqrt{h_{t+1}}}*g
         \end{array}

     :math:`h` represents the cumulative sum of gradient squared, :math:`g` represents `gradients`.
diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py
index e103c5deab..6629f27716 100755
--- a/mindspore/nn/optim/momentum.py
+++ b/mindspore/nn/optim/momentum.py
@@ -44,17 +44,17 @@ class Momentum(Optimizer):
     Refer to the paper on the importance of initialization and momentum in deep learning for more details.

     .. math::
-        v_{t} = v_{t-1} \ast u + gradients
+        v_{t+1} = v_{t} \ast u + gradients

     If use_nesterov is True:

     .. math::
-        p_{t} = p_{t-1} - (grad \ast lr + v_{t} \ast u \ast lr)
+        p_{t+1} = p_{t} - (grad \ast lr + v_{t+1} \ast u \ast lr)

     If use_nesterov is False:

     .. math::
-        p_{t} = p_{t-1} - lr \ast v_{t}
+        p_{t+1} = p_{t} - lr \ast v_{t+1}

     Here: where grad, lr, p, v and u denote the gradients, learning_rate, params, moments, and momentum respectively.
diff --git a/mindspore/nn/optim/rmsprop.py b/mindspore/nn/optim/rmsprop.py
index dfd9314aa5..e0f141c45d 100644
--- a/mindspore/nn/optim/rmsprop.py
+++ b/mindspore/nn/optim/rmsprop.py
@@ -47,35 +47,35 @@ class RMSProp(Optimizer):
     The equation is as follows:

     .. math::
-        s_{t} = \\rho s_{t-1} + (1 - \\rho)(\\nabla Q_{i}(w))^2
+        s_{t+1} = \\rho s_{t} + (1 - \\rho)(\\nabla Q_{i}(w))^2

     .. math::
-        m_{t} = \\beta m_{t-1} + \\frac{\\eta} {\\sqrt{s_{t} + \\epsilon}} \\nabla Q_{i}(w)
+        m_{t+1} = \\beta m_{t} + \\frac{\\eta} {\\sqrt{s_{t+1} + \\epsilon}} \\nabla Q_{i}(w)

     .. math::
-        w = w - m_{t}
+        w = w - m_{t+1}

     The first equation calculates moving average of the squared gradient for
-    each weight. Then dividing the gradient by :math:`\\sqrt{ms_{t} + \\epsilon}`.
+    each weight. Then the gradient is divided by :math:`\\sqrt{s_{t+1} + \\epsilon}`.

     if centered is True:

     .. math::
-        g_{t} = \\rho g_{t-1} + (1 - \\rho)\\nabla Q_{i}(w)
+        g_{t+1} = \\rho g_{t} + (1 - \\rho)\\nabla Q_{i}(w)

     .. math::
-        s_{t} = \\rho s_{t-1} + (1 - \\rho)(\\nabla Q_{i}(w))^2
+        s_{t+1} = \\rho s_{t} + (1 - \\rho)(\\nabla Q_{i}(w))^2

     .. math::
-        m_{t} = \\beta m_{t-1} + \\frac{\\eta} {\\sqrt{s_{t} - g_{t}^2 + \\epsilon}} \\nabla Q_{i}(w)
+        m_{t+1} = \\beta m_{t} + \\frac{\\eta} {\\sqrt{s_{t+1} - g_{t+1}^2 + \\epsilon}} \\nabla Q_{i}(w)

     .. math::
-        w = w - m_{t}
+        w = w - m_{t+1}

     where :math:`w` represents `params`, which will be updated.
-    :math:`g_{t}` is mean gradients, :math:`g_{t-1}` is the last moment of :math:`g_{t}`.
-    :math:`s_{t}` is the mean square gradients, :math:`s_{t-1}` is the last moment of :math:`s_{t}`,
-    :math:`m_{t}` is moment, the delta of `w`, :math:`m_{t-1}` is the last moment of :math:`m_{t}`.
+    :math:`g_{t+1}` is the mean gradients, :math:`g_{t}` is the last moment of :math:`g_{t+1}`.
+    :math:`s_{t+1}` is the mean square gradients, :math:`s_{t}` is the last moment of :math:`s_{t+1}`,
+    :math:`m_{t+1}` is the moment, the delta of `w`, :math:`m_{t}` is the last moment of :math:`m_{t+1}`.
     :math:`\\rho` represents `decay`. :math:`\\beta` is the momentum term, represents `momentum`.
     :math:`\\epsilon` is a smoothing term to avoid division by zero, represents `epsilon`.
     :math:`\\eta` is learning rate, represents `learning_rate`. :math:`\\nabla Q_{i}(w)` is gradients,
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index 9aed786370..dbbf919afd 100644
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -2738,14 +2738,14 @@ class ApplyRMSProp(PrimitiveWithInfer):
     .. math::
         \begin{array}{ll} \\
-            s_{t} = \rho s_{t-1} + (1 - \rho)(\nabla Q_{i}(w))^2 \\
-            m_{t} = \beta m_{t-1} + \frac{\eta} {\sqrt{s_{t} + \epsilon}} \nabla Q_{i}(w) \\
-            w = w - m_{t}
+            s_{t+1} = \rho s_{t} + (1 - \rho)(\nabla Q_{i}(w))^2 \\
+            m_{t+1} = \beta m_{t} + \frac{\eta} {\sqrt{s_{t+1} + \epsilon}} \nabla Q_{i}(w) \\
+            w = w - m_{t+1}
         \end{array}

     where :math:`w` represents `var`, which will be updated.
-    :math:`s_{t}` represents `mean_square`, :math:`s_{t-1}` is the last momentent of :math:`s_{t}`,
-    :math:`m_{t}` represents `moment`, :math:`m_{t-1}` is the last momentent of :math:`m_{t}`.
+    :math:`s_{t+1}` represents `mean_square`, :math:`s_{t}` is the last moment of :math:`s_{t+1}`,
+    :math:`m_{t+1}` represents `moment`, :math:`m_{t}` is the last moment of :math:`m_{t+1}`.
     :math:`\rho` represents `decay`. :math:`\beta` is the momentum term, represents `momentum`.
     :math:`\epsilon` is a smoothing term to avoid division by zero, represents `epsilon`.
     :math:`\eta` represents `learning_rate`. :math:`\nabla Q_{i}(w)` represents `grad`.
@@ -2834,16 +2834,16 @@ class ApplyCenteredRMSProp(PrimitiveWithInfer):
     .. math::
         \begin{array}{ll} \\
-            g_{t} = \rho g_{t-1} + (1 - \rho)\nabla Q_{i}(w) \\
-            s_{t} = \rho s_{t-1} + (1 - \rho)(\nabla Q_{i}(w))^2 \\
-            m_{t} = \beta m_{t-1} + \frac{\eta} {\sqrt{s_{t} - g_{t}^2 + \epsilon}} \nabla Q_{i}(w) \\
-            w = w - m_{t}
+            g_{t+1} = \rho g_{t} + (1 - \rho)\nabla Q_{i}(w) \\
+            s_{t+1} = \rho s_{t} + (1 - \rho)(\nabla Q_{i}(w))^2 \\
+            m_{t+1} = \beta m_{t} + \frac{\eta} {\sqrt{s_{t+1} - g_{t+1}^2 + \epsilon}} \nabla Q_{i}(w) \\
+            w = w - m_{t+1}
         \end{array}

     where :math:`w` represents `var`, which will be updated.
-    :math:`g_{t}` represents `mean_gradient`, :math:`g_{t-1}` is the last momentent of :math:`g_{t}`.
-    :math:`s_{t}` represents `mean_square`, :math:`s_{t-1}` is the last momentent of :math:`s_{t}`,
-    :math:`m_{t}` represents `moment`, :math:`m_{t-1}` is the last momentent of :math:`m_{t}`.
+    :math:`g_{t+1}` represents `mean_gradient`, :math:`g_{t}` is the last moment of :math:`g_{t+1}`.
+    :math:`s_{t+1}` represents `mean_square`, :math:`s_{t}` is the last moment of :math:`s_{t+1}`,
+    :math:`m_{t+1}` represents `moment`, :math:`m_{t}` is the last moment of :math:`m_{t+1}`.
     :math:`\rho` represents `decay`. :math:`\beta` is the momentum term, represents `momentum`.
     :math:`\epsilon` is a smoothing term to avoid division by zero, represents `epsilon`.
     :math:`\eta` represents `learning_rate`. :math:`\nabla Q_{i}(w)` represents `grad`.
@@ -5025,16 +5025,16 @@ class ApplyAdaMax(PrimitiveWithInfer):
     .. math::
        \begin{array}{ll} \\
-            m_{t} = \beta_1 * m_{t-1} + (1 - \beta_1) * g \\
-            v_{t} = \max(\beta_2 * v_{t-1}, \left| g \right|) \\
-            var = var - \frac{l}{1 - \beta_1^t} * \frac{m_{t}}{v_{t} + \epsilon}
+            m_{t+1} = \beta_1 * m_{t} + (1 - \beta_1) * g \\
+            v_{t+1} = \max(\beta_2 * v_{t}, \left| g \right|) \\
+            var = var - \frac{l}{1 - \beta_1^{t+1}} * \frac{m_{t+1}}{v_{t+1} + \epsilon}
         \end{array}

-    :math:`t` represents updating step while :math:`m` represents the 1st moment vector, :math:`m_{t-1}`
-    is the last momentent of :math:`m_{t}`, :math:`v` represents the 2nd moment vector, :math:`v_{t-1}`
-    is the last momentent of :math:`v_{t}`, :math:`l` represents scaling factor `lr`,
+    :math:`t` represents the updating step while :math:`m` represents the 1st moment vector, :math:`m_{t}`
+    is the last moment of :math:`m_{t+1}`, :math:`v` represents the 2nd moment vector, :math:`v_{t}`
+    is the last moment of :math:`v_{t+1}`, :math:`l` represents scaling factor `lr`,
     :math:`g` represents `grad`, :math:`\beta_1, \beta_2` represent `beta1` and `beta2`,
-    :math:`beta_1^t` represents `beta1_power`, :math:`var` represents the variable to be updated,
+    :math:`\beta_1^{t+1}` represents `beta1_power`, :math:`var` represents the variable to be updated,
     :math:`\epsilon` represents `epsilon`.

     Inputs of `var`, `m`, `v` and `grad` comply with the implicit type conversion rules
@@ -5938,13 +5938,13 @@ class ApplyAddSign(PrimitiveWithInfer):
     .. math::
         \begin{array}{ll} \\
-            m_{t} = \beta * m_{t-1} + (1 - \beta) * g \\
+            m_{t+1} = \beta * m_{t} + (1 - \beta) * g \\
             \text{update} = (\alpha + \text{sign_decay} * sign(g) * sign(m)) * g \\
-            var = var - lr_{t} * \text{update}
+            var = var - lr_{t+1} * \text{update}
         \end{array}

-    :math:`t` represents updating step while :math:`m` represents the 1st moment vector, :math:`m_{t-1}`
-    is the last momentent of :math:`m_{t}`, :math:`lr` represents scaling factor `lr`, :math:`g` represents `grad`.
+    :math:`t` represents the updating step while :math:`m` represents the 1st moment vector, :math:`m_{t}`
+    is the last moment of :math:`m_{t+1}`, :math:`lr` represents scaling factor `lr`, :math:`g` represents `grad`.

     Inputs of `var`, `accum` and `grad` comply with the implicit type conversion rules
     to make the data types consistent.
@@ -6063,13 +6063,13 @@ class ApplyPowerSign(PrimitiveWithInfer):
     .. math::
         \begin{array}{ll} \\
-            m_{t} = \beta * m_{t-1} + (1 - \beta) * g \\
+            m_{t+1} = \beta * m_{t} + (1 - \beta) * g \\
             \text{update} = \exp(\text{logbase} * \text{sign_decay} * sign(g) * sign(m)) * g \\
-            var = var - lr_{t} * \text{update}
+            var = var - lr_{t+1} * \text{update}
         \end{array}

-    :math:`t` represents updating step while :math:`m` represents the 1st moment vector, :math:`m_{t-1}`
-    is the last momentent of :math:`m_{t}`, :math:`lr` represents scaling factor `lr`, :math:`g` represents `grad`.
+    :math:`t` represents the updating step while :math:`m` represents the 1st moment vector, :math:`m_{t}`
+    is the last moment of :math:`m_{t+1}`, :math:`lr` represents scaling factor `lr`, :math:`g` represents `grad`.

     All of inputs comply with the implicit type conversion rules to make the data types consistent.
     If `lr`, `logbase`, `sign_decay` or `beta` is a number, the number is automatically converted to Tensor,
@@ -7154,12 +7154,12 @@ class DynamicRNN(PrimitiveWithInfer):
     .. math::
        \begin{array}{ll} \\
-            i_t = \sigma(W_{ix} x_t + b_{ix} + W_{ih} h_{(t-1)} + b_{ih}) \\
-            f_t = \sigma(W_{fx} x_t + b_{fx} + W_{fh} h_{(t-1)} + b_{fh}) \\
-            \tilde{c}_t = \tanh(W_{cx} x_t + b_{cx} + W_{ch} h_{(t-1)} + b_{ch}) \\
-            o_t = \sigma(W_{ox} x_t + b_{ox} + W_{oh} h_{(t-1)} + b_{oh}) \\
-            c_t = f_t * c_{(t-1)} + i_t * \tilde{c}_t \\
-            h_t = o_t * \tanh(c_t) \\
+            i_{t+1} = \sigma(W_{ix} x_{t+1} + b_{ix} + W_{ih} h_{(t)} + b_{ih}) \\
+            f_{t+1} = \sigma(W_{fx} x_{t+1} + b_{fx} + W_{fh} h_{(t)} + b_{fh}) \\
+            \tilde{c}_{t+1} = \tanh(W_{cx} x_{t+1} + b_{cx} + W_{ch} h_{(t)} + b_{ch}) \\
+            o_{t+1} = \sigma(W_{ox} x_{t+1} + b_{ox} + W_{oh} h_{(t)} + b_{oh}) \\
+            c_{t+1} = f_{t+1} * c_{(t)} + i_{t+1} * \tilde{c}_{t+1} \\
+            h_{t+1} = o_{t+1} * \tanh(c_{t+1}) \\
         \end{array}

     Here :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. :math:`W, b`
@@ -7309,16 +7309,16 @@ class DynamicGRUV2(PrimitiveWithInfer):
     .. math::
         \begin{array}{ll}
-            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
-            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
-            n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
-            h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}
+            r_{t+1} = \sigma(W_{ir} x_{t+1} + b_{ir} + W_{hr} h_{(t)} + b_{hr}) \\
+            z_{t+1} = \sigma(W_{iz} x_{t+1} + b_{iz} + W_{hz} h_{(t)} + b_{hz}) \\
+            n_{t+1} = \tanh(W_{in} x_{t+1} + b_{in} + r_{t+1} * (W_{hn} h_{(t)}+ b_{hn})) \\
+            h_{t+1} = (1 - z_{t+1}) * n_{t+1} + z_{t+1} * h_{(t)}
         \end{array}

-    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
-    at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
-    at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
-    :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
+    where :math:`h_{t+1}` is the hidden state at time `t+1`, :math:`x_{t+1}` is the input
+    at time `t+1`, :math:`h_{t}` is the hidden state of the layer
+    at time `t` or the initial hidden state at time `0`, and :math:`r_{t+1}`,
+    :math:`z_{t+1}`, :math:`n_{t+1}` are the reset, update, and new gates, respectively.
     :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

     Args:
diff --git a/model_zoo/official/cv/resnet/src/momentum.py b/model_zoo/official/cv/resnet/src/momentum.py
index 7c2e3c5ec8..1839bbaad4 100644
--- a/model_zoo/official/cv/resnet/src/momentum.py
+++ b/model_zoo/official/cv/resnet/src/momentum.py
@@ -37,17 +37,17 @@ class Momentum(Optimizer):
     Refer to the paper on the importance of initialization and momentum in deep learning for more details.

     .. math::
-        v_{t} = v_{t-1} \ast u + gradients
+        v_{t+1} = v_{t} \ast u + gradients

     If use_nesterov is True:

     .. math::
-        p_{t} = p_{t-1} - (grad \ast lr + v_{t} \ast u \ast lr)
+        p_{t+1} = p_{t} - (grad \ast lr + v_{t+1} \ast u \ast lr)

     If use_nesterov is False:

     .. math::
-        p_{t} = p_{t-1} - lr \ast v_{t}
+        p_{t+1} = p_{t} - lr \ast v_{t+1}

     Here: where grad, lr, p, v and u denote the gradients, learning_rate, params, moments, and momentum respectively.
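Below is a minimal NumPy sketch of the re-indexed update rules documented above, kept outside the patch itself. It only illustrates the t -> t+1 bookkeeping; the function names `momentum_step` and `centered_rmsprop_step` and the toy values are hypothetical and are not part of MindSpore's API. The symbols (`p`, `v`, `u`, `lr`, `rho`, `momentum`, `eps`) follow the docstrings.

# Illustrative sketch only, not MindSpore code.
import numpy as np

def momentum_step(p, v, grad, lr, u, use_nesterov=False):
    # v_{t+1} = v_t * u + grad
    v_next = v * u + grad
    if use_nesterov:
        # p_{t+1} = p_t - (grad * lr + v_{t+1} * u * lr)
        p_next = p - (grad * lr + v_next * u * lr)
    else:
        # p_{t+1} = p_t - lr * v_{t+1}
        p_next = p - lr * v_next
    return p_next, v_next

def centered_rmsprop_step(w, g_mean, s, m, grad, lr, rho, momentum, eps):
    # g_{t+1}, s_{t+1}, m_{t+1} as in the ApplyCenteredRMSProp docstring
    g_next = rho * g_mean + (1 - rho) * grad
    s_next = rho * s + (1 - rho) * grad ** 2
    m_next = momentum * m + lr / np.sqrt(s_next - g_next ** 2 + eps) * grad
    return w - m_next, g_next, s_next, m_next

# Worked example for the plain Momentum rule with a constant toy gradient.
p, v = np.array([1.0]), np.zeros(1)
for _ in range(2):
    p, v = momentum_step(p, v, grad=np.array([0.5]), lr=0.1, u=0.9)
print(p)  # [0.855]

With p_0 = 1, v_0 = 0, a constant gradient of 0.5, lr = 0.1 and u = 0.9, working the equations by hand gives v_1 = 0.5, p_1 = 0.95, v_2 = 0.95, p_2 = 0.855, matching the printed result.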