@@ -40,7 +40,7 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, d
 beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
 eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
 lr (Tensor): Learning rate.
-weight_decay (Number): Weight decay. Should be in range [0.0, 1.0].
+weight_decay (Number): Weight decay. Should be equal to or greater than 0.
 param (Tensor): Parameters.
 m (Tensor): m value of parameters.
 v (Tensor): v value of parameters.
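For orientation, the decoupled update this helper performs looks roughly as follows. This is a minimal NumPy sketch based on the docstring above, not the fused MindSpore op; the function name is illustrative, and `decay_flag` gates the weight-decay term.

import numpy as np

def adamw_step(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, decay_flag):
    m = beta1 * m + (1 - beta1) * gradient       # 1st-moment estimate
    v = beta2 * v + (1 - beta2) * gradient ** 2  # 2nd-moment estimate
    update = m / (np.sqrt(v) + eps)
    if decay_flag:
        # decoupled weight decay: any value >= 0 is meaningful here
        update = update + weight_decay * param
    return param - lr * update, m, v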
@@ -200,8 +200,8 @@ class Adam(Optimizer):
 use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients.
     If True, updates the gradients using NAG.
     If False, updates the gradients without using NAG. Default: False.
-weight_decay (float): Weight decay (L2 penalty). It should be in range [0.0, 1.0]. Default: 0.0.
-loss_scale (float): A floating point value for the loss scale. Should be not less than 1.0. Default: 1.0.
+weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0.
+loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0.
 
 Inputs:
 - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
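Under these bounds, a construction like the following should be valid (a sketch; `Net` is a placeholder `Cell`, not part of this diff):

import mindspore.nn as nn

net = Net()  # placeholder: any Cell with trainable parameters
optim = nn.Adam(net.trainable_params(), learning_rate=1e-3,
                weight_decay=2.0,  # any non-negative value
                loss_scale=0.5)    # any positive value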
@@ -318,7 +318,7 @@ class AdamWeightDecay(Optimizer):
 Should be in range (0.0, 1.0).
 eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6.
 Should be greater than 0.
-weight_decay (float): Weight decay (L2 penalty). It should be in range [0.0, 1.0]. Default: 0.0.
+weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0.
 
 Inputs:
 - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -116,7 +116,7 @@ class FTRL(Optimizer):
 l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0.
 use_locking (bool): If True use locks for update operation. Default: False.
 loss_scale (float): Value for the loss scale. It should be equal to or greater than 1.0. Default: 1.0.
-weight_decay (float): Weight decay value to multiply weight, should be in range [0.0, 1.0]. Default: 0.0.
+weight_decay (float): Weight decay value to multiply weight, must be zero or a positive value. Default: 0.0.
 
 Inputs:
 - **grads** (tuple[Tensor]) - The gradients of `params` in optimizer, the shape is as same as the `params`
@@ -43,7 +43,7 @@ def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m, v
 beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
 eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
 lr (Tensor): Learning rate.
-weight_decay (Number): Weight decay. Should be in range [0.0, 1.0].
+weight_decay (Number): Weight decay. Should be equal to or greater than 0.
 global_step (Tensor): Global step.
 param (Tensor): Parameters.
 m (Tensor): m value of parameters.
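The layer-wise update these arguments parameterize is roughly the published LAMB rule. A NumPy sketch under that assumption (bias correction via `global_step` omitted for brevity; names are illustrative):

import numpy as np

def lamb_step(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient):
    m = beta1 * m + (1 - beta1) * gradient
    v = beta2 * v + (1 - beta2) * gradient ** 2
    update = m / (np.sqrt(v) + eps) + weight_decay * param  # decay term: any value >= 0
    w_norm = np.linalg.norm(param)
    u_norm = np.linalg.norm(update)
    # layer-wise trust ratio; fall back to 1.0 when either norm is zero
    trust_ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
    return param - lr * trust_ratio * update, m, v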
@@ -126,7 +126,7 @@ def _update_run_op_graph_kernel(beta1, beta2, eps, global_step, lr, weight_decay
 beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
 eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
 lr (Tensor): Learning rate.
-weight_decay (Number): Weight decay. Should be in range [0.0, 1.0].
+weight_decay (Number): Weight decay. Should be equal to or greater than 0.
 global_step (Tensor): Global step.
 param (Tensor): Parameters.
 m (Tensor): m value of parameters.
@@ -227,7 +227,7 @@ class Lamb(Optimizer):
 Should be in range (0.0, 1.0).
 eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6.
 Should be greater than 0.
-weight_decay (float): Weight decay (L2 penalty). Default: 0.0. Should be in range [0.0, 1.0].
+weight_decay (float): Weight decay (L2 penalty). Default: 0.0. Should be equal to or greater than 0.
 
 Inputs:
 - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -133,7 +133,7 @@ class LazyAdam(Optimizer):
 If True, updates the gradients using NAG.
 If False, updates the gradients without using NAG. Default: False.
 weight_decay (float): Weight decay (L2 penalty). Default: 0.0.
-loss_scale (float): A floating point value for the loss scale. It should be not less than 1.0. Default:
+loss_scale (float): A floating point value for the loss scale. Should be equal to or greater than 1. Default:
 1.0.
 
 Inputs:
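The "lazy" part of LazyAdam is that, for sparse gradients, only the touched rows of m and v are updated. A minimal sketch of that idea (illustrative, not the MindSpore kernel):

import numpy as np

def lazy_adam_rows(beta1, beta2, eps, lr, param, m, v, grad_values, indices):
    # only the rows listed in `indices` are touched; every other row keeps
    # its stale moment estimates, which is what makes the update "lazy"
    m[indices] = beta1 * m[indices] + (1 - beta1) * grad_values
    v[indices] = beta2 * v[indices] + (1 - beta2) * grad_values ** 2
    param[indices] -= lr * m[indices] / (np.sqrt(v[indices]) + eps)
    return param, m, v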
@@ -92,8 +92,8 @@ class Momentum(Optimizer):
 equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
 momentum (float): Hyperparameter of type float, means momentum for the moving average.
     It should be at least 0.0.
-weight_decay (int, float): Weight decay (L2 penalty). It should be in range [0.0, 1.0]. Default: 0.0.
-loss_scale (int, float): A floating point value for the loss scale. Should be not less than 1.0. Default: 1.0.
+weight_decay (int, float): Weight decay (L2 penalty). It should be equal to or greater than 0.0. Default: 0.0.
+loss_scale (int, float): A floating point value for the loss scale. It should be greater than 0.0. Default: 1.0.
 use_nesterov (bool): Enable Nesterov momentum. Default: False.
 
 Inputs:
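How the two bounded arguments enter the update, roughly (a sketch only: gradients are divided back by the loss scale before use, so any positive scale works, while the decay term only needs to be non-negative):

def momentum_step(lr, momentum, weight_decay, loss_scale, param, accum, gradient,
                  use_nesterov=False):
    gradient = gradient / loss_scale            # undoing loss scaling needs loss_scale > 0
    gradient = gradient + weight_decay * param  # L2 penalty needs weight_decay >= 0
    accum = momentum * accum + gradient
    step = gradient + momentum * accum if use_nesterov else accum
    return param - lr * step, accum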
@@ -78,9 +78,9 @@ class Optimizer(Cell):
 the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
 in the value of 'order_params' should be in one of group parameters.
-weight_decay (float): A floating point value for the weight decay. It should be in range [0.0, 1.0].
+weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
 If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
-loss_scale (float): A floating point value for the loss scale. It should be not less than 1.0. If the
+loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the
 type of `loss_scale` input is int, it will be converted to float. Default: 1.0.
 
 Raises:
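A sketch of the grouped-parameter convention described above, assuming the documented list-of-dict API; `net` is a placeholder Cell and the name filters are illustrative:

import mindspore.nn as nn

conv_params = [p for p in net.trainable_params() if 'conv' in p.name]
rest_params = [p for p in net.trainable_params() if 'conv' not in p.name]
group_params = [
    {'params': conv_params, 'weight_decay': 0.01},  # group-specific decay
    {'params': rest_params},                        # inherits the defaults
    {'order_params': net.trainable_params()},       # fixes the update order
]
optim = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9)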
@@ -102,7 +102,7 @@ class Optimizer(Cell):
 if isinstance(loss_scale, int):
     loss_scale = float(loss_scale)
 validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
-validator.check_number_range("loss_scale", loss_scale, 1.0, float("inf"), Rel.INC_LEFT, self.cls_name)
+validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, self.cls_name)
 self.loss_scale = loss_scale
 
 weight_decay = self._preprocess_weight_decay(weight_decay)
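The semantics of the new check, spelled out as a plain-Python equivalent: Rel.INC_NEITHER makes both bounds exclusive, so 0 itself is rejected while any positive float, including values below 1.0, passes.

def check_loss_scale(loss_scale):
    # equivalent of check_number_range(..., 0.0, inf, Rel.INC_NEITHER):
    # both endpoints excluded, so the accepted set is the open interval (0.0, inf)
    if not 0.0 < loss_scale < float("inf"):
        raise ValueError(f"loss_scale should be in range (0, inf), but got {loss_scale}")

check_loss_scale(0.5)  # accepted now; the old INC_LEFT bound of 1.0 rejected it
check_loss_scale(1.0)  # still accepted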
@@ -98,8 +98,8 @@ class ProximalAdagrad(Optimizer):
 l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0.
 l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0.
 use_locking (bool): If True use locks for update operation. Default: False.
-loss_scale (float): Value for the loss scale. It should be not less than 1.0. Default: 1.0.
-weight_decay (float): Weight decay value to multiply weight, should be in range [0.0, 1.0]. Default: 0.0.
+loss_scale (float): Value for the loss scale. It should be greater than 0.0. Default: 1.0.
+weight_decay (float): Weight decay value to multiply weight, must be zero or a positive value. Default: 0.0.
 
 Inputs:
 - **grads** (tuple[Tensor]) - The gradients of `params` in optimizer, the shape is as same as the `params`
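The proximal step behind this optimizer is, roughly, the standard proximal-Adagrad rule with l1/l2 regularizers. A NumPy sketch under that assumption, not the fused op:

import numpy as np

def proximal_adagrad_step(lr, l1, l2, param, accum, gradient):
    accum = accum + gradient ** 2   # Adagrad accumulator
    lr_t = lr / np.sqrt(accum)      # per-coordinate step size
    prox = param - lr_t * gradient
    # soft-threshold by l1, then shrink by l2 (both must be >= 0)
    return np.sign(prox) * np.maximum(np.abs(prox) - lr_t * l1, 0) / (1 + lr_t * l2), accum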
@@ -121,8 +121,8 @@ class RMSProp(Optimizer):
 0. Default: 1e-10.
 use_locking (bool): Enable a lock to protect the update of variable and accumlation tensors. Default: False.
 centered (bool): If True, gradients are normalized by the estimated variance of the gradient. Default: False.
-loss_scale (float): A floating point value for the loss scale. Should be not less than 1.0. Default: 1.0.
-weight_decay (float): Weight decay (L2 penalty). Should be in range [0.0, 1.0]. Default: 0.0.
+loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0.
+weight_decay (float): Weight decay (L2 penalty). Should be equal to or greater than 0. Default: 0.0.
 
 Inputs:
 - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
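What `centered` changes, in sketch form (illustrative NumPy, exact variant details may differ: the centered form subtracts the squared running mean gradient from the second-moment estimate):

import numpy as np

def rmsprop_step(lr, decay, momentum, eps, centered, param, ms, mg, mom, gradient):
    ms = decay * ms + (1 - decay) * gradient ** 2   # running mean square
    if centered:
        mg = decay * mg + (1 - decay) * gradient    # running mean gradient
        denom = np.sqrt(ms - mg ** 2 + eps)         # estimated variance
    else:
        denom = np.sqrt(ms + eps)
    mom = momentum * mom + lr * gradient / denom
    return param - mom, ms, mg, mom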
@@ -88,10 +88,11 @@ class SGD(Optimizer):
 Default: 0.1.
 momentum (float): A floating point value the momentum. should be at least 0.0. Default: 0.0.
 dampening (float): A floating point value of dampening for momentum. should be at least 0.0. Default: 0.0.
-weight_decay (float): Weight decay (L2 penalty). It should be in range [0.0, 1.0]. Default: 0.0.
+weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0.
 nesterov (bool): Enables the Nesterov momentum. If use nesterov, momentum must be positive,
     and dampening must equal to 0.0. Default: False.
-loss_scale (float): A floating point value for the loss scale. Should be not less than 1.0. Default: 1.0.
+loss_scale (float): A floating point value for the loss scale, which should be larger
+    than 0.0. Default: 1.0.
 
 Inputs:
 - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
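The nesterov constraint above (positive momentum, zero dampening) follows from how the lookahead step is formed. A rough sketch:

def sgd_step(lr, momentum, dampening, weight_decay, param, accum, gradient,
             nesterov=False):
    gradient = gradient + weight_decay * param            # needs weight_decay >= 0
    accum = momentum * accum + (1 - dampening) * gradient
    if nesterov:
        # the lookahead term assumes momentum > 0 and dampening == 0
        step = gradient + momentum * accum
    else:
        step = accum
    return param - lr * step, accum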
@@ -98,7 +98,7 @@ def test_momentum_with_loss_scale():
 net = Net(strategy1, strategy2, weight)
-optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9, loss_scale=1.0)
+optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9, loss_scale=0.5)
 net_with_loss = NetWithLoss(net, strategy3)
@@ -169,7 +169,7 @@ def test_momentum_with_loss_scale_and_dynamic_lr():
 net = Net(strategy1, strategy2, weight)
 lr = Tensor(np.ones([6]), dtype=ms.float32)
-optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9, loss_scale=1.0)
+optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9, loss_scale=0.5)
 net_with_loss = NetWithLoss(net, strategy3)