Merge pull request !1831 from ghzl/add-oder-parameters-in-group-functions
@@ -142,10 +142,12 @@ class Adam(Optimizer):
     value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
     applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+    To improve the performance of parameter groups, a customized order of parameters is supported.

     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
             the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
-            "lr" and "weight_decay" are the keys can be parsed.
+            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

             - params: Required. The value should be a list of `Parameter`.

@@ -155,6 +157,11 @@ class Adam(Optimizer):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
+            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
+              parameters, and the optimizer will follow that order. No other keys can appear in this `dict`.
+              Parameters that are in the value of 'order_params' but not in any group will use the default
+              learning rate and the default weight decay.

         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
             use dynamic learning rate, then the i-th step will

@@ -191,13 +198,16 @@ class Adam(Optimizer):
 >>>
 >>> #2) Use parameter groups and set different values
 >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
->>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
->>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
->>>                 {'params': no_conv_params}]
+>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+>>>                 {'params': bias_params, 'lr': 0.01},
+>>>                 {'order_params': net.trainable_params()}]
 >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
->>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
->>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
->>> # learning rate of 0.1 and a weight decay of 0.0.
+>>> # The parameters in conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
+>>> # The parameters in bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
+>>> # The optimizer will iterate over the parameters in the order given by 'order_params'.
+>>> # Parameters that are in 'order_params' but not in any group will use the default learning rate
+>>> # of 0.1 and the default weight decay of 0.0.
 >>>
 >>> loss = nn.SoftmaxCrossEntropyWithLogits()
 >>> model = Model(net, loss_fn=loss, optimizer=optim)
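Illustrative sketch (not part of the diff): the comments above describe the intended behavior, and the snippet below shows how the resulting per-parameter settings could be inspected after construction. It assumes a LeNet5-style network, as in the tests further down, and relies only on the `parameters`, `learning_rate` and `weight_decay` attributes exercised by those tests.

    import mindspore.nn as nn

    net = LeNet5()  # assumed: a network whose parameter names contain 'conv' and 'bias', as in the tests below
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
    group_params = [{'params': conv_params, 'weight_decay': 0.01},
                    {'params': bias_params, 'lr': 0.01},
                    {'order_params': net.trainable_params()}]
    opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)

    # The optimizer iterates parameters in the 'order_params' order; each position carries
    # the learning rate and weight decay of the group that parameter belongs to.
    for param, lr, wd in zip(opt.parameters, opt.learning_rate, opt.weight_decay):
        print(param.name, lr.name, wd)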
@@ -45,10 +45,12 @@ class Momentum(Optimizer):
     value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
     applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+    To improve the performance of parameter groups, a customized order of parameters is supported.

     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
             the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
-            "lr" and "weight_decay" are the keys can be parsed.
+            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

             - params: Required. The value should be a list of `Parameter`.

@@ -58,6 +60,11 @@ class Momentum(Optimizer):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
+            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
+              parameters, and the optimizer will follow that order. No other keys can appear in this `dict`.
+              Parameters that are in the value of 'order_params' but not in any group will use the default
+              learning rate and the default weight decay.

         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
             use dynamic learning rate, then the i-th step will

@@ -86,13 +93,16 @@ class Momentum(Optimizer):
 >>>
 >>> #2) Use parameter groups and set different values
 >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
->>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
->>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
->>>                 {'params': no_conv_params}]
+>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+>>>                 {'params': bias_params, 'lr': 0.01},
+>>>                 {'order_params': net.trainable_params()}]
 >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
->>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
->>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
->>> # learning rate of 0.1 and a weight decay of 0.0.
+>>> # The parameters in conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
+>>> # The parameters in bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
+>>> # The optimizer will iterate over the parameters in the order given by 'order_params'.
+>>> # Parameters that are in 'order_params' but not in any group will use the default learning rate
+>>> # of 0.1 and the default weight decay of 0.0.
 >>>
 >>> loss = nn.SoftmaxCrossEntropyWithLogits()
 >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
@@ -48,6 +48,8 @@ class Optimizer(Cell):
     value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
     applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+    To improve the performance of parameter groups, a customized order of parameters is supported.

     Args:
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,

@@ -60,7 +62,7 @@ class Optimizer(Cell):
             converted to float.
         parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be
             updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`,
-            the "params", "lr" and "weight_decay" are the keys can be parsed.
+            the "params", "lr", "weight_decay" and "order_params" are the keys that can be parsed.

             - params: Required. The value should be a list of `Parameter`.

@@ -70,6 +72,11 @@ class Optimizer(Cell):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
+            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
+              parameters, and the optimizer will follow that order. No other keys can appear in this `dict`.
+              Parameters that are in the value of 'order_params' but not in any group will use the default
+              learning rate and the default weight decay.

         weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
             If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
         loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the

@@ -103,6 +110,7 @@ class Optimizer(Cell):
         self.is_group = False
         self.is_group_lr = False
+        self.is_group_params_ordered = False
         self.loss_scale = loss_scale
         if isinstance(learning_rate, int):
             learning_rate = float(learning_rate)
@@ -210,9 +218,8 @@ class Optimizer(Cell):
             raise TypeError("Learning rate should be float, Tensor or Iterable.")
         return lr

-    def _init_group_params(self, parameters, learning_rate, weight_decay):
-        """Init learning rate or weight decay in group params."""
-        origin_dynamic_lr = self.dynamic_lr
+    def _parse_group_params(self, parameters, learning_rate):
+        """Parse group params."""
         if self.dynamic_lr:
             dynamic_lr_length = learning_rate.size()
         else:

@@ -220,6 +227,15 @@ class Optimizer(Cell):
         for group_param in parameters:
             lr_length = dynamic_lr_length
+            if 'order_params' in group_param.keys():
+                if len(group_param.keys()) > 1:
+                    raise ValueError("The order params dict in group parameters should "
+                                     "only include the 'order_params' key.")
+                if not isinstance(group_param['order_params'], Iterable):
+                    raise TypeError("The value of 'order_params' should be an Iterable type.")
+                self.is_group_params_ordered = True
+                continue
             if 'lr' in group_param.keys():
                 self.is_group_lr = True
                 self._get_single_lr(group_param['lr'])

@@ -229,10 +245,20 @@ class Optimizer(Cell):
                 elif isinstance(group_param['lr'], Tensor):
                     lr_length = group_param['lr'].size()
                     self.dynamic_lr = True
             if dynamic_lr_length not in (lr_length, 0):
                 raise ValueError("The dynamic learning rate in group should be the same size.")
+            if not group_param['params']:
+                raise ValueError("Optimizer got an empty group parameter list.")
             dynamic_lr_length = lr_length
+        self.dynamic_lr_length = dynamic_lr_length
+
+    def _init_group_params(self, parameters, learning_rate, weight_decay):
+        """Init learning rate or weight decay in group params."""
+        origin_dynamic_lr = self.dynamic_lr
+        self._parse_group_params(parameters, learning_rate)
         if self.dynamic_lr and not origin_dynamic_lr:
             self.gather = P.GatherV2()
             self.assignadd = P.AssignAdd()
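A hedged sketch of the two failure modes introduced by the new `_parse_group_params` checks above, assuming the LeNet5 network used by the tests below and that these checks are the first validation hit during optimizer construction:

    import pytest
    import mindspore.nn as nn

    net = LeNet5()  # assumed: the LeNet5 definition used by the tests below

    # 'order_params' must be the only key in its dict.
    mixed = [{'params': net.trainable_params(), 'order_params': net.trainable_params()}]
    with pytest.raises(ValueError):
        nn.SGD(mixed, learning_rate=0.1)

    # The value of 'order_params' must be an Iterable.
    not_iterable = [{'params': net.trainable_params()}, {'order_params': 1}]
    with pytest.raises(TypeError):
        nn.SGD(not_iterable, learning_rate=0.1)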
@@ -240,20 +266,20 @@ class Optimizer(Cell):
         params_store = []
         for group_param in parameters:
-            if not group_param['params']:
-                raise ValueError("Optimizer got an empty parameter list.")
+            if 'order_params' in group_param.keys():
+                ordered_parameters = group_param['order_params']
+                continue
             self.group_params += group_param['params']
             if 'lr' in group_param.keys():
                 params_dynamic_lr = isinstance(group_param['lr'], (Iterable, Tensor))
                 if self.dynamic_lr and not params_dynamic_lr:
-                    lr = Tensor(np.array([group_param['lr']] * dynamic_lr_length).astype(np.float32))
+                    lr = Tensor(np.array([group_param['lr']] * self.dynamic_lr_length).astype(np.float32))
                 else:
                     lr = self._get_single_lr(group_param['lr'])
             else:
                 if self.dynamic_lr and not origin_dynamic_lr:
-                    lr = Tensor(np.array([self.scalar_lr] * dynamic_lr_length).astype(np.float32))
+                    lr = Tensor(np.array([self.scalar_lr] * self.dynamic_lr_length).astype(np.float32))
                 else:
                     lr = learning_rate

@@ -273,10 +299,33 @@ class Optimizer(Cell):
                 validator.check_value_type("parameter", param, [Parameter], self.cls_name)
                 if param.name in params_store:
                     raise RuntimeError(f"The {param.name} parameter has appeared in parameter groups.")
                 params_store.append(param.name)
                 self.group_lr.append(Parameter(lr, name="lr_" + param.name))
                 self.group_weight_decay.append(weight_decay_)

+        if self.is_group_params_ordered:
+            self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay)
+
+    def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay):
+        """
+        Order the group parameters, learning rates and weight decays according to 'order_params', and assign
+        the parameters that are in the value of 'order_params' but not in any group to the default values.
+        """
+        params_length = len(ordered_parameters)
+        ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters]
+        ordered_weight_decay = [weight_decay * self.loss_scale] * params_length
+        params_name = [param.name for param in ordered_parameters]
+
+        for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay):
+            index = params_name.index(param.name)
+            ordered_learning_rate[index] = lr
+            ordered_weight_decay[index] = wd
+
+        self.group_params = list(ordered_parameters)
+        self.group_lr = ordered_learning_rate
+        self.group_weight_decay = ordered_weight_decay

     def get_lr(self):
         """
         Get the learning rate of current step.
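To make the reordering logic in `_order_and_adjust_group_params` easier to follow, here is a minimal pure-Python sketch of the same idea (illustrative only, using plain names and floats instead of `Parameter` objects): group entries are placed at the positions dictated by the ordered list, and every position not claimed by a group keeps the default learning rate and weight decay.

    def order_and_fill_defaults(ordered_names, group_entries, default_lr, default_wd):
        """group_entries maps a parameter name to its (lr, weight_decay) taken from some group."""
        lrs = [default_lr] * len(ordered_names)
        wds = [default_wd] * len(ordered_names)
        for name, (lr, wd) in group_entries.items():
            index = ordered_names.index(name)  # raises ValueError if a grouped name is missing from the order
            lrs[index] = lr
            wds[index] = wd
        return ordered_names, lrs, wds

    # 'conv.weight' gets a group weight decay, 'fc.bias' gets a group lr,
    # 'fc.weight' is in no group and keeps the defaults.
    names = ['conv.weight', 'fc.weight', 'fc.bias']
    groups = {'conv.weight': (0.1, 0.01), 'fc.bias': (0.01, 0.0)}
    print(order_and_fill_defaults(names, groups, default_lr=0.1, default_wd=0.0))
    # -> (['conv.weight', 'fc.weight', 'fc.bias'], [0.1, 0.1, 0.01], [0.01, 0.0, 0.0])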
@@ -51,6 +51,8 @@ class RMSProp(Optimizer):
     value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
     applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+    To improve the performance of parameter groups, a customized order of parameters is supported.

     Update `params` according to the RMSProp algorithm.
     The equation is as follows:

@@ -93,7 +95,7 @@ class RMSProp(Optimizer):
     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
             the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
-            "lr" and "weight_decay" are the keys can be parsed.
+            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

             - params: Required. The value should be a list of `Parameter`.

@@ -103,6 +105,11 @@ class RMSProp(Optimizer):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
+            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
+              parameters, and the optimizer will follow that order. No other keys can appear in this `dict`.
+              Parameters that are in the value of 'order_params' but not in any group will use the default
+              learning rate and the default weight decay.

         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
             use dynamic learning rate, then the i-th step will

@@ -133,13 +140,16 @@ class RMSProp(Optimizer):
 >>>
 >>> #2) Use parameter groups and set different values
 >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
->>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
->>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
->>>                 {'params': no_conv_params}]
+>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+>>>                 {'params': bias_params, 'lr': 0.01},
+>>>                 {'order_params': net.trainable_params()}]
 >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
->>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
->>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
->>> # learning rate of 0.1 and a weight decay of 0.0.
+>>> # The parameters in conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
+>>> # The parameters in bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
+>>> # The optimizer will iterate over the parameters in the order given by 'order_params'.
+>>> # Parameters that are in 'order_params' but not in any group will use the default learning rate
+>>> # of 0.1 and the default weight decay of 0.0.
 >>>
 >>> loss = nn.SoftmaxCrossEntropyWithLogits()
 >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -47,10 +47,12 @@ class SGD(Optimizer):
     value of weight_decay > 0. When not separating parameter groups, the `weight_decay` in the API will be
     applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
+    To improve the performance of parameter groups, a customized order of parameters is supported.

     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
             the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
-            "lr" and "weight_decay" are the keys can be parsed.
+            "lr", "weight_decay" and "order_params" are the keys that can be parsed.

             - params: Required. The value should be a list of `Parameter`.

@@ -60,6 +62,11 @@ class SGD(Optimizer):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
+            - order_params: Optional. If "order_params" is in the keys, the value should be the ordered list of
+              parameters, and the optimizer will follow that order. No other keys can appear in this `dict`.
+              Parameters that are in the value of 'order_params' but not in any group will use the default
+              learning rate and the default weight decay.

         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
             use dynamic learning rate, then the i-th step will

@@ -90,13 +97,16 @@ class SGD(Optimizer):
 >>>
 >>> #2) Use parameter groups and set different values
 >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
->>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
->>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
->>>                 {'params': no_conv_params}]
+>>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+>>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+>>>                 {'params': bias_params, 'lr': 0.01},
+>>>                 {'order_params': net.trainable_params()}]
 >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
->>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
->>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
->>> # learning rate of 0.1 and a weight decay of 0.0.
+>>> # The parameters in conv_params will use the default learning rate of 0.1 and a weight decay of 0.01.
+>>> # The parameters in bias_params will use a learning rate of 0.01 and the default weight decay of 0.0.
+>>> # The optimizer will iterate over the parameters in the order given by 'order_params'.
+>>> # Parameters that are in 'order_params' but not in any group will use the default learning rate
+>>> # of 0.1 and the default weight decay of 0.0.
 >>>
 >>> loss = nn.SoftmaxCrossEntropyWithLogits()
 >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -13,6 +13,8 @@
 # limitations under the License.
 # ============================================================================
 """Dataset help for minddata dataset"""
+import math
+
 from mindspore._checkparam import check_bool
 from .. import context
 from .parallel_utils import ParallelMode

@@ -104,10 +106,10 @@ class _DatasetIter:
         loop_count = 1
         if hasattr(dataset, '__loop_size__'):
             loop_size = dataset.__loop_size__
-            if dataset.get_dataset_size() % loop_size != 0:
+            if loop_size <= dataset.get_dataset_size() and dataset.get_dataset_size() % loop_size != 0:
                 raise ValueError(f'Dataset size {dataset.get_dataset_size()} and '
                                  f'loop_size {loop_size} are not matched.')
-            loop_count = int(dataset.get_dataset_size() / loop_size)
+            loop_count = math.ceil(dataset.get_dataset_size() / loop_size)
         return loop_count
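The two changes above cooperate: the size check is skipped when `loop_size` exceeds the dataset size, and `math.ceil` then yields a loop count of at least 1 instead of truncating toward 0. A small standalone sketch with illustrative values (not the actual _DatasetIter class):

    import math

    def loop_count(dataset_size, loop_size):
        if loop_size <= dataset_size and dataset_size % loop_size != 0:
            raise ValueError(f'Dataset size {dataset_size} and loop_size {loop_size} are not matched.')
        return math.ceil(dataset_size / loop_size)

    print(loop_count(64, 32))  # 2, unchanged behavior
    print(loop_count(10, 32))  # 1: previously this raised (10 % 32 != 0), and int(10 / 32) would have truncated to 0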
@@ -60,8 +60,9 @@ def test_group_lr():
     default_lr = 0.1
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
     no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'params': no_conv_params}]
+    group_params = [{'params': no_conv_params},
+                    {'params': conv_params, 'lr': conv_lr},
+                    {'order_params': net.trainable_params()}]
     net.set_train()
     loss = nn.SoftmaxCrossEntropyWithLogits()

@@ -69,12 +70,15 @@ def test_group_lr():
     assert opt.is_group is True
     assert opt.is_group_lr is True
     assert opt.dynamic_lr is False
-    for lr, param in zip(opt.learning_rate, opt.parameters):
+    assert opt.is_group_params_ordered is True
+    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
         else:
             assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
+        assert param.name == order_param.name

     net_with_loss = WithLossCell(net, loss)
     train_network = TrainOneStepCell(net_with_loss, opt)
     _executor.compile(train_network, inputs, label)
@@ -89,20 +93,24 @@ def test_group_dynamic_1():
     default_lr = (0.1, 0.2, 0.3)
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
     no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'params': no_conv_params}]
+    group_params = [{'params': no_conv_params},
+                    {'params': conv_params, 'lr': conv_lr},
+                    {'order_params': net.trainable_params()}]
     net.set_train()
     loss = nn.SoftmaxCrossEntropyWithLogits()
     opt = Momentum(group_params, learning_rate=default_lr, momentum=0.9)
     assert opt.is_group is True
     assert opt.dynamic_lr is True
-    for lr, param in zip(opt.learning_rate, opt.parameters):
+    assert opt.is_group_params_ordered is True
+    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(np.array([conv_lr] * 3).astype(np.float32)).asnumpy())
         else:
             assert np.all(lr.data.asnumpy() == Tensor(np.array(list(default_lr)).astype(np.float32)).asnumpy())
+        assert param.name == order_param.name

     net_with_loss = WithLossCell(net, loss)
     train_network = TrainOneStepCell(net_with_loss, opt)
     _executor.compile(train_network, inputs, label)
@@ -127,9 +135,9 @@ def test_group_dynamic_2():
     assert opt.dynamic_lr is True
     for lr, param in zip(opt.learning_rate, opt.parameters):
         if param in conv_params:
-            assert np.all(lr.data == Tensor(np.array(list(conv_lr)).astype(np.float32)))
+            assert np.all(lr.data.asnumpy() == Tensor(np.array(list(conv_lr)).astype(np.float32)).asnumpy())
         else:
-            assert np.all(lr.data == Tensor(np.array([default_lr] * 3).astype(np.float32)))
+            assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3).astype(np.float32)).asnumpy())

     net_with_loss = WithLossCell(net, loss)
     train_network = TrainOneStepCell(net_with_loss, opt)
@@ -180,15 +188,18 @@ def test_weight_decay():
     default_weight_decay = 0.0
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
     no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
-                    {'params': no_conv_params}]
+    group_params = [{'params': no_conv_params},
+                    {'params': conv_params, 'weight_decay': conv_weight_decay},
+                    {'order_params': net.trainable_params()}]
     net.set_train()
     loss = nn.SoftmaxCrossEntropyWithLogits()
     opt = SGD(group_params, learning_rate=0.1, weight_decay=default_weight_decay)
     assert opt.is_group is True
     assert opt.is_group_lr is False
-    for weight_decay, decay_flags, param in zip(opt.weight_decay, opt.decay_flags, opt.parameters):
+    assert opt.is_group_params_ordered is True
+    for weight_decay, decay_flags, param, order_param in zip(
+            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
         if param in conv_params:
             assert weight_decay == conv_weight_decay
             assert decay_flags is True

@@ -196,6 +207,8 @@ def test_weight_decay():
             assert weight_decay == default_weight_decay
             assert decay_flags is False
+        assert param.name == order_param.name

     net_with_loss = WithLossCell(net, loss)
     train_network = TrainOneStepCell(net_with_loss, opt)
     _executor.compile(train_network, inputs, label)
@@ -233,6 +246,19 @@ def test_get_lr_parameter_with_group():
         assert lr.name == 'lr_' + param.name

+
+def test_get_lr_parameter_with_order_group():
+    net = LeNet5()
+    conv_lr = 0.1
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'order_params': net.trainable_params()}]
+    opt = SGD(group_params)
+    assert opt.is_group_lr is True
+    for param in opt.parameters:
+        lr = opt.get_lr_parameter(param)
+        assert lr.name == 'lr_' + param.name
+

 def test_get_lr_parameter_with_no_group():
     net = LeNet5()
     conv_weight_decay = 0.8
@@ -250,3 +276,125 @@ def test_get_lr_parameter_with_no_group():
     params_error = [1, 2, 3]
     with pytest.raises(TypeError):
         opt.get_lr_parameter(params_error)
+
+
+def test_order_params_lr():
+    net = LeNet5()
+    conv_lr = 0.01
+    default_lr = 0.1
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'lr': conv_lr},
+                    {'order_params': net.trainable_params()}]
+    opt = SGD(group_params, learning_rate=default_lr)
+    assert opt.is_group is True
+    assert opt.is_group_lr is True
+    assert opt.is_group_params_ordered is True
+    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
+        if param in conv_params:
+            assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
+        else:
+            assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
+        assert param.name == order_param.name
+        assert lr.name == 'lr_' + param.name
+
+
+def test_order_params_weight_decay():
+    net = LeNet5()
+    conv_weight_decay = 0.01
+    default_wd = 0.0
+    default_lr = 0.1
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
+                    {'order_params': net.trainable_params()}]
+    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
+    assert opt.is_group is True
+    assert opt.is_group_lr is False
+    assert opt.is_group_params_ordered is True
+    assert opt.learning_rate.name == "learning_rate"
+    assert np.all(opt.learning_rate.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
+    for weight_decay, decay_flags, param, order_param in zip(
+            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
+        if param in conv_params:
+            assert weight_decay == conv_weight_decay
+            assert decay_flags is True
+        else:
+            assert weight_decay == default_wd
+            assert decay_flags is False
+        assert param.name == order_param.name
+
+
+def test_order_params_all_1():
+    net = LeNet5()
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'weight_decay': 0.01},
+                    {'params': bias_params, 'lr': 0.01},
+                    {'order_params': net.trainable_params()}]
+    opt = SGD(group_params, learning_rate=0.1, weight_decay=0.0)
+    assert opt.is_group is True
+    assert opt.is_group_lr is True
+    assert opt.is_group_params_ordered is True
+    for weight_decay, decay_flags, lr, param, order_param in zip(
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+        if param in conv_params:
+            assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
+            assert weight_decay == 0.01
+            assert decay_flags is True
+        elif param in bias_params:
+            assert np.all(lr.data.asnumpy() == Tensor(0.01, mstype.float32).asnumpy())
+            assert weight_decay == 0.0
+            assert decay_flags is False
+        else:
+            assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
+            assert weight_decay == 0.0
+            assert decay_flags is False
+        assert param.name == order_param.name
+        assert lr.name == 'lr_' + param.name
+
+
+def test_order_params_all_2():
+    net = LeNet5()
+    conv_weight_decay = 0.01
+    fc1_lr = (0.5, 0.4, 0.3)
+    default_lr = 0.1
+    default_wd = 0.0
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    fc1_params = list(filter(lambda x: 'fc1' in x.name, net.trainable_params()))
+    group_params = [{'params': fc1_params, 'lr': fc1_lr},
+                    {'params': conv_params, 'weight_decay': conv_weight_decay},
+                    {'order_params': net.trainable_params()}]
+    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
+    assert opt.is_group is True
+    assert opt.is_group_lr is True
+    assert opt.is_group_params_ordered is True
+    for weight_decay, decay_flags, lr, param, order_param in zip(
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+        if param in conv_params:
+            assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
+            assert weight_decay == conv_weight_decay
+            assert decay_flags is True
+        elif param in fc1_params:
+            assert np.all(lr.data.asnumpy() == Tensor(fc1_lr, mstype.float32).asnumpy())
+            assert weight_decay == default_wd
+            assert decay_flags is False
+        else:
+            assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
+            assert weight_decay == default_wd
+            assert decay_flags is False
+        assert param.name == order_param.name
+        assert lr.name == 'lr_' + param.name
+
+
+def test_get_order_params_with_not_include():
+    net = LeNet5()
+    conv_weight_decay = 0.8
+    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
+                    {'order_params': no_conv_params}]
+    with pytest.raises(ValueError):
+        SGD(group_params)
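For reference, the ValueError expected by test_get_order_params_with_not_include is consistent with the `params_name.index(param.name)` lookup in `_order_and_adjust_group_params`: the group contains 'conv' parameters, but 'order_params' only lists the non-conv parameters, and `list.index` raises ValueError for a missing element. A one-line illustration of that behavior (names are hypothetical):

    import pytest
    with pytest.raises(ValueError):
        ['fc1.weight', 'fc1.bias'].index('conv1.weight')  # a grouped name missing from the order list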