Merge pull request !2748 from ghzl/change-order-params-only-equal-to-group-param (tags/v0.6.0-beta)
@@ -181,8 +181,7 @@ class Adam(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -220,16 +219,14 @@ class Adam(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
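Read together, the two Adam hunks tighten the 'order_params' contract: parameters listed under 'order_params' can no longer silently fall back to the API defaults, so the groups must jointly cover everything in the ordered list. A minimal sketch of the resulting usage, assuming a `net` like the one in the docstring examples; the snippet is illustrative and not part of the patch:

import mindspore.nn as nn

# `net` stands in for any Cell with trainable parameters, as in the
# docstring examples; it is assumed here, not defined by the patch.
conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))

# Old behavior: a parameter listed in 'order_params' but absent from every
# 'params' group quietly used the default lr / weight_decay.
# New behavior: the 'params' groups must cover every entry of 'order_params',
# which is why the updated example adds the no_conv_params group.
group_params = [{'params': conv_params, 'weight_decay': 0.01},
                {'params': no_conv_params, 'lr': 0.01},
                {'order_params': net.trainable_params()}]
opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)

# Omitting no_conv_params from the groups while keeping the full order list
# would now raise a ValueError instead of applying defaults.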
@@ -109,6 +109,10 @@ class LazyAdam(Optimizer):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
+            - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
+              the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
             use dynamic learning rate, then the i-th step will
@@ -146,12 +150,13 @@ class LazyAdam(Optimizer):
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
         >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
-        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
-        >>>                 {'params': no_conv_params}]
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
+        >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.LazyAdam(group_params, learning_rate=0.1, weight_decay=0.0)
-        >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
-        >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
-        >>> # learning rate of 0.1 and a weight decay of 0.0.
+        >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -64,8 +64,7 @@ class Momentum(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -97,16 +96,14 @@ class Momentum(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
@@ -77,8 +77,7 @@ class Optimizer(Cell):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
             If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
@@ -351,16 +350,18 @@ class Optimizer(Cell):
                 self.group_weight_decay.append(weight_decay_)
 
         if self.is_group_params_ordered:
-            self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay)
+            self._order_and_adjust_group_params(ordered_parameters)
 
-    def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay):
+    def _order_and_adjust_group_params(self, ordered_parameters):
         """
-        Order group parameter, learning rate and weight decay in group params. And assign the parameters
-        which in the value of 'order_params' but not in any group to default value.
+        Order group parameter, learning rate and weight decay in group params.
         """
-        params_length = len(ordered_parameters)
-        ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters]
-        ordered_weight_decay = [weight_decay * self.loss_scale] * params_length
+        params_length = len(self.group_params)
+        if len(ordered_parameters) != len(self.group_params):
+            raise ValueError(f"The value of 'order_params' should be same with all group parameters.")
+        ordered_learning_rate = [None] * params_length
+        ordered_weight_decay = [None] * params_length
         params_name = [param.name for param in ordered_parameters]
         for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay):
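From the visible fragment, the rewritten method appears to reorder the grouped learning rates and weight decays purely by parameter-name matching, with a hard length check replacing the old fill-with-defaults path. A simplified, framework-free sketch under that reading; the function name, plain lists, and namedtuple parameters are all illustrative assumptions, not the real method:

from collections import namedtuple

def order_and_adjust(group_params, group_lr, group_wd, ordered_parameters):
    """Reorder per-parameter lr/wd settings to follow `ordered_parameters`."""
    if len(ordered_parameters) != len(group_params):
        # New behavior: 'order_params' must cover exactly the grouped
        # parameters; there is no longer a default-value fallback.
        raise ValueError("The value of 'order_params' should be same "
                         "with all group parameters.")
    names = [p.name for p in ordered_parameters]
    params = [None] * len(names)
    lrs = [None] * len(names)
    wds = [None] * len(names)
    for param, lr, wd in zip(group_params, group_lr, group_wd):
        idx = names.index(param.name)  # target position in the desired order
        params[idx], lrs[idx], wds[idx] = param, lr, wd
    return params, lrs, wds

# Tiny usage demo with stand-in parameter objects:
P = namedtuple("P", "name")
ps = [P("conv.weight"), P("fc.weight")]
print(order_and_adjust(ps, [0.1, 0.01], [0.0, 0.0], [ps[1], ps[0]]))
# fc.weight (lr 0.01) now comes first, matching the requested order.

The `None` placeholders mirror the patch: every slot must be filled by some group's parameter, so an uncovered slot can no longer silently keep a default value.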
@@ -107,8 +107,7 @@ class RMSProp(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -140,16 +139,14 @@ class RMSProp(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -64,8 +64,7 @@ class SGD(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -98,16 +97,14 @@ class SGD(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -250,8 +250,9 @@ def test_get_lr_parameter_with_order_group():
     net = LeNet5()
     conv_lr = 0.1
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
     group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'order_params': net.trainable_params()}]
+                    {'params': no_conv_params}]
     opt = SGD(group_params)
     assert opt.is_group_lr is True
     for param in opt.parameters:
@@ -278,65 +279,19 @@ def test_get_lr_parameter_with_no_group():
         opt.get_lr_parameter(params_error)
 
 
-def test_order_params_lr():
-    net = LeNet5()
-    conv_lr = 0.01
-    default_lr = 0.1
-    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'order_params': net.trainable_params()}]
-    opt = SGD(group_params, learning_rate=default_lr)
-    assert opt.is_group is True
-    assert opt.is_group_lr is True
-    assert opt.is_group_params_ordered is True
-    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
-        if param in conv_params:
-            assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
-        else:
-            assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
-        assert param.name == order_param.name
-        assert lr.name == 'lr_' + param.name
-
-
-def test_order_params_weight_decay():
-    net = LeNet5()
-    conv_weight_decay = 0.01
-    default_wd = 0.0
-    default_lr = 0.1
-    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
-                    {'order_params': net.trainable_params()}]
-    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
-    assert opt.is_group is True
-    assert opt.is_group_lr is False
-    assert opt.is_group_params_ordered is True
-    assert opt.learning_rate.name == "learning_rate"
-    assert np.all(opt.learning_rate.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
-    for weight_decay, decay_flags, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
-        if param in conv_params:
-            assert weight_decay == conv_weight_decay
-            assert decay_flags is True
-        else:
-            assert weight_decay == default_wd
-            assert decay_flags is False
-        assert param.name == order_param.name
-
-
-def test_order_params_all_1():
+def test_order_params_1():
     net = LeNet5()
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
     bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
     group_params = [{'params': conv_params, 'weight_decay': 0.01},
                     {'params': bias_params, 'lr': 0.01},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': bias_params+conv_params}]
     opt = SGD(group_params, learning_rate=0.1, weight_decay=0.0)
     assert opt.is_group is True
     assert opt.is_group_lr is True
     assert opt.is_group_params_ordered is True
     for weight_decay, decay_flags, lr, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, bias_params+conv_params):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
             assert weight_decay == 0.01
@@ -354,7 +309,7 @@ def test_order_params_all_1():
         assert lr.name == 'lr_' + param.name
 
 
-def test_order_params_all_2():
+def test_order_params_2():
     net = LeNet5()
     conv_weight_decay = 0.01
     fc1_lr = (0.5, 0.4, 0.3)
@@ -364,13 +319,13 @@ def test_order_params_all_2():
     fc1_params = list(filter(lambda x: 'fc1' in x.name, net.trainable_params()))
     group_params = [{'params': fc1_params, 'lr': fc1_lr},
                     {'params': conv_params, 'weight_decay': conv_weight_decay},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': fc1_params+conv_params}]
     opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
     assert opt.is_group is True
     assert opt.is_group_lr is True
     assert opt.is_group_params_ordered is True
     for weight_decay, decay_flags, lr, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, fc1_params+conv_params):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
             assert weight_decay == conv_weight_decay
@@ -388,7 +343,7 @@ def test_order_params_all_2():
         assert lr.name == 'lr_' + param.name
 
 
-def test_get_order_params_with_not_include():
+def test_get_order_params_with_not_same():
     net = LeNet5()
     conv_weight_decay = 0.8
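The final hunk is truncated here, but given the ValueError introduced in _order_and_adjust_group_params, the renamed test presumably asserts the new failure mode. A hedged sketch of that case; the body below is an assumption based on the surrounding tests, not the actual test body:

import pytest

def sketch_order_params_not_same():
    # Illustrative only: LeNet5 and SGD are the same names used by the
    # tests above; the weight_decay value 0.8 comes from the truncated hunk.
    net = LeNet5()
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    # The groups cover only the conv parameters while 'order_params' lists
    # every trainable parameter, so the new length check should trip.
    group_params = [{'params': conv_params, 'weight_decay': 0.8},
                    {'order_params': net.trainable_params()}]
    with pytest.raises(ValueError):
        SGD(group_params)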