Merge pull request !2748 from ghzl/change-order-params-only-equal-to-group-param (tags/v0.6.0-beta)
@@ -181,8 +181,7 @@ class Adam(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -220,16 +219,14 @@ class Adam(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
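Read together, the two Adam hunks tighten the 'order_params' contract: parameters listed under 'order_params' can no longer silently fall back to the API defaults, so the groups must jointly cover everything in the ordered list. A minimal sketch of the resulting usage, assuming a `net` like the one in the docstring examples; the snippet is illustrative and not part of the patch:

import mindspore.nn as nn

# `net` stands in for any Cell with trainable parameters, as in the
# docstring examples; it is assumed here, not defined by the patch.
conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))

# Old behavior: a parameter listed in 'order_params' but absent from every
# 'params' group quietly used the default lr / weight_decay.
# New behavior: the 'params' groups must cover every entry of 'order_params',
# which is why the updated example adds the no_conv_params group.
group_params = [{'params': conv_params, 'weight_decay': 0.01},
                {'params': no_conv_params, 'lr': 0.01},
                {'order_params': net.trainable_params()}]
opt = nn.Adam(group_params, learning_rate=0.1, weight_decay=0.0)

# Omitting no_conv_params from the groups while keeping the full order list
# would now raise a ValueError instead of applying defaults.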
@@ -109,6 +109,10 @@ class LazyAdam(Optimizer):
             - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
               will be used. If not, the `weight_decay` in the API will be used.
+            - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
+              the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
             use dynamic learning rate, then the i-th step will
@@ -146,12 +150,13 @@ class LazyAdam(Optimizer):
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
         >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
-        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01, 'lr': 0.01},
-        >>>                 {'params': no_conv_params}]
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
+        >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.LazyAdam(group_params, learning_rate=0.1, weight_decay=0.0)
-        >>> # the conv_params's parameters will use a learning rate of 0.01 and a weight decay of 0.01
-        >>> # the no_cov_params's parameters don't set learning and weight decay. So they will use a
-        >>> # learning rate of 0.1 and a weight decay of 0.0.
+        >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -64,8 +64,7 @@ class Momentum(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -97,16 +96,14 @@ class Momentum(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
@@ -77,8 +77,7 @@ class Optimizer(Cell):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
             If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
@@ -351,16 +350,18 @@ class Optimizer(Cell):
                 self.group_weight_decay.append(weight_decay_)
 
         if self.is_group_params_ordered:
-            self._order_and_adjust_group_params(ordered_parameters, learning_rate, weight_decay)
+            self._order_and_adjust_group_params(ordered_parameters)
 
-    def _order_and_adjust_group_params(self, ordered_parameters, learning_rate, weight_decay):
+    def _order_and_adjust_group_params(self, ordered_parameters):
         """
-        Order group parameter, learning rate and weight decay in group params. And assign the parameters
-        which in the value of 'order_params' but not in any group to default value.
+        Order group parameter, learning rate and weight decay in group params.
         """
-        params_length = len(ordered_parameters)
-        ordered_learning_rate = [Parameter(learning_rate, name="lr_" + param.name) for param in ordered_parameters]
-        ordered_weight_decay = [weight_decay * self.loss_scale] * params_length
+        params_length = len(self.group_params)
+        if len(ordered_parameters) != len(self.group_params):
+            raise ValueError(f"The value of 'order_params' should be same with all group parameters.")
+        ordered_learning_rate = [None] * params_length
+        ordered_weight_decay = [None] * params_length
         params_name = [param.name for param in ordered_parameters]
         for param, lr, wd in zip(self.group_params, self.group_lr, self.group_weight_decay):
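From the visible fragment, the rewritten method appears to reorder the grouped learning rates and weight decays purely by parameter-name matching, with a hard length check replacing the old fill-with-defaults path. A simplified, framework-free sketch under that reading; the function name, plain lists, and namedtuple parameters are all illustrative assumptions, not the real method:

from collections import namedtuple

def order_and_adjust(group_params, group_lr, group_wd, ordered_parameters):
    """Reorder per-parameter lr/wd settings to follow `ordered_parameters`."""
    if len(ordered_parameters) != len(group_params):
        # New behavior: 'order_params' must cover exactly the grouped
        # parameters; there is no longer a default-value fallback.
        raise ValueError("The value of 'order_params' should be same "
                         "with all group parameters.")
    names = [p.name for p in ordered_parameters]
    params = [None] * len(names)
    lrs = [None] * len(names)
    wds = [None] * len(names)
    for param, lr, wd in zip(group_params, group_lr, group_wd):
        idx = names.index(param.name)  # target position in the desired order
        params[idx], lrs[idx], wds[idx] = param, lr, wd
    return params, lrs, wds

# Tiny usage demo with stand-in parameter objects:
P = namedtuple("P", "name")
ps = [P("conv.weight"), P("fc.weight")]
print(order_and_adjust(ps, [0.1, 0.01], [0.0, 0.0], [ps[1], ps[0]]))
# fc.weight (lr 0.01) now comes first, matching the requested order.

The `None` placeholders mirror the patch: every slot must be filled by some group's parameter, so an uncovered slot can no longer silently keep a default value.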
@@ -107,8 +107,7 @@ class RMSProp(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -140,16 +139,14 @@ class RMSProp(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.RMSProp(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -64,8 +64,7 @@ class SGD(Optimizer):
             - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
               the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
-              in the value of 'order_params' but not in any group will use default learning rate and default weight
-              decay.
+              in the value of 'order_params' should be in one of group parameters.
 
         learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is
             Iterable or a Tensor and the dims of the Tensor is 1,
@@ -98,16 +97,14 @@ class SGD(Optimizer):
         >>>
         >>> #2) Use parameter groups and set different values
         >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-        >>> bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
         >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
-        >>>                 {'params': bias_params, 'lr': 0.01},
+        >>>                 {'params': no_conv_params, 'lr': 0.01},
         >>>                 {'order_params': net.trainable_params()}]
         >>> opt = nn.SGD(group_params, learning_rate=0.1, weight_decay=0.0)
         >>> # The conv_params's parameters will use a learning rate of default value 0.1 and a weight decay of 0.01.
-        >>> # The bias_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
+        >>> # The no_conv_params's parameters will use a learning rate of 0.01 and a weight decay of default value 0.0.
         >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
-        >>> # The parameters which in the value of 'order_params' but not in any group will use a learning rate
-        >>> # of default value 0.1 and a weight decay of default value 0.0.
         >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
         >>> model = Model(net, loss_fn=loss, optimizer=optim)
@@ -250,8 +250,9 @@ def test_get_lr_parameter_with_order_group():
     net = LeNet5()
     conv_lr = 0.1
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+    no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
     group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'order_params': net.trainable_params()}]
+                    {'params': no_conv_params}]
     opt = SGD(group_params)
     assert opt.is_group_lr is True
     for param in opt.parameters:
@@ -278,65 +279,19 @@ def test_get_lr_parameter_with_no_group():
         opt.get_lr_parameter(params_error)
 
 
-def test_order_params_lr():
-    net = LeNet5()
-    conv_lr = 0.01
-    default_lr = 0.1
-    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'lr': conv_lr},
-                    {'order_params': net.trainable_params()}]
-    opt = SGD(group_params, learning_rate=default_lr)
-    assert opt.is_group is True
-    assert opt.is_group_lr is True
-    assert opt.is_group_params_ordered is True
-    for lr, param, order_param in zip(opt.learning_rate, opt.parameters, net.trainable_params()):
-        if param in conv_params:
-            assert np.all(lr.data.asnumpy() == Tensor(conv_lr, mstype.float32).asnumpy())
-        else:
-            assert np.all(lr.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
-        assert param.name == order_param.name
-        assert lr.name == 'lr_' + param.name
-
-
-def test_order_params_weight_decay():
-    net = LeNet5()
-    conv_weight_decay = 0.01
-    default_wd = 0.0
-    default_lr = 0.1
-    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
-    group_params = [{'params': conv_params, 'weight_decay': conv_weight_decay},
-                    {'order_params': net.trainable_params()}]
-    opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
-    assert opt.is_group is True
-    assert opt.is_group_lr is False
-    assert opt.is_group_params_ordered is True
-    assert opt.learning_rate.name == "learning_rate"
-    assert np.all(opt.learning_rate.data.asnumpy() == Tensor(default_lr, mstype.float32).asnumpy())
-    for weight_decay, decay_flags, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.parameters, net.trainable_params()):
-        if param in conv_params:
-            assert weight_decay == conv_weight_decay
-            assert decay_flags is True
-        else:
-            assert weight_decay == default_wd
-            assert decay_flags is False
-        assert param.name == order_param.name
-
-
-def test_order_params_all_1():
+def test_order_params_1():
     net = LeNet5()
     conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
     bias_params = list(filter(lambda x: 'bias' in x.name, net.trainable_params()))
     group_params = [{'params': conv_params, 'weight_decay': 0.01},
                     {'params': bias_params, 'lr': 0.01},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': bias_params+conv_params}]
     opt = SGD(group_params, learning_rate=0.1, weight_decay=0.0)
     assert opt.is_group is True
     assert opt.is_group_lr is True
     assert opt.is_group_params_ordered is True
     for weight_decay, decay_flags, lr, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, bias_params+conv_params):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(0.1, mstype.float32).asnumpy())
             assert weight_decay == 0.01
@@ -354,7 +309,7 @@ def test_order_params_all_1():
         assert lr.name == 'lr_' + param.name
 
 
-def test_order_params_all_2():
+def test_order_params_2():
     net = LeNet5()
     conv_weight_decay = 0.01
     fc1_lr = (0.5, 0.4, 0.3)
@@ -364,13 +319,13 @@ def test_order_params_all_2():
     fc1_params = list(filter(lambda x: 'fc1' in x.name, net.trainable_params()))
     group_params = [{'params': fc1_params, 'lr': fc1_lr},
                     {'params': conv_params, 'weight_decay': conv_weight_decay},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': fc1_params+conv_params}]
     opt = SGD(group_params, learning_rate=default_lr, weight_decay=default_wd)
     assert opt.is_group is True
     assert opt.is_group_lr is True
     assert opt.is_group_params_ordered is True
     for weight_decay, decay_flags, lr, param, order_param in zip(
-            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, net.trainable_params()):
+            opt.weight_decay, opt.decay_flags, opt.learning_rate, opt.parameters, fc1_params+conv_params):
         if param in conv_params:
             assert np.all(lr.data.asnumpy() == Tensor(np.array([default_lr] * 3), mstype.float32).asnumpy())
             assert weight_decay == conv_weight_decay
@@ -388,7 +343,7 @@ def test_order_params_all_2():
         assert lr.name == 'lr_' + param.name
 
 
-def test_get_order_params_with_not_include():
+def test_get_order_params_with_not_same():
     net = LeNet5()
     conv_weight_decay = 0.8
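The final hunk is truncated here, but given the ValueError introduced in _order_and_adjust_group_params, the renamed test presumably asserts the new failure mode. A hedged sketch of that case; the body below is an assumption based on the surrounding tests, not the actual test body:

import pytest

def sketch_order_params_not_same():
    # Illustrative only: LeNet5 and SGD are the same names used by the
    # tests above; the weight_decay value 0.8 comes from the truncated hunk.
    net = LeNet5()
    conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
    # The groups cover only the conv parameters while 'order_params' lists
    # every trainable parameter, so the new length check should trip.
    group_params = [{'params': conv_params, 'weight_decay': 0.8},
                    {'order_params': net.trainable_params()}]
    with pytest.raises(ValueError):
        SGD(group_params)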