@@ -327,12 +327,17 @@ class AdamWeightDecayDynamicLR(Optimizer):
                  beta2=0.999,
                  eps=1e-6,
                  weight_decay=0.0,
-                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
+                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name,
+                 warmup_steps=0):
         super(AdamWeightDecayDynamicLR, self).__init__(learning_rate, params)
         _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
         _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, self.cls_name)
         # turn them to scalar when me support scalar/tensor mix operations
         self.global_step = Parameter(initializer(0, [1]), name="global_step")
+        self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32))
+        self.warmup_flag = False
+        if warmup_steps > 0:
+            self.warmup_flag = True
         self.decay_steps = Tensor(np.array([decay_steps]).astype(np.float32))
         self.end_learning_rate = Tensor(np.array([end_learning_rate]).astype(np.float32))
         self.diff_learning_rate = Tensor(np.array([learning_rate - end_learning_rate]).astype(np.float32))
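
The first hunk only threads the new warmup_steps argument through the constructor and records it as a float32 tensor plus a warmup_flag bool; passing warmup_steps=0 leaves the flag False, so existing callers are unaffected. A minimal usage sketch, assuming the leading parameters (params, decay_steps, learning_rate, end_learning_rate) keep the names seen in the checks above, that net is an existing Cell, and that the import path below is correct; none of this appears in the patch itself:

# Hypothetical usage sketch, not part of the patch.
from mindspore import nn

optimizer = nn.AdamWeightDecayDynamicLR(net.trainable_params(),
                                        decay_steps=10000,
                                        learning_rate=5e-5,
                                        end_learning_rate=1e-7,
                                        warmup_steps=1000)
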
@@ -348,12 +353,20 @@ class AdamWeightDecayDynamicLR(Optimizer):
         self.hyper_map = C.HyperMap()
         self.min = P.Minimum()
         self.pow = P.Pow()
+        self.greater = P.Greater()
         self.one = Tensor(np.array([1.0]).astype(np.float32))
+        self.cast = P.Cast()
+        self.start_learning_rate = Tensor(np.array([learning_rate]).astype(np.float32))
 
     def construct(self, gradients):
         step = self.min(self.global_step, self.decay_steps)
         p = step / self.decay_steps
         lr = self.diff_learning_rate * self.pow(self.one - p, self.power) + self.end_learning_rate
+        if self.warmup_flag:
+            warmup_percent = self.global_step / self.warmup_steps
+            warmup_lr = self.start_learning_rate * warmup_percent
+            is_warmup = self.cast(self.greater(self.warmup_steps, self.global_step), mstype.float32)
+            lr = (self.one - is_warmup) * lr + is_warmup * warmup_lr
         updated_velocity = self.hyper_map(F.partial(adam_opt, self.beta1, self.beta2, self.eps, lr,
                                                     self.weight_decay_tensor),
                                           self.params, self.moments1, self.moments2, gradients, self.decay_flag)
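
In the second hunk the warmup is expressed as arithmetic rather than tensor control flow: is_warmup is cast to 1.0 while warmup_steps > global_step and to 0.0 afterwards, so lr blends the linear warmup rate during warmup with the polynomial-decay rate afterwards. A plain-Python sketch of the resulting schedule, with the graph ops replaced by scalar arithmetic (illustrative only, not MindSpore code):

# Scalar re-statement of the schedule computed in construct() above.
def dynamic_lr(global_step, learning_rate, end_learning_rate, power,
               decay_steps, warmup_steps):
    # Polynomial decay from learning_rate to end_learning_rate, clamped at decay_steps.
    step = min(global_step, decay_steps)
    p = step / decay_steps
    lr = (learning_rate - end_learning_rate) * (1.0 - p) ** power + end_learning_rate
    if warmup_steps > 0:
        # Linear warmup; mirrors the cast(greater(...)) select in the graph code.
        warmup_lr = learning_rate * (global_step / warmup_steps)
        is_warmup = 1.0 if warmup_steps > global_step else 0.0
        lr = (1.0 - is_warmup) * lr + is_warmup * warmup_lr
    return lr

Because the comparison is strict (Greater), the step at which global_step equals warmup_steps already uses the decayed rate, matching the graph version above.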