@@ -327,12 +327,17 @@ class AdamWeightDecayDynamicLR(Optimizer):
                  beta2=0.999,
                  eps=1e-6,
                  weight_decay=0.0,
-                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
+                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name,
+                 warmup_steps=0):
         super(AdamWeightDecayDynamicLR, self).__init__(learning_rate, params)
         _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
         _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, self.cls_name)
         # turn them to scalar when me support scalar/tensor mix operations
         self.global_step = Parameter(initializer(0, [1]), name="global_step")
+        self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32))
+        self.warmup_flag = False
+        if warmup_steps > 0:
+            self.warmup_flag = True
         self.decay_steps = Tensor(np.array([decay_steps]).astype(np.float32))
         self.end_learning_rate = Tensor(np.array([end_learning_rate]).astype(np.float32))
         self.diff_learning_rate = Tensor(np.array([learning_rate - end_learning_rate]).astype(np.float32))
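
For reference, the new `warmup_steps` argument would be supplied when the optimizer is constructed, roughly as in the minimal sketch below. This is an illustration only: `net`, the step counts, and the learning-rate values are placeholders, and the import path is an assumption rather than something shown in this diff.

```python
# Hypothetical usage sketch -- only AdamWeightDecayDynamicLR and the new
# warmup_steps argument come from this diff; everything else is a placeholder.
from mindspore.nn.optim import AdamWeightDecayDynamicLR

optimizer = AdamWeightDecayDynamicLR(net.trainable_params(),  # `net` is an existing Cell
                                     decay_steps=10000,
                                     learning_rate=5e-5,
                                     end_learning_rate=1e-7,
                                     power=1.0,
                                     warmup_steps=1000)  # new: linear warmup over the first 1000 steps
```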
@@ -348,12 +353,20 @@ class AdamWeightDecayDynamicLR(Optimizer):
         self.hyper_map = C.HyperMap()
         self.min = P.Minimum()
         self.pow = P.Pow()
+        self.greater = P.Greater()
         self.one = Tensor(np.array([1.0]).astype(np.float32))
+        self.cast = P.Cast()
+        self.start_learning_rate = Tensor(np.array([learning_rate]).astype(np.float32))

     def construct(self, gradients):
         step = self.min(self.global_step, self.decay_steps)
         p = step / self.decay_steps
         lr = self.diff_learning_rate * self.pow(self.one - p, self.power) + self.end_learning_rate
+        if self.warmup_flag:
+            warmup_percent = self.global_step / self.warmup_steps
+            warmup_lr = self.start_learning_rate * warmup_percent
+            is_warmup = self.cast(self.greater(self.warmup_steps, self.global_step), mstype.float32)
+            lr = (self.one - is_warmup) * lr + is_warmup * warmup_lr
         updated_velocity = self.hyper_map(F.partial(adam_opt, self.beta1, self.beta2, self.eps, lr,
                                                     self.weight_decay_tensor),
                                           self.params, self.moments1, self.moments2, gradients, self.decay_flag)
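
The effect of the added `construct` lines is easier to check outside of graph mode. The following plain-Python sketch (illustrative names, not part of the change) computes the same schedule: linear warmup toward `learning_rate` while `global_step < warmup_steps`, then the existing polynomial decay toward `end_learning_rate`.

```python
# Plain-Python sketch of the learning-rate schedule that construct() computes.
# Names are illustrative; only the arithmetic mirrors the diff above.
def dynamic_lr(global_step, learning_rate, end_learning_rate,
               decay_steps, power, warmup_steps):
    # Polynomial decay toward end_learning_rate, held constant after decay_steps.
    step = min(global_step, decay_steps)
    p = step / decay_steps
    lr = (learning_rate - end_learning_rate) * (1.0 - p) ** power + end_learning_rate
    if warmup_steps > 0:
        # Linear warmup: ramp from 0 toward learning_rate over the first warmup_steps steps.
        warmup_lr = learning_rate * (global_step / warmup_steps)
        # Hard 0/1 mask, matching the P.Greater + P.Cast pair in the diff.
        is_warmup = 1.0 if global_step < warmup_steps else 0.0
        lr = (1.0 - is_warmup) * lr + is_warmup * warmup_lr
    return lr
```

Because `P.Greater` is strict, the warmup branch stops contributing exactly at `global_step == warmup_steps`, where the decayed rate takes over; the blend is a hard switch rather than an interpolation between the two schedules.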