import math
import time

import numpy as np

from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import LossMonitor, Callback
from mindspore.common.tensor import Tensor
from mindspore.common import dtype as mstype

class MyLossMonitor(LossMonitor):
    """Loss monitor that prints the mean loss over windows of 100 steps."""

    def __init__(self, per_print_times=1):
        super(MyLossMonitor, self).__init__()
        self._per_print_times = per_print_times
        self._start_time = time.time()
        self._loss_list = []

    def step_end(self, run_context):
        """Collect the step loss and print the running mean every 100 steps."""
        cb_params = run_context.original_args()

        loss = cb_params.net_outputs

        # The training cell may return a tuple such as (loss, overflow, sens);
        # keep only the loss value.
        if isinstance(loss, (tuple, list)):
            if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                loss = loss[0]

        if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
            # Convert to a Python float so the NaN/Inf check below actually
            # triggers (np.float32 is not a subclass of float).
            loss = float(np.mean(loss.asnumpy()))

        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
                cb_params.cur_epoch_num, cur_step_in_epoch))
        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
            self._loss_list.append(loss)
            if cb_params.cur_step_num % 100 == 0:
                print("epoch: %s, steps: [%s], mean loss is: %s" % (cb_params.cur_epoch_num, cur_step_in_epoch,
                                                                    np.array(self._loss_list).mean()), flush=True)
                self._loss_list = []

        self._start_time = time.time()

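# Usage sketch (an added illustration, not part of the original pipeline): the
# monitor is passed to Model.train() like any other MindSpore callback. `net`,
# `loss_fn`, `opt` and `dataset` are hypothetical placeholders here.
#
#   from mindspore import Model
#   model = Model(net, loss_fn=loss_fn, optimizer=opt)
#   model.train(epoch=60, train_dataset=dataset,
#               callbacks=[MyLossMonitor(per_print_times=1)])
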
class MyScaleSensCallback(Callback):
    """Callback that updates the loss scale at the epoch boundaries in epoch_list."""

    def __init__(self, loss_scale_list, epoch_list):
        super(MyScaleSensCallback, self).__init__()
        # loss_scale_list is expected to hold one more entry than epoch_list:
        # training starts at loss_scale_list[0], and reaching epoch_list[i]
        # switches to loss_scale_list[i + 1].
        self.loss_scale_list = loss_scale_list
        self.epoch_list = epoch_list
        self.scaling_sens = loss_scale_list[0]

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        epoch = cb_params.cur_epoch_num

        for i, _ in enumerate(self.epoch_list):
            if epoch >= self.epoch_list[i]:
                self.scaling_sens = self.loss_scale_list[i + 1]
            else:
                break

        scaling_sens_tensor = Tensor(self.scaling_sens, dtype=mstype.float32)
        cb_params.train_network.set_sense_scale(scaling_sens_tensor)
        print("Epoch: set train network scale sens to {}".format(self.scaling_sens))


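# Construction sketch (illustrative values, added): training starts at scale
# 16384; after epoch 40 it drops to 8192 and after epoch 60 to 4096. Note the
# one-extra-element convention described above.
#
#   scale_cb = MyScaleSensCallback(loss_scale_list=[16384, 8192, 4096],
#                                  epoch_list=[40, 60])
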
def _linear_warmup_learning_rate(current_step, warmup_steps, base_lr, init_lr):
    """Linearly ramp the learning rate from init_lr to base_lr over warmup_steps."""
    lr_inc = (float(base_lr) - float(init_lr)) / float(warmup_steps)
    learning_rate = float(init_lr) + lr_inc * current_step
    return learning_rate

def _a_cosine_learning_rate(current_step, base_lr, warmup_steps, decay_steps):
    """Half-cosine decay from base_lr, with progress measured from the end of warmup.

    Note: _dynamic_lr passes decay_steps=total_steps, so the cosine does not
    quite reach zero at the final step.
    """
    base = float(current_step - warmup_steps) / float(decay_steps)
    learning_rate = (1 + math.cos(base * math.pi)) / 2 * base_lr
    return learning_rate

def _dynamic_lr(base_lr, total_steps, warmup_steps, warmup_ratio=1 / 3):
    """Build a per-step schedule: linear warmup followed by half-cosine decay."""
    lr = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr.append(_linear_warmup_learning_rate(i, warmup_steps, base_lr, base_lr * warmup_ratio))
        else:
            lr.append(_a_cosine_learning_rate(i, base_lr, warmup_steps, total_steps))

    return lr

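# A small runnable check (added illustration): with 10 warmup steps out of 100
# and the default warmup_ratio of 1/3, the schedule ramps from base_lr / 3 up
# to base_lr, then follows the half-cosine down toward (but not exactly to) zero.
def _demo_dynamic_lr():
    lr = _dynamic_lr(base_lr=1e-3, total_steps=100, warmup_steps=10)
    print('warmup start -> end: %.2e -> %.2e' % (lr[0], lr[10]))
    print('final step:          %.2e' % lr[99])
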
def get_lr(lr, lr_gamma, steps_per_epoch, max_epoch_train, lr_steps, group_size, lr_type='default', warmup_epoch=5):
    """Build per-step schedules for the three parameter groups (stage, base, vgg)."""
    if lr_type == 'default':
        # Piecewise-constant schedule: multiply by lr_gamma at every milestone step.
        lr_stage = np.array([lr] * steps_per_epoch * max_epoch_train).astype('f')
        for step in lr_steps:
            step //= group_size
            lr_stage[step:] *= lr_gamma
    elif lr_type == 'cosine':
        lr_stage = _dynamic_lr(lr, steps_per_epoch * max_epoch_train, warmup_epoch * steps_per_epoch,
                               warmup_ratio=1 / 3)
        lr_stage = np.array(lr_stage).astype('f')
    else:
        raise ValueError("lr type {} is not supported.".format(lr_type))

    # The base network trains at a quarter of the stage learning rate, and the
    # VGG layers additionally stay frozen (lr = 0) for the first
    # 2000 // group_size steps.
    lr_base = lr_stage / 4
    lr_vgg = lr_base.copy()
    vgg_freeze_step = 2000 // group_size
    lr_vgg[:vgg_freeze_step] = 0

    return lr_stage, lr_base, lr_vgg


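# Runnable illustration (added; the numbers are arbitrary): with the 'default'
# schedule, lr_stage drops by lr_gamma at each milestone in lr_steps, lr_base
# runs at a quarter of lr_stage, and lr_vgg is zero during the freeze window.
def _demo_get_lr():
    lr_stage, lr_base, lr_vgg = get_lr(lr=1e-4, lr_gamma=0.1, steps_per_epoch=1000,
                                       max_epoch_train=5, lr_steps=[3000], group_size=1)
    print('stage lr around the milestone:', lr_stage[2999], lr_stage[3000])
    print('base / stage ratio:', lr_base[0] / lr_stage[0])
    print('vgg lr frozen then active:', lr_vgg[1999], lr_vgg[2000])
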
def load_model(test_net, model_path):
    """Load a checkpoint into test_net, skipping optimizer state.

    Keys starting with 'moment' (optimizer moments) are dropped, the
    'network.' prefix added by the training wrapper is stripped, and any
    other keys are ignored.
    """
    if model_path:
        param_dict = load_checkpoint(model_path)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moment'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values

        load_param_into_net(test_net, param_dict_new)


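# Usage sketch (hypothetical names and path): any Cell can be passed, as long
# as the checkpoint comes from a training run whose wrapper prefixed parameter
# names with 'network.'.
#
#   test_net = OpenPoseNet()                      # hypothetical network class
#   load_model(test_net, '/path/to/openpose.ckpt')
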
class show_loss_list():
    """Accumulate the six per-stage losses and print their running means."""

    def __init__(self, name):
        self.loss_list = np.zeros(6).astype('f')
        self.sums = 0
        self.name = name

    def add(self, list_of_tensor):
        self.sums += 1
        for i, loss_tensor in enumerate(list_of_tensor):
            self.loss_list[i] += loss_tensor.asnumpy()

    def show(self):
        print(self.name + ' stage_loss:', self.loss_list / (self.sums + 1e-8), flush=True)
        self.loss_list = np.zeros(6).astype('f')
        self.sums = 0


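# Runnable illustration (added): accumulate two lists of six per-stage scalar
# losses and print their means; show() also resets the accumulator.
def _demo_show_loss_list():
    meter = show_loss_list('train')
    meter.add([Tensor(float(i), mstype.float32) for i in range(6)])
    meter.add([Tensor(float(i * 2), mstype.float32) for i in range(6)])
    meter.show()  # mean of the two updates for each of the six stages
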
class AverageMeter():
    """Running mean of scalar loss Tensors; reading meter() resets the state."""

    def __init__(self):
        self.loss = 0
        self.sum = 0

    def add(self, tensor):
        self.sum += 1
        self.loss += tensor.asnumpy()

    def meter(self):
        average_loss = self.loss / (self.sum + 1e-8)
        self.loss = 0
        self.sum = 0
        return average_loss
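

# Runnable illustration (added): the meter averages scalar Tensors and clears
# itself once meter() is read.
def _demo_average_meter():
    meter = AverageMeter()
    meter.add(Tensor(1.0, mstype.float32))
    meter.add(Tensor(3.0, mstype.float32))
    print(meter.meter())  # ~2.0; the meter is now reset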