You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

train.py 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Face detection train."""
  16. import os
  17. import time
  18. import datetime
  19. import argparse
  20. import numpy as np
  21. from mindspore import context
  22. from mindspore.train.loss_scale_manager import DynamicLossScaleManager
  23. from mindspore import Tensor
  24. from mindspore.nn import Momentum
  25. from mindspore.communication.management import init, get_rank, get_group_size
  26. from mindspore.context import ParallelMode
  27. from mindspore.train.callback import ModelCheckpoint, RunContext
  28. from mindspore.train.callback import _InternalCallbackParam, CheckpointConfig
  29. from mindspore.train.serialization import load_checkpoint, load_param_into_net
  30. from mindspore.common import dtype as mstype
  31. import mindspore.dataset as de
  32. from src.FaceDetection.yolov3 import HwYolov3 as backbone_HwYolov3
  33. from src.FaceDetection.yolo_loss import YoloLoss
  34. from src.network_define import BuildTrainNetworkV2, TrainOneStepWithLossScaleCell
  35. from src.lrsche_factory import warmup_step_new
  36. from src.logging import get_logger
  37. from src.data_preprocess import compose_map_func
  38. from src.config import config
  39. devid = int(os.getenv('DEVICE_ID'))
  40. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=devid)
  41. def parse_args():
  42. '''parse_args'''
  43. parser = argparse.ArgumentParser('Yolov3 Face Detection')
  44. parser.add_argument('--mindrecord_path', type=str, default='', help='dataset path, e.g. /home/data.mindrecord')
  45. parser.add_argument('--pretrained', type=str, default='', help='pretrained model to load')
  46. parser.add_argument('--local_rank', type=int, default=0, help='current rank to support distributed')
  47. parser.add_argument('--world_size', type=int, default=8, help='current process number to support distributed')
  48. args, _ = parser.parse_known_args()
  49. return args
def train(args):
    '''Train the YOLOv3 face-detection network on Ascend.

    Args:
        args: namespace from parse_args(); the remaining hyper-parameters
            are copied onto it from src.config.config below, and a logger
            is attached as args.logger.
    '''
    print('=============yolov3 start trainging==================')
    # init distributed: when more than one process, take the true
    # rank / group size from the communication framework (HCCL).
    if args.world_size != 1:
        init()
        args.local_rank = get_rank()
        args.world_size = get_group_size()
    # Copy static training hyper-parameters from config onto args so the
    # whole run configuration travels in one namespace.
    args.batch_size = config.batch_size
    args.warmup_lr = config.warmup_lr
    args.lr_rates = config.lr_rates
    args.lr_steps = config.lr_steps
    args.gamma = config.gamma
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.max_epoch = config.max_epoch
    args.log_interval = config.log_interval
    args.ckpt_path = config.ckpt_path
    args.ckpt_interval = config.ckpt_interval
    # One timestamped output directory per run (checkpoints + logs).
    args.outputs_dir = os.path.join(args.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    print('args.outputs_dir', args.outputs_dir)
    args.logger = get_logger(args.outputs_dir, args.local_rank)
    # The lr schedule was tuned for 8 devices: rescale the step boundaries
    # for other world sizes, and drop weight decay entirely when standalone.
    if args.world_size != 8:
        args.lr_steps = [i * 8 // args.world_size for i in args.lr_steps]
    if args.world_size == 1:
        args.weight_decay = 0.
    if args.world_size != 1:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.world_size, gradients_mean=True)
    mindrecord_path = args.mindrecord_path
    num_classes = config.num_classes
    anchors = config.anchors
    anchors_mask = config.anchors_mask
    # Number of anchors assigned to each of the three detection heads.
    num_anchors_list = [len(x) for x in anchors_mask]
    momentum = args.momentum
    args.logger.info('train opt momentum:{}'.format(momentum))
    # Weight decay is scaled by the batch size (loss is summed, not averaged,
    # over the batch in this setup — TODO confirm against YoloLoss).
    weight_decay = args.weight_decay * float(args.batch_size)
    args.logger.info('real weight_decay:{}'.format(weight_decay))
    # Linear lr scaling relative to the 8-device baseline.
    lr_scale = args.world_size / 8
    args.logger.info('lr_scale:{}'.format(lr_scale))
    # dataloader
    args.logger.info('start create dataloader')
    epoch = args.max_epoch
    # Read the first shard file ("<path>0") and split it across ranks.
    ds = de.MindDataset(mindrecord_path + "0", columns_list=["image", "annotation"], num_shards=args.world_size,
                        shard_id=args.local_rank)
    # compose_map_func expands each (image, annotation) pair into the
    # per-head training targets: coordinate/confidence/class masks, target
    # tensors and ground-truth lists for detection heads 0, 1 and 2.
    ds = ds.map(input_columns=["image", "annotation"],
                output_columns=["image", "annotation", 'coord_mask_0', 'conf_pos_mask_0', 'conf_neg_mask_0',
                                'cls_mask_0', 't_coord_0', 't_conf_0', 't_cls_0', 'gt_list_0', 'coord_mask_1',
                                'conf_pos_mask_1', 'conf_neg_mask_1', 'cls_mask_1', 't_coord_1', 't_conf_1',
                                't_cls_1', 'gt_list_1', 'coord_mask_2', 'conf_pos_mask_2', 'conf_neg_mask_2',
                                'cls_mask_2', 't_coord_2', 't_conf_2', 't_cls_2', 'gt_list_2'],
                column_order=["image", "annotation", 'coord_mask_0', 'conf_pos_mask_0', 'conf_neg_mask_0',
                              'cls_mask_0', 't_coord_0', 't_conf_0', 't_cls_0', 'gt_list_0', 'coord_mask_1',
                              'conf_pos_mask_1', 'conf_neg_mask_1', 'cls_mask_1', 't_coord_1', 't_conf_1',
                              't_cls_1', 'gt_list_1', 'coord_mask_2', 'conf_pos_mask_2', 'conf_neg_mask_2',
                              'cls_mask_2', 't_coord_2', 't_conf_2', 't_cls_2', 'gt_list_2'],
                operations=compose_map_func, num_parallel_workers=16, python_multiprocessing=True)
    ds = ds.batch(args.batch_size, drop_remainder=True, num_parallel_workers=8)
    args.steps_per_epoch = ds.get_dataset_size()
    # Per-step lr schedule (warmup + stepped decay); indexed by iteration below.
    lr = warmup_step_new(args, lr_scale=lr_scale)
    ds = ds.repeat(epoch)
    args.logger.info('args.steps_per_epoch:{}'.format(args.steps_per_epoch))
    args.logger.info('args.world_size:{}'.format(args.world_size))
    args.logger.info('args.local_rank:{}'.format(args.local_rank))
    args.logger.info('end create dataloader')
    args.logger.save_args(args)
    args.logger.important_info('start create network')
    create_network_start = time.time()
    # backbone and loss: one YoloLoss per detection head, at strides
    # 64 / 32 / 16 respectively (presumably network output strides — verify).
    network = backbone_HwYolov3(num_classes, num_anchors_list, args)
    criterion0 = YoloLoss(num_classes, anchors, anchors_mask[0], 64, 0, head_idx=0.0)
    criterion1 = YoloLoss(num_classes, anchors, anchors_mask[1], 32, 0, head_idx=1.0)
    criterion2 = YoloLoss(num_classes, anchors, anchors_mask[2], 16, 0, head_idx=2.0)
    # load pretrain model: strip optimizer state ('moments.') and the
    # 'network.' wrapper prefix so keys match the bare backbone.
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))
    train_net = BuildTrainNetworkV2(network, criterion0, criterion1, criterion2, args)
    # optimizer
    opt = Momentum(params=train_net.trainable_params(), learning_rate=Tensor(lr), momentum=momentum,
                   weight_decay=weight_decay)
    # package training process: wraps forward+loss with grad computation
    # and dynamic loss scaling in a single train cell.
    train_net = TrainOneStepWithLossScaleCell(train_net, opt)
    train_net.set_broadcast_flag()
    # checkpoint: keep every checkpoint taken over the whole run.
    ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
    train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval, keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=train_config, directory=args.outputs_dir, prefix='{}'.format(args.local_rank))
    # Drive the checkpoint callback manually (no Model.train() here), so we
    # build the internal callback parameters and RunContext ourselves.
    cb_params = _InternalCallbackParam()
    cb_params.train_network = train_net
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)
    train_net.set_train()
    t_end = time.time()       # timer for per-log-interval throughput
    t_epoch = time.time()     # timer for per-epoch throughput
    old_progress = -1
    i = 0                     # global step counter
    scale_manager = DynamicLossScaleManager(init_loss_scale=2 ** 10, scale_factor=2, scale_window=2000)
    for data in ds.create_tuple_iterator(output_numpy=True):
        # Tuple layout mirrors the map() output columns above:
        # image, annotation, then 8 target tensors per head for heads 0..2.
        batch_images = data[0]
        batch_labels = data[1]
        coord_mask_0 = data[2]
        conf_pos_mask_0 = data[3]
        conf_neg_mask_0 = data[4]
        cls_mask_0 = data[5]
        t_coord_0 = data[6]
        t_conf_0 = data[7]
        t_cls_0 = data[8]
        gt_list_0 = data[9]
        coord_mask_1 = data[10]
        conf_pos_mask_1 = data[11]
        conf_neg_mask_1 = data[12]
        cls_mask_1 = data[13]
        t_coord_1 = data[14]
        t_conf_1 = data[15]
        t_cls_1 = data[16]
        gt_list_1 = data[17]
        coord_mask_2 = data[18]
        conf_pos_mask_2 = data[19]
        conf_neg_mask_2 = data[20]
        cls_mask_2 = data[21]
        t_coord_2 = data[22]
        t_conf_2 = data[23]
        t_cls_2 = data[24]
        gt_list_2 = data[25]
        # Convert every numpy array to a float32 device Tensor.
        img_tensor = Tensor(batch_images, mstype.float32)
        coord_mask_tensor_0 = Tensor(coord_mask_0.astype(np.float32))
        conf_pos_mask_tensor_0 = Tensor(conf_pos_mask_0.astype(np.float32))
        conf_neg_mask_tensor_0 = Tensor(conf_neg_mask_0.astype(np.float32))
        cls_mask_tensor_0 = Tensor(cls_mask_0.astype(np.float32))
        t_coord_tensor_0 = Tensor(t_coord_0.astype(np.float32))
        t_conf_tensor_0 = Tensor(t_conf_0.astype(np.float32))
        t_cls_tensor_0 = Tensor(t_cls_0.astype(np.float32))
        gt_list_tensor_0 = Tensor(gt_list_0.astype(np.float32))
        coord_mask_tensor_1 = Tensor(coord_mask_1.astype(np.float32))
        conf_pos_mask_tensor_1 = Tensor(conf_pos_mask_1.astype(np.float32))
        conf_neg_mask_tensor_1 = Tensor(conf_neg_mask_1.astype(np.float32))
        cls_mask_tensor_1 = Tensor(cls_mask_1.astype(np.float32))
        t_coord_tensor_1 = Tensor(t_coord_1.astype(np.float32))
        t_conf_tensor_1 = Tensor(t_conf_1.astype(np.float32))
        t_cls_tensor_1 = Tensor(t_cls_1.astype(np.float32))
        gt_list_tensor_1 = Tensor(gt_list_1.astype(np.float32))
        coord_mask_tensor_2 = Tensor(coord_mask_2.astype(np.float32))
        conf_pos_mask_tensor_2 = Tensor(conf_pos_mask_2.astype(np.float32))
        conf_neg_mask_tensor_2 = Tensor(conf_neg_mask_2.astype(np.float32))
        cls_mask_tensor_2 = Tensor(cls_mask_2.astype(np.float32))
        t_coord_tensor_2 = Tensor(t_coord_2.astype(np.float32))
        t_conf_tensor_2 = Tensor(t_conf_2.astype(np.float32))
        t_cls_tensor_2 = Tensor(t_cls_2.astype(np.float32))
        gt_list_tensor_2 = Tensor(gt_list_2.astype(np.float32))
        # Current loss scale is fed into the step as a tensor sensitivity.
        scaling_sens = Tensor(scale_manager.get_loss_scale(), dtype=mstype.float32)
        loss0, overflow, _ = train_net(img_tensor, coord_mask_tensor_0, conf_pos_mask_tensor_0,
                                       conf_neg_mask_tensor_0, cls_mask_tensor_0, t_coord_tensor_0,
                                       t_conf_tensor_0, t_cls_tensor_0, gt_list_tensor_0,
                                       coord_mask_tensor_1, conf_pos_mask_tensor_1, conf_neg_mask_tensor_1,
                                       cls_mask_tensor_1, t_coord_tensor_1, t_conf_tensor_1,
                                       t_cls_tensor_1, gt_list_tensor_1, coord_mask_tensor_2,
                                       conf_pos_mask_tensor_2, conf_neg_mask_tensor_2,
                                       cls_mask_tensor_2, t_coord_tensor_2, t_conf_tensor_2,
                                       t_cls_tensor_2, gt_list_tensor_2, scaling_sens)
        # Reduce the overflow flag to a python bool and let the manager
        # shrink (on overflow) or grow (after scale_window clean steps) the scale.
        overflow = np.all(overflow.asnumpy())
        if overflow:
            scale_manager.update_loss_scale(overflow)
        else:
            scale_manager.update_loss_scale(False)
        args.logger.info('rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, batch_images:{}, '
                         'batch_labels:{}'.format(args.local_rank, i, loss0, overflow, scaling_sens, lr[i],
                                                  batch_images.shape, batch_labels.shape))
        # save ckpt
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        if args.local_rank == 0:
            # Only rank 0 writes checkpoints.
            ckpt_cb.step_end(run_context)
        # save Log
        if i == 0:
            # First step includes graph compilation; record it separately.
            time_for_graph_compile = time.time() - create_network_start
            args.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile))
        if i % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1
        if i % args.log_interval == 0 and args.local_rank == 0:
            # Throughput over the last log interval (all ranks combined).
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * (i - old_progress) * args.world_size / time_used
            args.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps))
            t_end = time.time()
            old_progress = i
        if i % args.steps_per_epoch == 0 and args.local_rank == 0:
            # Per-epoch throughput summary.
            epoch_time_used = time.time() - t_epoch
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
            args.logger.info('=================================================')
            args.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
            args.logger.info('=================================================')
            t_epoch = time.time()
        i = i + 1
    args.logger.info('=============yolov3 training finished==================')
  259. if __name__ == "__main__":
  260. arg = parse_args()
  261. train(arg)