train.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train ImageNet."""
import os
import time
import argparse
import datetime

import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore import ParallelMode
from mindspore.nn.optim import Momentum
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.callback import ModelCheckpoint
from mindspore.train.callback import CheckpointConfig, Callback
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager

from src.dataset import classification_dataset
from src.crossentropy import CrossEntropy
from src.warmup_step_lr import warmup_step_lr
from src.warmup_cosine_annealing_lr import warmup_cosine_annealing_lr
from src.utils.logging import get_logger
from src.utils.optimizers__init__ import get_param_groups
from src.image_classification import get_network
from src.config import config

devid = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                    device_target="Ascend", save_graphs=False, device_id=devid)
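# Note: DEVICE_ID must be set in the environment before launch (int(None) raises
# otherwise); the script runs in graph mode on an Ascend target with auto mixed
# precision enabled.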


class BuildTrainNetwork(nn.Cell):
    """build training network"""
    def __init__(self, network, criterion):
        super(BuildTrainNetwork, self).__init__()
        self.network = network
        self.criterion = criterion

    def construct(self, input_data, label):
        output = self.network(input_data)
        loss = self.criterion(output, label)
        return loss


class ProgressMonitor(Callback):
    """monitor loss and time"""
    def __init__(self, args):
        super(ProgressMonitor, self).__init__()
        self.me_epoch_start_time = 0
        self.me_epoch_start_step_num = 0
        self.args = args
        self.ckpt_history = []

    def begin(self, run_context):
        self.args.logger.info('start network train...')

    def epoch_begin(self, run_context):
        pass

    def epoch_end(self, run_context, *me_args):
        cb_params = run_context.original_args()
        me_step = cb_params.cur_step_num - 1

        real_epoch = me_step // self.args.steps_per_epoch
        time_used = time.time() - self.me_epoch_start_time
        fps_mean = self.args.per_batch_size * (me_step - self.me_epoch_start_step_num) * self.args.group_size / time_used
        self.args.logger.info('epoch[{}], iter[{}], loss:{}, mean_fps:{:.2f} '
                              'imgs/sec'.format(real_epoch, me_step, cb_params.net_outputs, fps_mean))

        if self.args.rank_save_ckpt_flag:
            import glob
            ckpts = glob.glob(os.path.join(self.args.outputs_dir, '*.ckpt'))
            for ckpt in ckpts:
                ckpt_fn = os.path.basename(ckpt)
                if not ckpt_fn.startswith('{}-'.format(self.args.rank)):
                    continue
                if ckpt in self.ckpt_history:
                    continue
                self.ckpt_history.append(ckpt)
                self.args.logger.info('epoch[{}], iter[{}], loss:{}, ckpt:{}, '
                                      'ckpt_fn:{}'.format(real_epoch, me_step, cb_params.net_outputs, ckpt, ckpt_fn))

        self.me_epoch_start_step_num = me_step
        self.me_epoch_start_time = time.time()

    def step_begin(self, run_context):
        pass

    def step_end(self, run_context, *me_args):
        pass

    def end(self, run_context):
        self.args.logger.info('end network train...')


def parse_args(cloud_args=None):
    """parse command-line arguments and merge in values from src.config"""
    parser = argparse.ArgumentParser('mindspore classification training')

    # dataset related
    parser.add_argument('--data_dir', type=str, default='', help='train data dir')
    parser.add_argument('--per_batch_size', default=128, type=int, help='batch size per device')
    # network related
    parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')
    # distributed related
    parser.add_argument('--is_distributed', type=int, default=1, help='if multi device')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)

    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.backbone = config.backbone
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size

    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))
    return args


def merge_args(args, cloud_args):
    """merge cloud_args values into the parsed argparse namespace"""
    args_dict = vars(args)
    if isinstance(cloud_args, dict):
        for key in cloud_args.keys():
            val = cloud_args[key]
            if key in args_dict and val:
                arg_type = type(args_dict[key])
                if arg_type is not type(None):
                    val = arg_type(val)
                args_dict[key] = val
    return args


def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the loss scale cannot be set in the momentum optimizer

    # select whether only the master rank or every rank saves ckpt, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes)
    if network is None:
        raise NotImplementedError('backbone {} is not implemented'.format(args.backbone))
    network.add_flags_recursive(fp16=True)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor,
                             num_classes=args.num_classes)

    # load pretrained model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)
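
    # Both helpers live in src/; judging by their parameters (an assumption, not
    # verified against their implementations), they produce a per-step learning-rate
    # sequence: warmup_epochs of warmup, then stepwise decay by lr_gamma at the
    # lr_epochs milestones ('exponential') or cosine annealing towards eta_min over
    # T_max ('cosine_annealing'). The sequence is handed to the optimizer as a Tensor below.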

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
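    # When is_dynamic_loss_scale == 1, args.loss_scale was forced to 1 earlier, so the
    # Momentum optimizer performs no gradient un-scaling of its own and the loss scale
    # is owned by DynamicLossScaleManager; otherwise FixedLossScaleManager reuses
    # args.loss_scale and never adjusts it on overflow (drop_overflow_update=False).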

    # Model api changed since TR5_branch 2020/03/09
    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                      parameter_broadcast=True, mirror_mean=True)
    model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager)

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)


if __name__ == "__main__":
    train()
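
# Example launch (illustrative only; the dataset path is a placeholder and DEVICE_ID
# must point at an available Ascend device):
#   DEVICE_ID=0 python train.py --data_dir /path/to/imagenet/train --is_distributed 0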