
train.py 11 kB

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train launch."""
import os
import time
import argparse
import datetime

import mindspore.nn as nn
from mindspore import Tensor
from mindspore.nn.optim import Momentum
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.callback import ModelCheckpoint
from mindspore.train.callback import CheckpointConfig, Callback
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.common import set_seed

from src.optimizers import get_param_groups
from src.network import DenseNet121
from src.datasets import classification_dataset
from src.losses.crossentropy import CrossEntropy
from src.lr_scheduler import MultiStepLR, CosineAnnealingLR
from src.utils.logging import get_logger
from src.config import config

devid = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                    device_target="Davinci", save_graphs=False, device_id=devid)

set_seed(1)


class BuildTrainNetwork(nn.Cell):
    """build training network"""
    def __init__(self, network, criterion):
        super(BuildTrainNetwork, self).__init__()
        self.network = network
        self.criterion = criterion

    def construct(self, input_data, label):
        output = self.network(input_data)
        loss = self.criterion(output, label)
        return loss


class ProgressMonitor(Callback):
    """monitor loss and time"""
    def __init__(self, args):
        super(ProgressMonitor, self).__init__()
        self.me_epoch_start_time = 0
        self.me_epoch_start_step_num = 0
        self.args = args
        self.ckpt_history = []

    def begin(self, run_context):
        self.args.logger.info('start network train...')

    def epoch_begin(self, run_context):
        pass

    def epoch_end(self, run_context, *me_args):
        """process epoch end"""
        cb_params = run_context.original_args()
        me_step = cb_params.cur_step_num - 1

        real_epoch = me_step // self.args.steps_per_epoch
        time_used = time.time() - self.me_epoch_start_time
        fps_mean = self.args.per_batch_size * (me_step - self.me_epoch_start_step_num) * self.args.group_size / time_used
        self.args.logger.info('epoch[{}], iter[{}], loss:{},'
                              'mean_fps:{:.2f} imgs/sec'.format(real_epoch, me_step, cb_params.net_outputs, fps_mean))

        if self.args.rank_save_ckpt_flag:
            import glob
            ckpts = glob.glob(os.path.join(self.args.outputs_dir, '*.ckpt'))
            for ckpt in ckpts:
                ckpt_fn = os.path.basename(ckpt)
                if not ckpt_fn.startswith('{}-'.format(self.args.rank)):
                    continue
                if ckpt in self.ckpt_history:
                    continue
                self.ckpt_history.append(ckpt)
                self.args.logger.info('epoch[{}], iter[{}], loss:{}, ckpt:{},'
                                      'ckpt_fn:{}'.format(real_epoch, me_step, cb_params.net_outputs, ckpt, ckpt_fn))

        self.me_epoch_start_step_num = me_step
        self.me_epoch_start_time = time.time()

    def step_begin(self, run_context):
        pass

    def step_end(self, run_context, *me_args):
        pass

    def end(self, run_context):
        self.args.logger.info('end network train...')


def parse_args(cloud_args=None):
    """parameters"""
    parser = argparse.ArgumentParser('mindspore classification training')

    # dataset related
    parser.add_argument('--data_dir', type=str, default='', help='train data dir')
    # network related
    parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')
    # distributed related
    parser.add_argument('--is_distributed', type=int, default=1, help='if multi device')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)

    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size
    args.log_interval = config.log_interval
    args.per_batch_size = config.per_batch_size

    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))
    return args


def merge_args(args, cloud_args):
    """merge cloud_args values into the parsed argparse namespace"""
    args_dict = vars(args)
    if isinstance(cloud_args, dict):
        for key in cloud_args.keys():
            val = cloud_args[key]
            if key in args_dict and val:
                arg_type = type(args_dict[key])
                if arg_type is not type(None):
                    val = arg_type(val)
                args_dict[key] = val
    return args


def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the loss scale must not also be set in the Momentum optimizer

    # select whether only the master rank or every rank saves ckpt, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet121(args.num_classes)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor,
                             num_classes=args.num_classes)

    # load pretrained model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr_scheduler = MultiStepLR(args.lr,
                                   args.lr_epochs,
                                   args.lr_gamma,
                                   args.steps_per_epoch,
                                   args.max_epoch,
                                   warmup_epochs=args.warmup_epochs)
    elif args.lr_scheduler == 'cosine_annealing':
        lr_scheduler = CosineAnnealingLR(args.lr,
                                         args.T_max,
                                         args.steps_per_epoch,
                                         args.max_epoch,
                                         warmup_epochs=args.warmup_epochs,
                                         eta_min=args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # mixed precision training
    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                      gradients_mean=True)
    model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager, amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)


if __name__ == "__main__":
    train()
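

A minimal launch sketch, not part of train.py: it assumes an Ascend environment where DEVICE_ID is set before the module is imported (the script reads it at import time to configure the context), that the src package is importable from the working directory, and that the dataset path below is a placeholder. Note that merge_args() only applies truthy cloud_args values, so a single-device run needs --is_distributed 0 on the command line rather than through cloud_args.

# Hypothetical launch sketch -- not part of the original script.
import os

# DEVICE_ID must be set before importing train.py, because the module
# calls context.set_context() with it at import time.
os.environ.setdefault('DEVICE_ID', '0')

from train import train

# cloud_args entries are merged over the argparse defaults by merge_args():
# only keys that already exist and carry truthy values are applied, and each
# value is cast to the type of the existing argument.
train(cloud_args={'data_dir': '/path/to/train_dataset'})  # placeholder path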