
train.py

# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train launch."""
import os
import time
import argparse
import datetime

import mindspore.nn as nn
from mindspore import Tensor
from mindspore.nn.optim import Momentum
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.callback import ModelCheckpoint
from mindspore.train.callback import CheckpointConfig, Callback
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.common import set_seed

from src.optimizers import get_param_groups
from src.losses.crossentropy import CrossEntropy
from src.lr_scheduler import MultiStepLR, CosineAnnealingLR
from src.utils.logging import get_logger

set_seed(1)


class BuildTrainNetwork(nn.Cell):
    """build training network"""
    def __init__(self, net, crit):
        super(BuildTrainNetwork, self).__init__()
        self.network = net
        self.criterion = crit

    def construct(self, input_data, label):
        output = self.network(input_data)
        loss = self.criterion(output, label)
        return loss
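
# Note: BuildTrainNetwork only fuses the forward pass with the loss value.
# Gradient computation, loss scaling and the optimizer update are attached
# later by Model in train() below, which wraps this cell together with the
# Momentum optimizer and a loss-scale manager.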


class ProgressMonitor(Callback):
    """monitor loss and time"""
    def __init__(self, args):
        super(ProgressMonitor, self).__init__()
        self.me_epoch_start_time = 0
        self.me_epoch_start_step_num = 0
        self.args = args
        self.ckpt_history = []

    def begin(self, run_context):
        self.args.logger.info('start network train...')

    def epoch_begin(self, run_context):
        pass

    def epoch_end(self, run_context, *me_args):
        """process epoch end"""
        cb_params = run_context.original_args()
        me_step = cb_params.cur_step_num - 1

        real_epoch = me_step // self.args.steps_per_epoch
        time_used = time.time() - self.me_epoch_start_time
        fps_mean = self.args.per_batch_size * (me_step - self.me_epoch_start_step_num) * self.args.group_size / time_used
        self.args.logger.info('epoch[{}], iter[{}], loss:{},'
                              'mean_fps:{:.2f} imgs/sec'.format(real_epoch, me_step, cb_params.net_outputs, fps_mean))

        if self.args.rank_save_ckpt_flag:
            import glob
            ckpts = glob.glob(os.path.join(self.args.outputs_dir, '*.ckpt'))
            for ckpt in ckpts:
                ckpt_fn = os.path.basename(ckpt)
                if not ckpt_fn.startswith('{}-'.format(self.args.rank)):
                    continue
                if ckpt in self.ckpt_history:
                    continue
                self.ckpt_history.append(ckpt)
                self.args.logger.info('epoch[{}], iter[{}], loss:{}, ckpt:{},'
                                      'ckpt_fn:{}'.format(real_epoch, me_step, cb_params.net_outputs, ckpt, ckpt_fn))

        self.me_epoch_start_step_num = me_step
        self.me_epoch_start_time = time.time()

    def step_begin(self, run_context):
        pass

    def step_end(self, run_context, *me_args):
        pass

    def end(self, run_context):
        self.args.logger.info('end network train...')
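
# Throughput note: fps_mean in epoch_end is a global rate -- the per-device
# batch size times the number of steps completed this epoch, scaled by
# group_size (the number of devices), divided by the epoch's wall-clock time.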


def parse_args(cloud_args=None):
    """parameters"""
    parser = argparse.ArgumentParser('mindspore classification training')

    # network and dataset choices
    parser.add_argument('--net', type=str, default='', help='Densenet Model, densenet100 or densenet121')
    parser.add_argument('--dataset', type=str, default='', help='Dataset, either cifar10 or imagenet')

    # dataset related
    parser.add_argument('--data_dir', type=str, default='', help='train data dir')

    # network related
    parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')

    # distributed related
    parser.add_argument('--is_distributed', type=int, default=1, help='if multi device')

    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    # platform
    parser.add_argument('--device_target', type=str, default='Ascend', choices=('Ascend', 'GPU', 'CPU'),
                        help='device target')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)

    if args.net == "densenet100":
        from src.config import config_100 as config
    else:
        from src.config import config_121 as config

    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size
    args.log_interval = config.log_interval
    args.per_batch_size = config.per_batch_size

    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))

    return args
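
# Caveat: parse_args first reads the CLI flags, then copies every training
# hyperparameter (lr, batch size, epochs, ...) from src.config (config_100 for
# densenet100, config_121 otherwise) onto args. To change those values, edit
# the config module; only the options exposed by argparse are set per run.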


def merge_args(args, cloud_args):
    """merge cloud_args into the parsed argparse namespace"""
    args_dict = vars(args)
    if isinstance(cloud_args, dict):
        for k in cloud_args.keys():
            val = cloud_args[k]
            if k in args_dict and val:
                arg_type = type(args_dict[k])
                if arg_type is not type(None):
                    val = arg_type(val)
                args_dict[k] = val
    return args
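
# Example (hypothetical values): cloud_args may be a plain dict supplied by a
# cloud launcher; any key that matches an argparse option overrides it, cast
# to that option's type:
#   args = parse_args(cloud_args={'data_dir': '/cache/data', 'is_distributed': '0'})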


def get_lr_scheduler(args):
    """get the learning rate scheduler selected in the config"""
    if args.lr_scheduler == 'exponential':
        lr_scheduler = MultiStepLR(args.lr, args.lr_epochs, args.lr_gamma, args.steps_per_epoch, args.max_epoch,
                                   warmup_epochs=args.warmup_epochs)
    elif args.lr_scheduler == 'cosine_annealing':
        lr_scheduler = CosineAnnealingLR(args.lr, args.T_max, args.steps_per_epoch, args.max_epoch,
                                         warmup_epochs=args.warmup_epochs, eta_min=args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)
    return lr_scheduler


def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=False)
    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # for dynamic loss scale can not set loss scale in momentum opt

    # select for master rank save ckpt or all rank save, compatible for model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    if args.net == "densenet100":
        from src.network.densenet import DenseNet100 as DenseNet
    else:
        from src.network.densenet import DenseNet121 as DenseNet
    if args.dataset == "cifar10":
        from src.datasets import classification_dataset_cifar10 as classification_dataset
    else:
        from src.datasets import classification_dataset_imagenet as classification_dataset

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size, args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet(args.num_classes)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    lr_scheduler = get_lr_scheduler(args)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network), learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum, weight_decay=args.weight_decay, loss_scale=args.loss_scale)

    # mixed precision training
    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                      gradients_mean=True)
    if args.device_target == 'Ascend':
        model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager, amp_level="O3")
    elif args.device_target == 'GPU':
        model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager, amp_level="O0")
    elif args.device_target == 'CPU':
        model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager, amp_level="O0")
    else:
        raise ValueError("Unsupported device target.")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)


if __name__ == "__main__":
    train()
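
# Example launch (hypothetical paths; DEVICE_ID must be exported when running
# on Ascend). A minimal single-device run might look like:
#   DEVICE_ID=0 python train.py --net densenet121 --dataset imagenet \
#       --data_dir /path/to/imagenet --is_distributed 0 --device_target Ascend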