
train.py

# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train ImageNet."""
import os
import time
import argparse
import datetime

import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore.context import ParallelMode
from mindspore.nn.optim import Momentum
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.callback import ModelCheckpoint
from mindspore.train.callback import CheckpointConfig, Callback
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
from mindspore.common import set_seed

from src.dataset import classification_dataset
from src.crossentropy import CrossEntropy
from src.lr_generator import get_lr
from src.utils.logging import get_logger
from src.utils.optimizers__init__ import get_param_groups
from src.utils.var_init import load_pretrain_model
from src.image_classification import get_network
from src.config import config

set_seed(1)

class BuildTrainNetwork(nn.Cell):
    """build training network"""
    def __init__(self, network, criterion):
        super(BuildTrainNetwork, self).__init__()
        self.network = network
        self.criterion = criterion

    def construct(self, input_data, label):
        output = self.network(input_data)
        loss = self.criterion(output, label)
        return loss

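# Minimal usage sketch for BuildTrainNetwork (illustrative only: the Model created in train()
# below wires the network and loss_fn together itself, so this wrapper is not instantiated
# in this script; the numeric values here are placeholders):
#
#   net = get_network(num_classes=1000, platform='Ascend')
#   criterion = CrossEntropy(smooth_factor=0.1, num_classes=1000)
#   train_net = BuildTrainNetwork(net, criterion)
#   loss = train_net(images, labels)   # images/labels are example input Tensors
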
class ProgressMonitor(Callback):
    """monitor loss and time"""
    def __init__(self, args):
        super(ProgressMonitor, self).__init__()
        self.me_epoch_start_time = 0
        self.me_epoch_start_step_num = 0
        self.args = args
        self.ckpt_history = []

    def begin(self, run_context):
        self.args.logger.info('start network train...')

    def epoch_begin(self, run_context):
        pass

    def epoch_end(self, run_context, *me_args):
        cb_params = run_context.original_args()
        me_step = cb_params.cur_step_num - 1

        real_epoch = me_step // self.args.steps_per_epoch
        time_used = time.time() - self.me_epoch_start_time
        fps_mean = self.args.per_batch_size * (me_step - self.me_epoch_start_step_num) * self.args.group_size / time_used
        self.args.logger.info('epoch[{}], iter[{}], loss:{}, mean_fps:{:.2f} '
                              'imgs/sec'.format(real_epoch, me_step, cb_params.net_outputs, fps_mean))

        if self.args.rank_save_ckpt_flag:
            import glob
            ckpts = glob.glob(os.path.join(self.args.outputs_dir, '*.ckpt'))
            for ckpt in ckpts:
                ckpt_fn = os.path.basename(ckpt)
                if not ckpt_fn.startswith('{}-'.format(self.args.rank)):
                    continue
                if ckpt in self.ckpt_history:
                    continue
                self.ckpt_history.append(ckpt)
                self.args.logger.info('epoch[{}], iter[{}], loss:{}, ckpt:{}, '
                                      'ckpt_fn:{}'.format(real_epoch, me_step, cb_params.net_outputs, ckpt, ckpt_fn))

        self.me_epoch_start_step_num = me_step
        self.me_epoch_start_time = time.time()

    def step_begin(self, run_context):
        pass

    def step_end(self, run_context, *me_args):
        pass

    def end(self, run_context):
        self.args.logger.info('end network train...')

def parse_args(cloud_args=None):
    """parse command-line and cloud arguments, then fill the remaining settings from config"""
    parser = argparse.ArgumentParser('mindspore classification training')
    parser.add_argument('--platform', type=str, default='Ascend', choices=('Ascend', 'GPU'), help='run platform')
    # dataset related
    parser.add_argument('--data_dir', type=str, default='', help='train data dir')
    parser.add_argument('--per_batch_size', default=128, type=int, help='batch size per device')
    # network related
    parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')
    # distributed related
    parser.add_argument('--is_distributed', action="store_true", default=False, help='run distributed multi-device training')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)

    # remaining hyper-parameters come from src/config.py
    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_save_max = config.ckpt_save_max
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size

    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))

    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scaling, the loss scale must not be set in the Momentum optimizer

    # select whether only the master rank or every rank saves checkpoints (compatible with model parallel)
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    return args

def merge_args(args, cloud_args):
    """merge cloud_args into args: cloud values override matching argparse entries"""
    args_dict = vars(args)
    if isinstance(cloud_args, dict):
        for key in cloud_args.keys():
            val = cloud_args[key]
            if key in args_dict and val:
                arg_type = type(args_dict[key])
                if arg_type is not type(None):
                    val = arg_type(val)
                args_dict[key] = val
    return args

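# Example of how merge_args behaves (hypothetical values): string overrides coming from a
# cloud job are cast back to the type of the matching argparse default before being stored.
#
#   args = parse_args(cloud_args={'per_batch_size': '64', 'data_dir': '/cache/data'})
#   # args.per_batch_size == 64   (cast to int, the type of the argparse default)
#   # args.data_dir == '/cache/data'
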
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          gradients_mean=True)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(num_classes=args.num_classes, platform=args.platform)
    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                  metrics={'acc'}, amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)


if __name__ == "__main__":
    train()
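
# Example invocations (a sketch: the data paths are placeholders, and a distributed run
# assumes an external launcher, e.g. a rank-table script or mpirun, that sets DEVICE_ID and
# the communication environment):
#
#   single device:
#     python train.py --platform Ascend --data_dir /path/to/imagenet/train
#
#   data-parallel, multiple devices:
#     python train.py --platform GPU --data_dir /path/to/imagenet/train --is_distributed --per_batch_size 64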