train.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Training Interface"""
import sys
import os
import argparse
import copy

from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.model import ParallelMode, Model
from mindspore.train.callback import TimeMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.nn import SGD, RMSProp, Loss, Top1CategoricalAccuracy, \
    Top5CategoricalAccuracy
from mindspore import context, Tensor

from src.dataset import create_dataset, create_dataset_val
from src.utils import add_weight_decay, count_params, str2bool, get_lr
from src.callback import EmaEvalCallBack, LossMonitor
from src.loss import LabelSmoothingCrossEntropy
from src.tinynet import tinynet
parser = argparse.ArgumentParser(description='Training')

# training parameters
parser.add_argument('--data_path', type=str, default="", metavar="DIR",
                    help='path to dataset')
parser.add_argument('--model', default='tinynet_c', type=str, metavar='MODEL',
                    help='Name of model to train (default: "tinynet_c")')
parser.add_argument('--num-classes', type=int, default=1000, metavar='N',
                    help='number of label classes (default: 1000)')
parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N',
                    help='input batch size for training (default: 32)')
parser.add_argument('--drop', type=float, default=0.0, metavar='DROP',
                    help='Dropout rate (default: 0.0)')
parser.add_argument('--drop-connect', type=float, default=0.0, metavar='DROP',
                    help='Drop connect rate (default: 0.0)')
parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER',
                    help='Optimizer (default: "sgd")')
parser.add_argument('--opt-eps', default=1e-8, type=float, metavar='EPSILON',
                    help='Optimizer epsilon (default: 1e-8)')
parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                    help='SGD momentum (default: 0.9)')
parser.add_argument('--weight-decay', type=float, default=0.0001,
                    help='weight decay (default: 0.0001)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR',
                    help='warmup learning rate (default: 0.0001)')
parser.add_argument('--epochs', type=int, default=200, metavar='N',
                    help='number of epochs to train (default: 200)')
parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
                    help='epoch interval to decay LR (default: 30)')
parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N',
                    help='epochs to warm up LR, if the scheduler supports it')
parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
                    help='LR decay rate (default: 0.1)')
parser.add_argument('--smoothing', type=float, default=0.1,
                    help='label smoothing (default: 0.1)')
parser.add_argument('--ema-decay', type=float, default=0,
                    help='decay factor for model weights moving average; \
                    must be > 0 to enable EMA (default: 0)')
parser.add_argument('--amp_level', type=str, default='O0',
                    help='auto mixed precision level (default: "O0")')
parser.add_argument('--per_print_times', type=int, default=100,
                    help='steps between loss prints (default: 100)')
# batch norm parameters
parser.add_argument('--bn-tf', action='store_true', default=False,
                    help='Use TensorFlow BatchNorm defaults for models that \
                    support it (default: False)')
parser.add_argument('--bn-momentum', type=float, default=None,
                    help='BatchNorm momentum override (if not None)')
parser.add_argument('--bn-eps', type=float, default=None,
                    help='BatchNorm epsilon override (if not None)')
# parallel parameters
parser.add_argument('-j', '--workers', type=int, default=4, metavar='N',
                    help='how many data loading workers to use (default: 4)')
parser.add_argument('--distributed', action='store_true', default=False,
                    help='run distributed data-parallel training')
parser.add_argument('--dataset_sink', action='store_true', default=True,
                    help='use dataset sink mode')
# checkpoint config
parser.add_argument('--ckpt', type=str, default=None,
                    help='path to a checkpoint to load before training')
parser.add_argument('--ckpt_save_epoch', type=int, default=1,
                    help='interval (in epochs) for saving checkpoints')
parser.add_argument('--loss_scale', type=int,
                    default=1024, help='static loss scale')
parser.add_argument('--train', type=str2bool, default=1, help='train or eval')
parser.add_argument('--GPU', action='store_true', default=False,
                    help='Use GPU for training (default: False)')


def main():
    """Main entrance for training"""
    args = parser.parse_args()
    print(sys.argv)
    devid, args.rank_id, args.rank_size = 0, 0, 1
    context.set_context(mode=context.GRAPH_MODE)
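
    # Select the execution device. Under --distributed, initialize the
    # communication backend (NCCL on GPU, HCCL on Ascend) and enable
    # data-parallel training with gradient averaging across devices.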
    if args.distributed:
        if args.GPU:
            init("nccl")
            context.set_context(device_target='GPU')
        else:
            init()
            devid = int(os.getenv('DEVICE_ID'))
            context.set_context(device_target='Ascend',
                                device_id=devid,
                                reserve_class_name_in_scope=False)
        context.reset_auto_parallel_context()
        args.rank_id = get_rank()
        args.rank_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          device_num=args.rank_size)
    else:
        if args.GPU:
            context.set_context(device_target='GPU')
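
    # Only the master process (rank 0) prints logs and drives callbacks.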
    is_master = not args.distributed or (args.rank_id == 0)

    # parse model argument
    assert args.model.startswith(
        "tinynet"), "Only Tinynet models are supported."
    _, sub_name = args.model.split("_")
    net = tinynet(sub_model=sub_name,
                  num_classes=args.num_classes,
                  drop_rate=args.drop,
                  drop_connect_rate=args.drop_connect,
                  global_pool="avg",
                  bn_tf=args.bn_tf,
                  bn_momentum=args.bn_momentum,
                  bn_eps=args.bn_eps)
    if is_master:
        print("Total number of parameters:", count_params(net))
    # input image size of the network
    input_size = net.default_cfg['input_size'][1]
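
    # The validation dataset is always built; the training dataset only
    # when --train is set.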
    train_dataset = val_dataset = None
    train_data_url = os.path.join(args.data_path, 'train')
    val_data_url = os.path.join(args.data_path, 'val')
    val_dataset = create_dataset_val(args.batch_size,
                                     val_data_url,
                                     workers=args.workers,
                                     distributed=False,
                                     input_size=input_size)

    if args.train:
        train_dataset = create_dataset(args.batch_size,
                                       train_data_url,
                                       workers=args.workers,
                                       distributed=args.distributed,
                                       input_size=input_size)
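        # number of steps per epoch, used by the LR schedule and TimeMonitor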
        batches_per_epoch = train_dataset.get_dataset_size()

    loss = LabelSmoothingCrossEntropy(
        smooth_factor=args.smoothing, num_classes=args.num_classes)
    time_cb = TimeMonitor(data_size=batches_per_epoch)
    loss_scale_manager = FixedLossScaleManager(
        args.loss_scale, drop_overflow_update=False)
    lr_array = get_lr(base_lr=args.lr,
                      total_epochs=args.epochs,
                      steps_per_epoch=batches_per_epoch,
                      decay_epochs=args.decay_epochs,
                      decay_rate=args.decay_rate,
                      warmup_epochs=args.warmup_epochs,
                      warmup_lr_init=args.warmup_lr,
                      global_epoch=0)
    lr = Tensor(lr_array)
    loss_cb = LossMonitor(lr_array,
                          args.epochs,
                          per_print_times=args.per_print_times,
                          start_epoch=0)
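
    # add_weight_decay builds the optimizer parameter groups, so weight decay
    # can be applied selectively rather than to every parameter.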
    param_group = add_weight_decay(net, weight_decay=args.weight_decay)
    if args.opt == 'sgd':
        if is_master:
            print('Using SGD optimizer')
        optimizer = SGD(param_group,
                        learning_rate=lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        loss_scale=args.loss_scale)
    elif args.opt == 'rmsprop':
        if is_master:
            print('Using rmsprop optimizer')
        optimizer = RMSProp(param_group,
                            learning_rate=lr,
                            decay=0.9,
                            weight_decay=args.weight_decay,
                            momentum=args.momentum,
                            epsilon=args.opt_eps,
                            loss_scale=args.loss_scale)
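
    # Force the loss computation to FP32 so it stays numerically stable
    # when the network itself runs in mixed precision.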
    loss.add_flags_recursive(fp32=True, fp16=False)
    eval_metrics = {'Validation-Loss': Loss(),
                    'Top1-Acc': Top1CategoricalAccuracy(),
                    'Top5-Acc': Top5CategoricalAccuracy()}

    if args.ckpt:
        ckpt = load_checkpoint(args.ckpt)
        load_param_into_net(net, ckpt)
        net.set_train(False)

    model = Model(net, loss, optimizer, metrics=eval_metrics,
                  loss_scale_manager=loss_scale_manager,
                  amp_level=args.amp_level)
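
    # EmaEvalCallBack maintains an exponential moving average of the weights
    # in net_ema, runs validation on val_dataset during training, and saves
    # checkpoints every --ckpt_save_epoch epochs.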
    net_ema = copy.deepcopy(net)
    net_ema.set_train(False)
    assert args.ema_decay > 0, "EMA should be used in tinynet training."

    ema_cb = EmaEvalCallBack(network=net,
                             ema_network=net_ema,
                             loss_fn=loss,
                             eval_dataset=val_dataset,
                             decay=args.ema_decay,
                             save_epoch=args.ckpt_save_epoch,
                             dataset_sink_mode=args.dataset_sink,
                             start_epoch=0)
    callbacks = [loss_cb, ema_cb, time_cb] if is_master else []
    if is_master:
        print("Training on " + args.model
              + " with " + str(args.num_classes) + " classes")

    model.train(args.epochs, train_dataset, callbacks=callbacks,
                dataset_sink_mode=args.dataset_sink)


if __name__ == '__main__':
    main()