You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

train.py 8.2 kB

3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. """
  2. ######################## single-dataset train lenet example ########################
  3. This example is a single-dataset training tutorial. For multi-dataset training, please refer to the multi-dataset
  4. training tutorial train_for_multidataset.py. This example cannot be used with multiple datasets!
  5. ######################## Instructions for using the training environment ########################
  6. The image of the debugging environment and the image of the training environment are two different images,
  7. and the working local directories are different. In the training task, you need to pay attention to the following points.
  8. 1、(1)The structure of the dataset uploaded for single dataset training in this example
  9. MNISTData.zip
  10. ├── test
  11. │ ├── t10k-images-idx3-ubyte
  12. │ └── t10k-labels-idx1-ubyte
  13. └── train
  14. ├── train-images-idx3-ubyte
  15. └── train-labels-idx1-ubyte
  16. (2)The dataset structure of the single dataset in the training image in this example
  17. workroot
  18. ├── data
  19. | ├── test
  20. | └── train
  21. 2、Single dataset training requires predefined functions
  22. (1)Defines whether the task is a training environment or a debugging environment.
  23. def WorkEnvironment(environment):
  24. if environment == 'train':
  25. workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
  26. elif environment == 'debug':
  27. workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
  28. print('current work mode:' + environment + ', workroot:' + workroot)
  29. return workroot
  30. (2)Copy single dataset from obs to training image.
  31. def ObsToEnv(obs_data_url, data_dir):
  32. try:
  33. mox.file.copy_parallel(obs_data_url, data_dir)
  34. print("Successfully Download {} to {}".format(obs_data_url, data_dir))
  35. except Exception as e:
  36. print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
  37. return
  38. (3)Copy the output model to obs.
  39. def EnvToObs(train_dir, obs_train_url):
  40. try:
  41. mox.file.copy_parallel(train_dir, obs_train_url)
  42. print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
  43. except Exception as e:
  44. print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
  45. return
  46. 3、Three parameters need to be defined
  47. --data_url is the dataset you selected on the Qizhi platform.
  48. The three parameters --data_url, --train_url and --device_target must all be defined in a single-dataset task,
  49. otherwise an error will be reported.
  50. There is no need to add these parameters to the running parameters of the Qizhi platform,
  51. because they are predefined in the background, you only need to define them in your code.
  52. 4、How the dataset is used
  53. A single-dataset task receives the dataset through data_url and copies it to data_dir (i.e. workroot + '/data'),
  54. which is the path the code uses to read the dataset inside the image.
  55. For details, please refer to the following sample code.
  56. """
  57. import os
  58. import argparse
  59. import moxing as mox
  60. from config import mnist_cfg as cfg
  61. from dataset import create_dataset
  62. from lenet import LeNet5
  63. import mindspore.nn as nn
  64. from mindspore import context
  65. from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
  66. from mindspore.train import Model
  67. from mindspore.nn.metrics import Accuracy
  68. from mindspore.common import set_seed
  69. ### Defines whether the task is a training environment or a debugging environment ###
  70. def WorkEnvironment(environment):
  71. if environment == 'train':
  72. workroot = '/home/work/user-job-dir'
  73. elif environment == 'debug':
  74. workroot = '/home/work'
  75. print('current work mode:' + environment + ', workroot:' + workroot)
  76. return workroot
  77. ### Copy single dataset from obs to training image###
  78. def ObsToEnv(obs_data_url, data_dir):
  79. try:
  80. mox.file.copy_parallel(obs_data_url, data_dir)
  81. print("Successfully Download {} to {}".format(obs_data_url, data_dir))
  82. except Exception as e:
  83. print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
  84. return
  85. ### Copy the output model to obs###
  86. def EnvToObs(train_dir, obs_train_url):
  87. try:
  88. mox.file.copy_parallel(train_dir, obs_train_url)
  89. print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
  90. except Exception as e:
  91. print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
  92. return
  93. ### --data_url, --train_url and --device_target must all be defined in a single-dataset task,
  94. ### otherwise an error will be reported.
  95. ### There is no need to add these parameters to the run parameters on the Qizhi platform:
  96. ### they are predefined in the background, so you only need to declare them in your code.
  97. parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
  98. parser.add_argument('--data_url',
  99. help='path to training/inference dataset folder',
  100. default= WorkEnvironment('train') + '/data/')
  101. parser.add_argument('--train_url',
  102. help='model folder to save/load',
  103. default= WorkEnvironment('train') + '/model/')
  104. parser.add_argument(
  105. '--device_target',
  106. type=str,
  107. default="Ascend",
  108. choices=['Ascend', 'CPU'],
  109. help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')
  110. parser.add_argument('--epoch_size',
  111. type=int,
  112. default=5,
  113. help='Training epochs.')
  114. if __name__ == "__main__":
  115. args, unknown = parser.parse_known_args()
  116. ### defining the training environment
  117. environment = 'train'
  118. workroot = WorkEnvironment(environment)
  119. ###Initialize the data and model directories in the training image###
  120. data_dir = workroot + '/data'
  121. train_dir = workroot + '/model'
  122. if not os.path.exists(data_dir):
  123. os.makedirs(data_dir)
  124. if not os.path.exists(train_dir):
  125. os.makedirs(train_dir)
  126. ### Copy the dataset from obs to the training image ###
  127. ObsToEnv(args.data_url,data_dir)
  128. ###Specifies the device CPU or Ascend NPU used for training###
  129. context.set_context(mode=context.GRAPH_MODE,
  130. device_target=args.device_target)
  131. ds_train = create_dataset(os.path.join(data_dir, "train"),
  132. cfg.batch_size)
  133. if ds_train.get_dataset_size() == 0:
  134. raise ValueError(
  135. "Please check dataset size > 0 and batch_size <= dataset size")
  136. network = LeNet5(cfg.num_classes)
  137. net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
  138. net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
  139. time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
  140. if args.device_target != "Ascend":
  141. model = Model(network,
  142. net_loss,
  143. net_opt,
  144. metrics={"accuracy": Accuracy()})
  145. else:
  146. model = Model(network,
  147. net_loss,
  148. net_opt,
  149. metrics={"accuracy": Accuracy()},
  150. amp_level="O2")
  151. config_ck = CheckpointConfig(
  152. save_checkpoint_steps=cfg.save_checkpoint_steps,
  153. keep_checkpoint_max=cfg.keep_checkpoint_max)
  154. ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
  155. directory=train_dir,
  156. config=config_ck)
  157. print("============== Starting Training ==============")
  158. epoch_size = cfg['epoch_size']
  159. if (args.epoch_size):
  160. epoch_size = args.epoch_size
  161. print('epoch_size is: ', epoch_size)
  162. model.train(epoch_size,
  163. ds_train,
  164. callbacks=[time_cb, ckpoint_cb,
  165. LossMonitor()])
  166. ###Copy the trained model data from the local running environment back to obs,
  167. ###and download it in the training task corresponding to the Qizhi platform
  168. EnvToObs(train_dir, args.train_url)