
tf_train.py
# coding: utf-8
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
import argparse
import moxing as mox

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
workroot = '/home/work/user-job-dir'

# Initialize a filter (convolution kernel) with small random values.
def weight_variable(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

# Initialize the biases; every value starts at 0.1.
def bias_variable(shape):
    return tf.Variable(tf.constant(0.1, shape=shape))

# Convolution. strides is the step size in each dimension; normally
# strides[0] = strides[3] = 1. The padding argument is "SAME" or "VALID";
# "SAME" pads the borders with zeros so the spatial size is preserved.
def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")

# 2x2 max pooling, which halves the spatial dimensions.
def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
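
# Illustrative shape check (not in the original): with "SAME" padding and
# stride 1, a 28x28x1 input convolved with a 5x5x1x32 filter stays 28x28
# spatially with depth 32, and max_pool_2x2 then halves it to 14x14x32.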
def parse_args():
    parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
    # Define 2 parameters for running on ModelArts.
    # data_url and train_url are the fixed arguments used for training on
    # ModelArts: the dataset path and the model output path, respectively.
    parser.add_argument('--data_url',
                        help='path to training/inference dataset folder',
                        default=workroot + '/data/')
    parser.add_argument('--train_url',
                        help='model folder to save/load',
                        default=workroot + '/model/')
    parser.add_argument(
        '--device_target',
        type=str,
        default="Ascend",
        choices=['Ascend', 'CPU'],
        help='device where the code will run (default: Ascend); to use an NPU '
             'on the OpenI platform, add the run parameter device_target=Ascend '
             'on the platform training page')
    # ModelArts already uses data_url and train_url by default.
    parser.add_argument('--epoch_size',
                        type=int,
                        default=5,
                        help='Training epochs.')
    args = parser.parse_args()
    return args
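
# Example invocation (illustrative; the paths shown are just the defaults
# defined above, and device_target is consumed by the hosting platform
# rather than read by this script):
#   python tf_train.py --data_url=/home/work/user-job-dir/data/ \
#       --train_url=/home/work/user-job-dir/model/ --device_target=Ascend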
if __name__ == "__main__":
    args = parse_args()
    print('args:')
    print(args)
    # Note: this reads (or downloads) MNIST from the local 'mnist_data'
    # folder; args.data_url is accepted but not used by this script.
    mnist = input_data.read_data_sets('mnist_data', one_hot=True)
    # The placeholder x temporarily holds the MNIST image data.
    # In [None, 784], None leaves the batch dimension unconstrained,
    # and 784 is the size of one image (28 x 28 = 784).
    x = tf.placeholder(tf.float32, [None, 784], name='input')
    # y_ holds the ground-truth labels, i.e. the actual digit of each image.
    y_ = tf.placeholder(tf.float32, [None, 10])
    # Reshape each image from a 784-dim vector back into a 28 x 28 matrix
    # (see the CNN model diagram). The last dimension is the depth; MNIST
    # images are grayscale, so the depth is 1. The first dimension is -1,
    # leaving that dimension unconstrained so the number of images per
    # training batch can be set flexibly.
    x_image = tf.reshape(x, [-1, 28, 28, 1])

    # First convolutional layer.
    # The filter is a 5 x 5 x 1 matrix: 5 x 5 is the filter size, and the
    # depth is 1 because MNIST images are grayscale, a single channel.
    # 32 means we create 32 filters of size 5 x 5 x 1; the convolution
    # yields 32 feature maps (one per filter), i.e. an output depth of 32.
    W_conv1 = weight_variable([5, 5, 1, 32])
    # One bias per feature map.
    b_conv1 = bias_variable([32])
    # Convolve with conv2d, then apply ReLU as the activation function.
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    # Pool after the convolution.
    h_pool1 = max_pool_2x2(h_conv1)

    # Second convolutional layer.
    # The first layer's output depth is 32, so the filter depth and the
    # next output depth change accordingly.
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

    # Fully connected layer.
    # After the two convolutional layers the image is 7 x 7 (the first
    # pooling outputs (28/2) x (28/2), the second (14/2) x (14/2)) with
    # depth 64. We add a fully connected layer of 1024 neurons here, so
    # the weight W has shape [7 * 7 * 64, 1024].
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    # The number of biases matches the number of outputs.
    b_fc1 = bias_variable([1024])
    # Flatten the second pooled tensor (height 7, width 7, depth 64) into
    # a vector, the same input the earlier Softmax model took.
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    # ReLU activation.
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    # Dropout.
    # To reduce overfitting, apply dropout before the output layer.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # Output layer.
    # The fully connected layer's output size is 1024 and we want results
    # of size 10 (digits 0-9), so the weight W has shape [1024, 10].
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])
    # Finally, the Softmax function turns the outputs into probabilities.
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output')

    # Loss function and optimizer.
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv)))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    # Test accuracy, same as in the Softmax regression model.
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
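    # Aside (not in the original): tf.log(y_conv) can yield NaN if a softmax
    # probability underflows to exactly 0; common remedies are clipping, e.g.
    # tf.log(tf.clip_by_value(y_conv, 1e-10, 1.0)), or computing the loss from
    # logits with tf.nn.softmax_cross_entropy_with_logits.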
    train_dir = workroot + '/model/'  # where the model is saved
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    obs_train_url = args.train_url

    # Start training.
    with tf.Session() as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Train for 2,000 steps.
        for i in range(2000):
            # Fetch 50 images and their labels per step.
            batch = mnist.train.next_batch(50)
            # Print the training accuracy every 100 steps.
            if i % 100 == 0:
                train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                print("step %d, training accuracy %g" % (i, train_accuracy))
            # The actual training step: feed the batch in.
            sess.run(train_step, feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
        # Save in SavedModel format.
        tf.compat.v1.saved_model.simple_save(sess,
                                             train_dir + "saved_model",
                                             inputs={"input": x, 'keep_prob': keep_prob},
                                             outputs={"output": y_conv})
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
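
Once training finishes, the exported SavedModel can be reloaded for inference. Below is a minimal sketch, assuming TF 1.x and the default "serve" tag that simple_save exports under; the export path and the all-zero dummy input are illustrative only.

# coding: utf-8
import numpy as np
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    # simple_save exports under the "serve" tag; the path matches
    # train_dir + "saved_model" from the training script above
    # (adjust for your environment).
    tf.compat.v1.saved_model.loader.load(
        sess, ["serve"], '/home/work/user-job-dir/model/saved_model')
    graph = tf.get_default_graph()
    # Look the tensors up by the names given in the training script.
    x = graph.get_tensor_by_name('input:0')
    keep_prob = graph.get_tensor_by_name('keep_prob:0')
    y_conv = graph.get_tensor_by_name('output:0')
    # A dummy all-zero "image"; a real caller would feed 784-dim pixel rows.
    probs = sess.run(y_conv, feed_dict={x: np.zeros((1, 784), np.float32),
                                        keep_prob: 1.0})
    print("predicted digit:", probs.argmax(axis=1))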
