You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

train_for_c2net.py 4.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/python
  2. #coding=utf-8
  3. '''
  4. If there are Chinese comments in the code, please add the following at the beginning of the file:
  5. #!/usr/bin/python
  6. #coding=utf-8
  7. In the training environment,
  8. the code will be automatically placed in the /tmp/code directory,
  9. the uploaded dataset will be automatically placed in the /tmp/dataset directory
  10. Note: the paths are different when selecting a single dataset and multiple datasets.
  11. (1) If it is a single dataset: if MnistDataset_torch.zip is selected,
  12. the dataset directories are /tmp/dataset/train and /tmp/dataset/test;
  13. The dataset structure of the single dataset in the training image in this example:
  14. tmp
  15. ├──dataset
  16.     ├── test
  17.     └── train
  18. (2) If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
  19. the dataset directory is /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
  20. and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
  21. The dataset structure in the training image for multiple datasets in this example:
  22. tmp
  23. ├──dataset
  24.     ├── MnistDataset_torch
  25.     |   ├── test
  26.     |   └── train
  27.     └── checkpoint_epoch1_0.73
  28.         ├── mnist_epoch1_0.73.pkl
  29. The model download path is /tmp/output by default; please write the model output to /tmp/output,
  30. because the Qizhi platform provides file downloads from the /tmp/output directory.
  31. In addition, if you want to get the model file after each training, you can call the uploader_for_gpu tool,
  32. which is written as:
  33. import os
  34. os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
  35. '''
  36. from model import Model
  37. import numpy as np
  38. import torch
  39. from torchvision.datasets import mnist
  40. from torch.nn import CrossEntropyLoss
  41. from torch.optim import SGD
  42. from torch.utils.data import DataLoader
  43. from torchvision.transforms import ToTensor
  44. import argparse
  45. import os
  46. # Training settings
  47. parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
  48. #The dataset location is placed under /dataset
  49. parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset')
  50. parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset')
  51. parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
  52. parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')
  53. if __name__ == '__main__':
  54. args, unknown = parser.parse_known_args()
  55. #log output
  56. print('cuda is available:{}'.format(torch.cuda.is_available()))
  57. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  58. batch_size = args.batch_size
  59. train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
  60. test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
  61. train_loader = DataLoader(train_dataset, batch_size=batch_size)
  62. test_loader = DataLoader(test_dataset, batch_size=batch_size)
  63. model = Model().to(device)
  64. sgd = SGD(model.parameters(), lr=1e-1)
  65. cost = CrossEntropyLoss()
  66. epoch = args.epoch_size
  67. print('epoch_size is:{}'.format(epoch))
  68. for _epoch in range(epoch):
  69. print('the {} epoch_size begin'.format(_epoch + 1))
  70. model.train()
  71. for idx, (train_x, train_label) in enumerate(train_loader):
  72. train_x = train_x.to(device)
  73. train_label = train_label.to(device)
  74. label_np = np.zeros((train_label.shape[0], 10))
  75. sgd.zero_grad()
  76. predict_y = model(train_x.float())
  77. loss = cost(predict_y, train_label.long())
  78. if idx % 10 == 0:
  79. print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
  80. loss.backward()
  81. sgd.step()
  82. correct = 0
  83. _sum = 0
  84. model.eval()
  85. for idx, (test_x, test_label) in enumerate(test_loader):
  86. test_x = test_x
  87. test_label = test_label
  88. predict_y = model(test_x.to(device).float()).detach()
  89. predict_ys = np.argmax(predict_y.cpu(), axis=-1)
  90. label_np = test_label.numpy()
  91. _ = predict_ys == test_label
  92. correct += np.sum(_.numpy(), axis=-1)
  93. _sum += _.shape[0]
  94. print('accuracy: {:.2f}'.format(correct / _sum))
  95. #The model output location is placed under /model
  96. state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
  97. torch.save(state, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
  98. #After calling uploader_for_gpu, after each epoch training, the result file under /tmp/output will be sent back to Qizhi
  99. os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")