# ran.lu / 910test (forked from kyxt/gpunpu)

#!/usr/bin/python
#coding=utf-8
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8   

In the training environment, 
the code will be automatically placed in the /tmp/code directory, 
the uploaded dataset will be automatically placed in the /tmp/dataset directory

Note: the paths are different when selecting a single dataset and multiple datasets.
(1) If it is a single dataset: if MnistDataset_torch.zip is selected,
   the dataset directory is /tmp/dataset/train, /tmp/dataset/test;

The dataset structure of the single dataset in the training image in this example:
  tmp
    ├──dataset 
         ├── test
         └── train 

If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip, 
the dataset directory is /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
The dataset structure in the training image for multiple datasets in this example:
tmp
  ├──dataset
     ├── MnistDataset_torch
     |     ├── test
     |     └── train 
     └── checkpoint_epoch1_0.73 
           ├── mnist_epoch1_0.73.pkl


the model download path is under /tmp/output by default, please specify the model output location to /tmp/output, 
qizhi platform will provide file downloads under the /tmp/output directory.

In addition, if you want to get the model file after each training, you can call the uploader_for_gpu tool, 
which is written as: 
import os
os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
'''


from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os

# Command-line interface for the training run.
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')

# Each entry is (flag, add_argument keyword options). The dataset defaults
# point at /tmp/dataset, the single-dataset layout described in the header.
_CLI_OPTIONS = (
    ('--traindata', {'default': "/tmp/dataset/train", 'help': 'path to train dataset'}),
    ('--testdata', {'default': "/tmp/dataset/test", 'help': 'path to test dataset'}),
    ('--epoch_size', {'type': int, 'default': 1, 'help': 'how much epoch to train'}),
    ('--batch_size', {'type': int, 'default': 256, 'help': 'how much batch_size in epoch'}),
)
for _flag, _options in _CLI_OPTIONS:
    parser.add_argument(_flag, **_options)
# Keep the module namespace free of the registration loop's temporaries.
del _flag, _options, _CLI_OPTIONS

if __name__ == '__main__':
    # Script entry point: train the model on MNIST for --epoch_size epochs.
    # After every epoch the model is evaluated on the test set, a checkpoint
    # is written to /tmp/output, and the uploader tool is invoked on that
    # directory.
    #
    # parse_known_args is used so that unrecognised command-line arguments
    # are collected in `unknown` and ignored instead of aborting the run.
    args, unknown = parser.parse_known_args()
    #log output
    print('cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size

    # download=False: the datasets must already exist under the given roots.
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()

    # The model output location is /tmp/output; make sure it exists so that
    # torch.save does not fail on a fresh container.
    output_dir = '/tmp/output'
    os.makedirs(output_dir, exist_ok=True)

    epoch = args.epoch_size
    print('epoch_size is:{}'.format(epoch))
    for _epoch in range(epoch):
        print('the {} epoch_size begin'.format(_epoch + 1))

        # ---- training pass ----
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            sgd.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            sgd.step()

        # ---- evaluation pass ----
        correct = 0
        _sum = 0
        model.eval()
        # No gradients are needed for evaluation; skipping autograd tracking
        # avoids building graphs that were previously discarded via detach().
        with torch.no_grad():
            for test_x, test_label in test_loader:
                predict_y = model(test_x.to(device).float())
                # Bring predictions back to the CPU, where the labels live.
                predict_ys = predict_y.argmax(dim=-1).cpu()
                correct += (predict_ys == test_label).sum().item()
                _sum += test_label.shape[0]
        accuracy = correct / _sum
        print('accuracy: {:.2f}'.format(accuracy))

        # ---- checkpoint ----
        # The optimizer object is `sgd`; the previous reference to an
        # undefined `optimizer` raised NameError before anything was saved.
        # NOTE(review): 'epoch' stores the configured total number of epochs,
        # not the index of the epoch just finished - confirm this is what
        # checkpoint consumers expect.
        state = {'model': model.state_dict(), 'optimizer': sgd.state_dict(), 'epoch': epoch}
        checkpoint_name = 'mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, accuracy)
        torch.save(state, os.path.join(output_dir, checkpoint_name))
        #After calling uploader_for_gpu, after each epoch training, the result file under /tmp/output will be sent back to Qizhi
        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")