添加 'gpu/train_fail3.py'

3 years ago · 66fa99a88a
--- a/gpu/train_fail3.py
+++ b/gpu/train_fail3.py
@@ -0,0 +1,93 @@
 #!/usr/bin/python
 #coding=utf-8    
 '''
 If there are Chinese comments in the code, please add at the beginning:
 #!/usr/bin/python
 #coding=utf-8    
 For compatibility with the A100, before using the training environment, please use the recommended image of the 
 platform with CUDA 11. Then adjust the code and submit the image.
 The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
 In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. 
 If it is a single dataset: 
 if MnistDataset_torch.zip is selected, then the dataset directory is /dataset/train, /dataset/test;
 If it is a multiple dataset: 
 If MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected, 
 the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
 and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
 The model download path is /model by default. Please set the model output location to /model, 
 and the Qizhi platform will provide file downloads from the /model directory.
 '''
 from model import Model
 import numpy as np
 import torch
 from torchvision.datasets import mnist
 from torch.nn import CrossEntropyLoss
 from torch.optim import SGD
 from torch.utils.data import DataLoader
 from torchvision.transforms import ToTensor
 import argparse
 import datetime
 # Command-line options for the training job
 parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
 # Dataset paths default to locations under /dataset
 parser.add_argument(
     '--traindata',
     default="/dataset/train",
     help='path to train dataset',
 )
 parser.add_argument(
     '--testdata',
     default="/dataset/test",
     help='path to test dataset',
 )
 # Training hyper-parameters
 parser.add_argument(
     '--epoch_size',
     type=int,
     default=1,
     help='how much epoch to train',
 )
 parser.add_argument(
     '--batch_size',
     type=int,
     default=256,
     help='how much batch_size in epoch',
 )
 def gettime():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d %H:%M:%S')
 if __name__ == '__main__':
    # Script entry point: train the model on MNIST for `--epoch_size` epochs,
    # evaluate on the test split after each epoch, and save a checkpoint
    # under /model.
    # parse_known_args keeps unrecognized flags in `unknown` instead of exiting.
    args, unknown = parser.parse_known_args()
    #log output
    print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epoch = args.epoch_size
    print(gettime(), 'epoch_size is:{}'.format(epoch))
    for _epoch in range(epoch):
        print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1))
        # ---- training pass ----
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            sgd.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                # Log the loss every 10th batch
                print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            sgd.step()
        # ---- evaluation pass ----
        correct = 0
        total = 0
        model.eval()
        # No gradients are needed for evaluation; skip autograd bookkeeping.
        with torch.no_grad():
            for test_x, test_label in test_loader:
                predict_y = model(test_x.to(device).float())
                # Bring predictions back to the CPU, where the labels live,
                # and compare with native tensor ops.
                predict_ys = predict_y.argmax(dim=-1).cpu()
                correct += (predict_ys == test_label).sum().item()
                total += test_label.shape[0]
        accuracy = correct / total
        print(gettime(), 'accuracy: {:.2f}'.format(accuracy))
        #The model output location is placed under /model
        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, accuracy))
        print("----------this is the end--------")
        # NOTE(review): `a` is not defined anywhere in the visible file, so this
        # line raises NameError right after the first epoch's checkpoint is
        # saved. The file name (train_fail3.py) suggests this is a deliberate
        # failing job, so it is left in place -- confirm before removing.
        print(a)