#!/usr/bin/python
# coding=utf-8
"""MNIST training example for the OpenI platform (GPU or Enflame GCU/DTU).

The selected dataset is MnistDataset_torch.zip, laid out as:

    MnistDataset_torch.zip
    ├── test
    │   ├── MNIST/processed/{test.pt, training.pt}
    │   └── MNIST/raw/{train-images-idx3-ubyte, train-labels-idx1-ubyte,
    │                  t10k-images-idx3-ubyte, t10k-labels-idx1-ubyte}
    └── train
        └── (same MNIST/processed and MNIST/raw structure as test/)

The example pretrained model file is mnist_epoch1_0.86.pkl.

Platform conventions:
  * Code is automatically placed under /tmp/code.
  * Datasets selected in the UI are placed under /tmp/dataset; with multiple
    datasets the dataset name is appended, e.g. /tmp/dataset/MnistDataset_torch/train.
  * Pretrained model files selected in the UI are placed under /tmp/pretrainmodel.
  * Output model files must be written to /tmp/output; the platform
    automatically downloads everything under that directory.
"""

import argparse
import importlib.util
import os

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.datasets import mnist
from torchvision.transforms import ToTensor

from model import Model


def is_torch_dtu_available():
    """Return True when the Enflame torch_dtu extension is importable.

    All three specs must resolve (package, core subpackage, and the
    dtu_model module actually used below) before we commit to the DTU path.
    """
    for mod_name in ("torch_dtu", "torch_dtu.core", "torch_dtu.core.dtu_model"):
        if importlib.util.find_spec(mod_name) is None:
            return False
    return True


# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')


if __name__ == '__main__':
    # Parse known args only, ignoring extra platform-injected hyper-parameters.
    args, unknown = parser.parse_known_args()

    # NOTE(review): prepare() is neither defined nor imported in this file --
    # presumably it is provided by the platform SDK at runtime; confirm the
    # import that supplies it. It stages the selected dataset / pretrained
    # model into the container and returns an object exposing the paths below.
    openi_context = prepare()

    # Dataset path, pretrained-model path and output path from the platform.
    dataset_path = openi_context.dataset_path
    pretrain_model_path = openi_context.pretrain_model_path
    output_path = openi_context.output_path

    dataset_path_A = dataset_path + "/MnistDataset_torch"
    pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j"

    print("dataset_path:")
    print(os.listdir(dataset_path))
    print("pretrain_model_path:")
    print(os.listdir(pretrain_model_path))
    print("output_path:")
    print(os.listdir(output_path))

    # Device selection: Enflame DTU when the extension is present, otherwise
    # CUDA when available, otherwise CPU.
    if is_torch_dtu_available():
        import torch_dtu
        import torch_dtu.distributed as dist
        import torch_dtu.core.dtu_model as dm
        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
        print('dtu is available: True')
        device = dm.dtu_device()
        DTU_FLAG = True
    else:
        print('dtu is available: False')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        DTU_FLAG = False

    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=dataset_path_A + "/train", train=True,
                                transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=dataset_path_A + "/test", train=False,
                               transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # BUG FIX: the original constructed the model twice and created two SGD
    # optimizers ("optimizer" and "sgd"); the checkpoint was loaded into
    # "optimizer" while training stepped "sgd", so resumed optimizer state
    # was silently ignored. A single model/optimizer pair is used throughout.
    model = Model().to(device)
    optimizer = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epochs = args.epoch_size
    print('epoch_size is:{}'.format(epochs))

    # Resume from a saved checkpoint when one is present.
    # NOTE(review): the module docstring names mnist_epoch1_0.86.pkl but the
    # code probes mnist_epoch1_0.70.pkl -- confirm which artifact ships.
    checkpoint_file = pretrain_model_path_A + "/mnist_epoch1_0.70.pkl"
    if os.path.exists(checkpoint_file):
        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('加载 epoch {} 权重成功!'.format(start_epoch))
    else:
        start_epoch = 0
        print('无保存模型,将从头开始训练!')

    for _epoch in range(start_epoch, epochs):
        print('the {} epoch_size begin'.format(_epoch + 1))
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            optimizer.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            if DTU_FLAG:
                # On DTU the step must go through the dtu_model helper so the
                # device graph is flushed (barrier=True syncs before returning).
                dm.optimizer_step(optimizer, barrier=True)
            else:
                optimizer.step()

        # Evaluate on the test split after each epoch.
        correct = 0
        _sum = 0
        model.eval()
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            hits = predict_ys == test_label
            correct += np.sum(hits.numpy(), axis=-1)
            _sum += hits.shape[0]
        accuracy = correct / _sum
        print('accuracy: {:.2f}'.format(accuracy))

        # The model output is placed under /tmp/output; the platform
        # downloads everything in that directory automatically.
        # ROBUSTNESS FIX: create the directory first so torch.save cannot
        # fail with FileNotFoundError on a fresh container.
        os.makedirs('/tmp/output', exist_ok=True)
        state = {'model': model.state_dict(),
                 'optimizer': optimizer.state_dict(),
                 'epoch': _epoch + 1}
        torch.save(state, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, accuracy))
        print('test:')
        print(os.listdir("/tmp/output"))