| @@ -0,0 +1,167 @@ | |||
#!/usr/bin/python
#coding=utf-8
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
The dataset used in this example is MnistDataset_torch.zip.
The dataset layout is:
MnistDataset_torch.zip
├── test
│   ├── MNIST/processed/test.pt
│   ├── MNIST/processed/training.pt
│   ├── MNIST/raw/train-images-idx3-ubyte
│   ├── MNIST/raw/train-labels-idx1-ubyte
│   ├── MNIST/raw/t10k-images-idx3-ubyte
│   └── MNIST/raw/t10k-labels-idx1-ubyte
├── train
│   ├── MNIST/processed/test.pt
│   ├── MNIST/processed/training.pt
│   ├── MNIST/raw/train-images-idx3-ubyte
│   ├── MNIST/raw/train-labels-idx1-ubyte
│   ├── MNIST/raw/t10k-images-idx3-ubyte
│   └── MNIST/raw/t10k-labels-idx1-ubyte
The pretrained model file used in this example is: mnist_epoch1_0.86.pkl
The code is automatically placed under the /tmp/code directory.
After being selected in the UI, the dataset is automatically placed under /tmp/dataset.
After being selected in the UI, the pretrained model file is automatically placed under /tmp/pretrainmodel.
Output model files must be placed under /tmp/output; the platform automatically downloads the files in /tmp/output.
If multiple datasets are selected, append the dataset name after /tmp/dataset, e.g. /tmp/dataset/MnistDataset_torch/train
'''
| import torch | |||
| from model import Model | |||
| import numpy as np | |||
| from torchvision.datasets import mnist | |||
| from torch.nn import CrossEntropyLoss | |||
| from torch.optim import SGD | |||
| from torch.utils.data import DataLoader | |||
| from torchvision.transforms import ToTensor | |||
| import argparse | |||
| import os | |||
| import importlib.util | |||
def is_torch_dtu_available():
    """Return True only when the full torch_dtu stack is importable.

    Probes parent-before-child so a missing top-level package short-circuits
    before `find_spec` is asked about its submodules (which would raise).
    """
    required = ("torch_dtu", "torch_dtu.core", "torch_dtu.core.dtu_model")
    return all(importlib.util.find_spec(name) is not None for name in required)
# Training settings: hyperparameters arrive as command-line flags.
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
for flag, default, text in (
        ('--epoch_size', 1, 'how much epoch to train'),
        ('--batch_size', 256, 'how much batch_size in epoch')):
    parser.add_argument(flag, type=int, default=default, help=text)
if __name__ == '__main__':
    # Parse known flags; extra platform-injected hyperparameters are ignored.
    args, unknown = parser.parse_known_args()

    # NOTE(review): prepare() is neither defined nor imported in this file —
    # presumably supplied by the platform runtime that stages the dataset and
    # pretrained model into the container. Confirm the import before running
    # this script outside the platform.
    openi_context = prepare()
    # Paths staged by the platform: dataset, pretrained model, and output dir.
    dataset_path = openi_context.dataset_path
    pretrain_model_path = openi_context.pretrain_model_path
    output_path = openi_context.output_path

    dataset_path_A = dataset_path + "/MnistDataset_torch"
    pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j"

    print("dataset_path:")
    print(os.listdir(dataset_path))
    print("pretrain_model_path:")
    print(os.listdir(pretrain_model_path))
    print("output_path:")
    print(os.listdir(output_path))

    # Device selection: DTU when the torch_dtu stack is present, else CUDA, else CPU.
    if is_torch_dtu_available():
        import torch_dtu
        import torch_dtu.distributed as dist
        import torch_dtu.core.dtu_model as dm
        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
        print('dtu is available: True')
        device = dm.dtu_device()
        DTU_FLAG = True
    else:
        print('dtu is available: False')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        DTU_FLAG = False

    # Build ONE model and ONE optimizer. The original created a second
    # model/optimizer pair ("sgd"), so the checkpoint's optimizer state was
    # restored into an optimizer that never stepped and resumed training
    # silently lost its momentum/state.
    model = Model().to(device)
    optimizer = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()

    batch_size = args.batch_size
    epochs = args.epoch_size
    print('epoch_size is:{}'.format(epochs))

    # The MNIST files are pre-staged on disk; download=False keeps the job offline.
    train_dataset = mnist.MNIST(root=dataset_path_A + "/train", train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=dataset_path_A + "/test", train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Resume from the staged checkpoint when present; otherwise train from scratch.
    checkpoint_file = pretrain_model_path_A + "/mnist_epoch1_0.70.pkl"
    if os.path.exists(checkpoint_file):
        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('加载 epoch {} 权重成功!'.format(start_epoch))
    else:
        start_epoch = 0
        print('无保存模型,将从头开始训练!')

    for _epoch in range(start_epoch, epochs):
        print('the {} epoch_size begin'.format(_epoch + 1))
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            optimizer.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            if DTU_FLAG:
                # torch_dtu requires stepping through its own helper (with barrier).
                dm.optimizer_step(optimizer, barrier=True)
            else:
                optimizer.step()

        # Evaluate on the test split after each epoch.
        correct = 0
        _sum = 0
        model.eval()
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            matches = predict_ys == test_label
            correct += np.sum(matches.numpy(), axis=-1)
            _sum += matches.shape[0]
        print('accuracy: {:.2f}'.format(correct / _sum))

        # Save into the platform output directory (previously hard-coded to
        # '/tmp/output'; output_path is that directory per the module docstring).
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': _epoch + 1}
        torch.save(state, output_path + '/mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, correct / _sum))

    print('test:')
    print(os.listdir(output_path))