diff --git a/gcu_mnist_example/gpu_mnist_example/train_gcu.py b/gcu_mnist_example/gpu_mnist_example/train_gcu.py index 0050dbf..51f4c27 100644 --- a/gcu_mnist_example/gpu_mnist_example/train_gcu.py +++ b/gcu_mnist_example/gpu_mnist_example/train_gcu.py @@ -30,7 +30,6 @@ If there are Chinese comments in the code,please add at the beginning: ''' import os -print("begin:") os.system("pip uninstall openi-test") os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) import torch @@ -75,15 +74,12 @@ if __name__ == '__main__': pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j" print("dataset_path:") - print(os.listdir(dataset_path)) os.listdir(dataset_path) print("pretrain_model_path:") - print(os.listdir(pretrain_model_path)) os.listdir(pretrain_model_path) print("output_path:") - print(os.listdir(output_path)) os.listdir(output_path) # load DPU envs-xx.sh DTU_FLAG = True diff --git a/gpu_mnist_example/train_gcu.py b/gpu_mnist_example/train_gcu.py deleted file mode 100644 index 1b4322c..0000000 --- a/gpu_mnist_example/train_gcu.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/python -#coding=utf-8 -''' -If there are Chinese comments in the code,please add at the beginning: -#!/usr/bin/python -#coding=utf-8 - -示例选用的数据集是MnistDataset_torch.zip -数据集结构是: - MnistDataset_torch.zip - ├── test - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte - ├── train - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte - - - -示例选用的预训练模型文件为:mnist_epoch1_0.86.pkl - -''' - -import os -print("begin:") -os.system("pip uninstall openi-test") -os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) -import torch -from model import Model -import numpy as np -from torchvision.datasets import mnist -from torch.nn import CrossEntropyLoss -from torch.optim import SGD -from torch.utils.data import DataLoader -from torchvision.transforms import ToTensor -import argparse -from openi.context import prepare, upload_openi - -import importlib.util - -def is_torch_dtu_available(): - if importlib.util.find_spec("torch_dtu") is None: - return False - if importlib.util.find_spec("torch_dtu.core") is None: - return False - return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') -parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') - - -if __name__ == '__main__': - #获取参数并忽略超参数报错 - args, unknown = parser.parse_known_args() - - #初始化导入数据集和预训练模型到容器内 - openi_context = prepare() - - #获取数据集路径,预训练模型路径,输出路径 - dataset_path = openi_context.dataset_path - pretrain_model_path = openi_context.pretrain_model_path - output_path = openi_context.output_path - - dataset_path_A = dataset_path + "/MnistDataset" - pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j" - - print("dataset_path:") - print(os.listdir(dataset_path)) - os.listdir(dataset_path) - - print("pretrain_model_path:") - print(os.listdir(pretrain_model_path)) - os.listdir(pretrain_model_path) - - print("output_path:") - print(os.listdir(output_path)) - os.listdir(output_path) - # load DPU envs-xx.sh - DTU_FLAG = True - if is_torch_dtu_available(): - import torch_dtu - import torch_dtu.distributed as dist - import torch_dtu.core.dtu_model as dm - from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP - print('dtu is available: True') - device = dm.dtu_device() - DTU_FLAG = True - else: - print('dtu is available: False') - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - DTU_FLAG = False - - - # 参数声明 - model = Model().to(device) - optimizer = SGD(model.parameters(), lr=1e-1) - #log output - batch_size = args.batch_size - train_dataset = mnist.MNIST(root=dataset_path_A + "/train", train=True, transform=ToTensor(),download=False) - test_dataset = mnist.MNIST(root=dataset_path_A + "/test", train=False, transform=ToTensor(),download=False) - train_loader = DataLoader(train_dataset, batch_size=batch_size) - test_loader = DataLoader(test_dataset, batch_size=batch_size) - model = Model().to(device) - sgd = SGD(model.parameters(), lr=1e-1) - cost = CrossEntropyLoss() - epochs = args.epoch_size - print('epoch_size is:{}'.format(epochs)) - - # 如果有保存的模型,则加载模型,并在其基础上继续训练 - if os.path.exists(pretrain_model_path_A+"/mnist_epoch1_0.70.pkl"): - checkpoint = torch.load(pretrain_model_path_A+"/mnist_epoch1_0.70.pkl") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - start_epoch = checkpoint['epoch'] - print('加载 epoch {} 权重成功!'.format(start_epoch)) - else: - start_epoch = 0 - print('无保存模型,将从头开始训练!') - - for _epoch in range(start_epoch, epochs): - print('the {} epoch_size begin'.format(_epoch + 1)) - model.train() - for idx, (train_x, train_label) in enumerate(train_loader): - train_x = train_x.to(device) - train_label = train_label.to(device) - label_np = np.zeros((train_label.shape[0], 10)) - sgd.zero_grad() - predict_y = model(train_x.float()) - loss = cost(predict_y, train_label.long()) - if idx % 10 == 0: - print('idx: {}, loss: {}'.format(idx, loss.sum().item())) - loss.backward() - if DTU_FLAG: - dm.optimizer_step(sgd, barrier=True) - else: - sgd.step() - - - correct = 0 - _sum = 0 - model.eval() - for idx, (test_x, test_label) in enumerate(test_loader): - test_x = test_x - test_label = test_label - predict_y = model(test_x.to(device).float()).detach() - predict_ys = np.argmax(predict_y.cpu(), axis=-1) - label_np = test_label.numpy() - _ = predict_ys == test_label - correct += np.sum(_.numpy(), axis=-1) - _sum += _.shape[0] - print('accuracy: {:.2f}'.format(correct / _sum)) - #The model output location is placed under /tmp/output - state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1} - torch.save(state, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) - print('test:') - print(os.listdir("/tmp/output")) \ No newline at end of file diff --git a/gpu_mnist_example/train_gpu.py b/gpu_mnist_example/train_gpu.py index 5964908..3abe746 100644 --- a/gpu_mnist_example/train_gpu.py +++ b/gpu_mnist_example/train_gpu.py @@ -12,6 +12,7 @@ If there are Chinese comments in the code,please add at the beginning: ''' import os +os.system("pip install openi-test") os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) from model import Model diff --git a/npu_mnist_example/train_npu.py b/npu_mnist_example/train_npu.py index d8201f4..59d4608 100644 --- a/npu_mnist_example/train_npu.py +++ b/npu_mnist_example/train_npu.py @@ -33,10 +33,6 @@ from mindspore.communication.management import init, get_rank import time #导入openi包 from openi.context import prepare, upload_openi -print("hi:") -print(os.listdir("/home/work/user-job-dir/code")) -os.listdir("/home/work/user-job-dir/code") - parser = argparse.ArgumentParser(description='MindSpore Lenet Example') diff --git a/train.py b/train.py index 4696bb3..b4d0faa 100644 --- a/train.py +++ b/train.py @@ -12,5 +12,14 @@ dataset_path = openi_context.dataset_path pretrain_model_path = openi_context.pretrain_model_path output_path = openi_context.output_path +print("dataset_path:") +os.listdir(dataset_path) + +print("pretrain_model_path:") +os.listdir(pretrain_model_path) + +print("output_path:") +os.listdir(output_path) + #回传结果到openi upload_openi() \ No newline at end of file