Browse Source

更新

liuzx-patch-1
liuzx 2 years ago
parent
commit
d9ee5a2e23
5 changed files with 10 additions and 174 deletions
  1. +0
    -4
      gcu_mnist_example/gpu_mnist_example/train_gcu.py
  2. +0
    -166
      gpu_mnist_example/train_gcu.py
  3. +1
    -0
      gpu_mnist_example/train_gpu.py
  4. +0
    -4
      npu_mnist_example/train_npu.py
  5. +9
    -0
      train.py

+ 0
- 4
gcu_mnist_example/gpu_mnist_example/train_gcu.py View File

@@ -30,7 +30,6 @@ If there are Chinese comments in the code,please add at the beginning:
''' '''


import os import os
print("begin:")
os.system("pip uninstall openi-test") os.system("pip uninstall openi-test")
os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))
import torch import torch
@@ -75,15 +74,12 @@ if __name__ == '__main__':
pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j" pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j"


print("dataset_path:") print("dataset_path:")
print(os.listdir(dataset_path))
os.listdir(dataset_path) os.listdir(dataset_path)
print("pretrain_model_path:") print("pretrain_model_path:")
print(os.listdir(pretrain_model_path))
os.listdir(pretrain_model_path) os.listdir(pretrain_model_path)


print("output_path:") print("output_path:")
print(os.listdir(output_path))
os.listdir(output_path) os.listdir(output_path)
# load DPU envs-xx.sh # load DPU envs-xx.sh
DTU_FLAG = True DTU_FLAG = True


+ 0
- 166
gpu_mnist_example/train_gcu.py View File

@@ -1,166 +0,0 @@
#!/usr/bin/python
#coding=utf-8
'''
If there are Chinese comments in the code,please add at the beginning:
#!/usr/bin/python
#coding=utf-8

示例选用的数据集是MnistDataset_torch.zip
数据集结构是:
MnistDataset_torch.zip
├── test
│ ├── MNIST/processed/test.pt
│ └── MNIST/processed/training.pt
│ ├── MNIST/raw/train-images-idx3-ubyte
│ └── MNIST/raw/train-labels-idx1-ubyte
│ ├── MNIST/raw/t10k-images-idx3-ubyte
│ └── MNIST/raw/t10k-labels-idx1-ubyte
├── train
│ ├── MNIST/processed/test.pt
│ └── MNIST/processed/training.pt
│ ├── MNIST/raw/train-images-idx3-ubyte
│ └── MNIST/raw/train-labels-idx1-ubyte
│ ├── MNIST/raw/t10k-images-idx3-ubyte
│ └── MNIST/raw/t10k-labels-idx1-ubyte



示例选用的预训练模型文件为:mnist_epoch1_0.86.pkl

'''

import os
# Replace any installed `openi-test` package with the package that the
# OPENI_SDK_PATH environment variable points to. This runs before the
# `openi.context` import further down in this file.
print("begin:")
# -y: without it pip asks for interactive confirmation, which a
# non-interactive training job cannot answer.
os.system("pip uninstall -y openi-test")
_openi_sdk_path = os.getenv("OPENI_SDK_PATH")
if _openi_sdk_path:
    os.system("pip install {}".format(_openi_sdk_path))
else:
    # Formatting an unset variable would run `pip install None`, i.e. ask pip
    # for an unrelated package literally named "None".
    print("OPENI_SDK_PATH is not set; skipping OpenI SDK installation")
import torch
from model import Model
import numpy as np
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
from openi.context import prepare, upload_openi

import importlib.util

def is_torch_dtu_available():
    """Report whether the ``torch_dtu.core.dtu_model`` module can be located.

    The lookup walks ``torch_dtu``, ``torch_dtu.core`` and
    ``torch_dtu.core.dtu_model`` in that order and stops at the first name
    that cannot be found.

    Returns:
        bool: True only if a module spec is found for all three names,
        False otherwise (including when the lookup itself fails).
    """
    for module_name in ("torch_dtu", "torch_dtu.core", "torch_dtu.core.dtu_model"):
        try:
            if importlib.util.find_spec(module_name) is None:
                return False
        except (ImportError, ValueError):
            # find_spec imports the parent of a dotted name and raises
            # ModuleNotFoundError when that parent is not a package, or
            # ValueError when a cached module has no usable __spec__.
            # Either way the DTU backend cannot be used.
            return False
    return True

# Command-line options for the training run; both are integers with defaults.
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
for _flag, _default, _help in (
    ('--epoch_size', 1, 'how much epoch to train'),
    ('--batch_size', 256, 'how much batch_size in epoch'),
):
    parser.add_argument(_flag, type=int, default=_default, help=_help)
# Drop the loop temporaries so the module namespace matches the original.
del _flag, _default, _help


if __name__ == '__main__':
    # Train the MNIST model, evaluate it after every epoch and write one
    # checkpoint per epoch under /tmp/output.

    # Parse only the known options so that extra hyperparameters passed on
    # the command line do not abort the run.
    args, unknown = parser.parse_known_args()
    # Per the original comment: initialise and import the dataset and the
    # pretrained model into the container.
    openi_context = prepare()

    # Dataset path, pretrained-model path and output path.
    dataset_path = openi_context.dataset_path
    pretrain_model_path = openi_context.pretrain_model_path
    output_path = openi_context.output_path

    dataset_path_A = dataset_path + "/MnistDataset"
    pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j"
    # NOTE(review): the module docstring names mnist_epoch1_0.86.pkl as the
    # pretrained file, but the code looks for 0.70 -- confirm which is intended.
    checkpoint_file = pretrain_model_path_A + "/mnist_epoch1_0.70.pkl"

    print("dataset_path:")
    print(os.listdir(dataset_path))
    print("pretrain_model_path:")
    print(os.listdir(pretrain_model_path))
    print("output_path:")
    print(os.listdir(output_path))

    # Pick the device: DTU when torch_dtu can be located, otherwise CUDA/CPU.
    DTU_FLAG = is_torch_dtu_available()
    if DTU_FLAG:
        # All four imports are kept from the original even though only `dm`
        # is referenced below; whether the others are needed for import-time
        # side effects cannot be told from this file.
        import torch_dtu
        import torch_dtu.distributed as dist
        import torch_dtu.core.dtu_model as dm
        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
        print('dtu is available: True')
        device = dm.dtu_device()
    else:
        print('dtu is available: False')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=dataset_path_A + "/train", train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=dataset_path_A + "/test", train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # One model and one optimizer. The original built each twice, then
    # restored and saved the optimizer that belonged to the discarded model,
    # so the optimizer actually used for training was never resumed or
    # checkpointed.
    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epochs = args.epoch_size
    print('epoch_size is:{}'.format(epochs))

    # Resume from the saved checkpoint when it exists.
    if os.path.exists(checkpoint_file):
        # map_location="cpu" lets a checkpoint written on another device be
        # read here; load_state_dict then copies into the model's own device.
        # NOTE(review): torch.load unpickles the file -- only load checkpoints
        # from a trusted source.
        checkpoint = torch.load(checkpoint_file, map_location="cpu")
        model.load_state_dict(checkpoint['model'])
        sgd.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('加载 epoch {} 权重成功!'.format(start_epoch))
    else:
        start_epoch = 0
        print('无保存模型,将从头开始训练!')

    for _epoch in range(start_epoch, epochs):
        print('the {} epoch_size begin'.format(_epoch + 1))
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            sgd.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            if DTU_FLAG:
                # `dm` is only bound when the DTU branch above was taken.
                dm.optimizer_step(sgd, barrier=True)
            else:
                sgd.step()

        # Evaluate on the test split; no gradients are needed here.
        correct = 0
        _sum = 0
        model.eval()
        with torch.no_grad():
            for test_x, test_label in test_loader:
                predict_y = model(test_x.to(device).float())
                # Move to the CPU before comparing, as the original did;
                # the labels stay on the CPU.
                predict_ys = predict_y.cpu().argmax(dim=-1)
                correct += (predict_ys == test_label).sum().item()
                _sum += test_label.shape[0]
        print('accuracy: {:.2f}'.format(correct / _sum))

        # The model output location is placed under /tmp/output
        state = {'model': model.state_dict(), 'optimizer': sgd.state_dict(), 'epoch': _epoch + 1}
        torch.save(state, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, correct / _sum))

    print('test:')
    print(os.listdir("/tmp/output"))

+ 1
- 0
gpu_mnist_example/train_gpu.py View File

@@ -12,6 +12,7 @@ If there are Chinese comments in the code,please add at the beginning:


''' '''
import os import os
os.system("pip install openi-test")
os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))


from model import Model from model import Model


+ 0
- 4
npu_mnist_example/train_npu.py View File

@@ -33,10 +33,6 @@ from mindspore.communication.management import init, get_rank
import time import time
#导入openi包 #导入openi包
from openi.context import prepare, upload_openi from openi.context import prepare, upload_openi
print("hi:")
print(os.listdir("/home/work/user-job-dir/code"))
os.listdir("/home/work/user-job-dir/code")



parser = argparse.ArgumentParser(description='MindSpore Lenet Example') parser = argparse.ArgumentParser(description='MindSpore Lenet Example')


+ 9
- 0
train.py View File

@@ -12,5 +12,14 @@ dataset_path = openi_context.dataset_path
pretrain_model_path = openi_context.pretrain_model_path pretrain_model_path = openi_context.pretrain_model_path
output_path = openi_context.output_path output_path = openi_context.output_path


print("dataset_path:")
os.listdir(dataset_path)

print("pretrain_model_path:")
os.listdir(pretrain_model_path)

print("output_path:")
os.listdir(output_path)

#回传结果到openi #回传结果到openi
upload_openi() upload_openi()

Loading…
Cancel
Save