Browse Source

更新代码,更换包名

pull/1/head
liuzx 2 years ago
parent
commit
fddd1915bc
4 changed files with 167 additions and 107 deletions
  1. +15
    -32
      gcu_mnist_example/train_gcu.py
  2. +17
    -14
      gpu_mnist_example/train_gpu.py
  3. +18
    -61
      npu_mnist_example/train_npu.py
  4. +117
    -0
      npu_mnist_example/train_npu_multi_card.py

+ 15
- 32
gcu_mnist_example/train_gcu.py View File

@@ -21,18 +21,8 @@ If there are Chinese comments in the code,please add at the beginning:
│ ├── MNIST/raw/train-images-idx3-ubyte
│ └── MNIST/raw/train-labels-idx1-ubyte
│ ├── MNIST/raw/t10k-images-idx3-ubyte
│ └── MNIST/raw/t10k-labels-idx1-ubyte



示例选用的预训练模型文件为:mnist_epoch1_0.86.pkl

│ └── MNIST/raw/t10k-labels-idx1-ubyte

代码会自动放置在/tmp/code目录下。
数据集在界面选择后,会自动放置在/tmp/dataset目录下。
预训练模型文件在界面选择后,会自动放置在/tmp/pretrainmodel目录下。
输出的模型文件也需要放置在/tmp/output目录下,平台会自动下载/tmp/output目录下的文件。
如果选用了多数据集,则应在/tmp/dataset后带上数据集名称,比如/tmp/dataset/MnistDataset_torch/train
'''

import torch
@@ -45,8 +35,8 @@ from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os
#导入openi
from openi.context import prepare, upload_openi
#导入c2net
from c2net.context import prepare, upload_output

import importlib.util

@@ -59,25 +49,18 @@ def is_torch_dtu_available():

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset')

parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')
parser.add_argument('--ckpt_url', default="", help='pretrain model path')
parser.add_argument('--pretrainmodel', default="/tmp/pretrainmodel/mnist_epoch1_0.86.pkl", help='pretrain model path')


if __name__ == '__main__':
args, unknown = parser.parse_known_args()
#初始化导入数据集和预训练模型到容器内
openi_context = prepare()

#获取数据集路径,预训练模型路径,输出路径
dataset_path = openi_context.dataset_path
pretrain_model_path = openi_context.pretrain_model_path
output_path = openi_context.output_path
#初始化导入数据集和预训练模型到容器内
c2net_context = prepare()
#获取数据集路径
MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
#获取预训练模型路径
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"

# load DPU envs-xx.sh
DTU_FLAG = True
@@ -101,8 +84,8 @@ if __name__ == '__main__':
args, unknown = parser.parse_known_args()
#log output
batch_size = args.batch_size
train_dataset = mnist.MNIST(root=os.path.join(dataset_path, "train"), train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=os.path.join(dataset_path, "test"), train=False, transform=ToTensor(),download=False)
train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
@@ -112,8 +95,8 @@ if __name__ == '__main__':
print('epoch_size is:{}'.format(epochs))

# 如果有保存的模型,则加载模型,并在其基础上继续训练
if os.path.exists(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")):
checkpoint = torch.load(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl"))
if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")):
checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl"))
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch']
@@ -156,5 +139,5 @@ if __name__ == '__main__':
print('accuracy: {:.2f}'.format(correct / _sum))
#The model output location is placed under /tmp/output
state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1}
torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(output_path, _epoch+1, correct / _sum))
print(os.listdir('{}'.format(output_path)))
torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum))
print(os.listdir('{}'.format(c2net_context.output_path)))

+ 17
- 14
gpu_mnist_example/train_gpu.py View File

@@ -23,8 +23,8 @@ from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os
#导入openi
from openi.context import prepare, upload_openi
#导入c2net
from c2net.context import prepare, upload_output

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
@@ -76,27 +76,28 @@ def test(model, test_loader, test_data):

if __name__ == '__main__':
args, unknown = parser.parse_known_args()
#初始化导入数据集和预训练模型到容器内
openi_context = prepare()

#获取数据集路径,预训练模型路径,输出路径
dataset_path = openi_context.dataset_path
pretrain_model_path = openi_context.pretrain_model_path
output_path = openi_context.output_path
#初始化导入数据集和预训练模型到容器内
c2net_context = prepare()
#获取数据集路径
checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875"
MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
#获取预训练模型路径
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"

#log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
epochs = args.epoch_size
train_dataset = mnist.MNIST(root=os.path.join(dataset_path, "train"), train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=os.path.join(dataset_path, "test"), train=False, transform=ToTensor(),download=False)
train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

#如果有保存的模型,则加载模型,并在其基础上继续训练
if os.path.exists(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")):
checkpoint = torch.load(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl"))
if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")):
checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl"))
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch']
@@ -108,8 +109,10 @@ if __name__ == '__main__':
for epoch in range(start_epoch+1, epochs):
train(model, train_loader, epoch)
test(model, test_loader, test_dataset)
# 保存模型
# 将模型保存到c2net_context.output_path
state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
torch.save(state, '{}/mnist_epoch{}.pkl'.format(output_path, epoch))
torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch))
#回传结果
upload_output()



+ 18
- 61
npu_mnist_example/train_npu.py View File

@@ -13,24 +13,22 @@
使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用openi的python sdk包
2、用户需要调用c2net的python sdk包
"""

import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from dataset_distributed import create_dataset_parallel
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank
import time
#导入openi
from openi.context import prepare, upload_openi
#导入c2net包
from c2net.context import prepare, upload_output


parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
@@ -50,59 +48,22 @@ parser.add_argument('--epoch_size',
if __name__ == "__main__":
###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
args, unknown = parser.parse_known_args()
data_dir = ''
pretrain_dir = ''
train_dir = ''
#回传结果到openi
upload_openi()
#初始化导入数据集和预训练模型到容器内
c2net_context = prepare()
#获取数据集路径
mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData"
#获取预训练模型路径
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
device_num = int(os.getenv('RANK_SIZE'))
#使用单卡时
if device_num == 1:
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
#初始化导入数据集和预训练模型到容器内
openi_context = prepare()
data_dir = openi_context.dataset_path
pretrain_dir = openi_context.pretrain_model_path
train_dir = openi_context.output_path
#使用数据集的方式
ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size)
#使用多卡时
if device_num > 1:
# set device_id and init for multi-card training
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
init()
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data
local_rank=int(os.getenv('RANK_ID'))
if local_rank%8==0:
###初始化导入数据集和预训练模型到容器内
openi_context = prepare()
#初始化导入数据集和预训练模型到容器内
openi_context = prepare()
data_dir = openi_context.dataset_path
pretrain_dir = openi_context.pretrain_model_path
train_dir = openi_context.output_path
#Set a cache file to determine whether the data has been copied to obs.
#If this file exists during multi-card training, there is no need to copy the dataset multiple times.
f = open("/cache/download_input.txt", 'w')
f.close()
try:
if os.path.exists("/cache/download_input.txt"):
print("download_input succeed")
except Exception as e:
print("download_input failed")
while not os.path.exists("/cache/download_input.txt"):
time.sleep(1)
ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size)

context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
#使用数据集的方式
ds_train = create_dataset(os.path.join(mnistdata_path + "/MNISTData", "train"), cfg.batch_size)
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt")))
if args.device_target != "Ascend":
model = Model(network,
net_loss,
@@ -118,12 +79,8 @@ if __name__ == "__main__":
config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
outputDirectory = train_dir + "/"
if device_num > 1:
outputDirectory = train_dir + "/" + str(get_rank()) + "/"
#将模型保存到c2net_context.output_path
outputDirectory = c2net_context.output_path + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)
@@ -134,5 +91,5 @@ if __name__ == "__main__":
print('epoch_size is: ', epoch_size)
model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()])

###上传训练结果到启智平台,注意必须将要输出的模型存储在openi_context.output_path
upload_openi()
###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path
upload_output()

+ 117
- 0
npu_mnist_example/train_npu_multi_card.py View File

@@ -0,0 +1,117 @@


"""
示例选用的数据集是MNISTData.zip
数据集结构是:
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用c2net的python sdk包
"""

import os
import argparse
from config import mnist_cfg as cfg
from dataset_distributed import create_dataset_parallel
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank
import time
#import the c2net sdk (package renamed from openi to c2net)
from c2net.context import prepare, upload_output


# Command-line interface; unknown platform-injected flags (e.g. --ckpt_url)
# are tolerated via parse_known_args() in the main block.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument(
    '--device_target', type=str, default="Ascend", choices=['Ascend', 'CPU'],
    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')
parser.add_argument(
    '--epoch_size', type=int, default=5,
    help='Training epochs.')

if __name__ == "__main__":
    # parse_known_args() (not parse_args()) so platform-injected flags such as
    # --ckpt_url do not abort the run.
    args, unknown = parser.parse_known_args()
    device_num = int(os.getenv('RANK_SIZE'))

    # Multi-card setup: bind this process to its Ascend device and enable
    # data-parallel training across all RANK_SIZE devices.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target,
                        device_id=int(os.getenv('ASCEND_DEVICE_ID')))
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=device_num,
                                      parallel_mode=ParallelMode.DATA_PARALLEL,
                                      gradients_mean=True,
                                      parameter_broadcast=True)
    init()

    # Staging the dataset/model only needs to happen once per node: rank 0
    # downloads, then drops a flag file that the other ranks wait on.
    local_rank = int(os.getenv('RANK_ID'))
    if local_rank % 8 == 0:
        # Stage datasets and the pretrained model into the container.
        c2net_context = prepare()
        # Signal the sibling ranks that the input data is in place.
        with open("/cache/download_input.txt", 'w'):
            pass
        # os.path.exists() cannot raise here, so the original try/except
        # ("download_input failed") was dead code and has been removed.
        print("download_input succeed")
    else:
        while not os.path.exists("/cache/download_input.txt"):
            time.sleep(1)
        # BUG FIX: the original assigned c2net_context (and the paths below)
        # only on rank 0, so every other rank crashed with a NameError at
        # create_dataset_parallel(). prepare() here resolves the already
        # staged paths for this rank.
        # NOTE(review): assumes prepare() is idempotent/cheap once the data
        # has been staged — confirm against the c2net SDK.
        c2net_context = prepare()

    # Dataset and pretrained-model locations inside the container.
    mnistdata_path = c2net_context.dataset_path + "/" + "MNISTData"
    mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path + "/" + "MNIST_Example_test2_model_djts"

    # Sharded dataset: each rank reads its own slice of MNIST train data.
    ds_train = create_dataset_parallel(os.path.join(mnistdata_path, "train"), cfg.batch_size)

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    # Warm-start from the staged LeNet checkpoint.
    load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt")))
    if args.device_target != "Ascend":
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"accuracy"})
    else:
        # O2 mixed precision is only supported on Ascend.
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"accuracy"},
                      amp_level="O2")

    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    # ModelCheckpoint saves on every card; get_rank() keeps the per-card
    # output directories distinct under c2net_context.output_path.
    outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/"
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=outputDirectory,
                                 config=config_ck)
    print("============== Starting Training ==============")
    # --epoch_size (when non-zero) overrides the config default.
    epoch_size = cfg['epoch_size']
    if (args.epoch_size):
        epoch_size = args.epoch_size
    print('epoch_size is: ', epoch_size)
    model.train(epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()])

    # Upload results to the platform; outputs MUST live under
    # c2net_context.output_path to be collected.
    upload_output()

Loading…
Cancel
Save