Browse Source

fix bug

pull/3/head
liuzx 2 years ago
parent
commit
cc4be88033
9 changed files with 144 additions and 25 deletions
  1. +5
    -13
      gcu_mnist_example/train.py
  2. +6
    -1
      gpgpu_mnist_example/inference.py
  3. +6
    -1
      gpu_mnist_example/inference.py
  4. +5
    -1
      gpu_mnist_example/train.py
  5. +1
    -0
      npu_mnist_example/README.md
  6. +0
    -3
      npu_mnist_example/read_imagenet.py
  7. +4
    -0
      npu_mnist_example/train.py
  8. +115
    -0
      npu_mnist_example/train_epoch_upload.py
  9. +2
    -6
      npu_mnist_example/train_multi_card.py

+ 5
- 13
gcu_mnist_example/train.py View File

@@ -9,19 +9,11 @@ If there are Chinese comments in the code,please add at the beginning:
数据集结构是: 数据集结构是:
MnistDataset_torch.zip MnistDataset_torch.zip
├── test ├── test
│ ├── MNIST/processed/test.pt
│ └── MNIST/processed/training.pt
│ ├── MNIST/raw/train-images-idx3-ubyte
│ └── MNIST/raw/train-labels-idx1-ubyte
│ ├── MNIST/raw/t10k-images-idx3-ubyte
│ └── MNIST/raw/t10k-labels-idx1-ubyte
├── train
│ ├── MNIST/processed/test.pt
│ └── MNIST/processed/training.pt
│ ├── MNIST/raw/train-images-idx3-ubyte
│ └── MNIST/raw/train-labels-idx1-ubyte
│ ├── MNIST/raw/t10k-images-idx3-ubyte
│ └── MNIST/raw/t10k-labels-idx1-ubyte
└── train

预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl


''' '''




+ 6
- 1
gpgpu_mnist_example/inference.py View File

@@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning:
#!/usr/bin/python #!/usr/bin/python
#coding=utf-8 #coding=utf-8


1,The dataset structure of the single-dataset in this example
示例选用的数据集是MnistDataset_torch.zip
数据集结构是:
MnistDataset_torch.zip MnistDataset_torch.zip
├── test ├── test
└── train └── train


预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl

''' '''
from model import Model from model import Model
import numpy as np import numpy as np


+ 6
- 1
gpu_mnist_example/inference.py View File

@@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning:
#!/usr/bin/python #!/usr/bin/python
#coding=utf-8 #coding=utf-8


1,The dataset structure of the single-dataset in this example
示例选用的数据集是MnistDataset_torch.zip
数据集结构是:
MnistDataset_torch.zip MnistDataset_torch.zip
├── test ├── test
└── train └── train


预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl

''' '''
from model import Model from model import Model
import numpy as np import numpy as np


+ 5
- 1
gpu_mnist_example/train.py View File

@@ -5,11 +5,15 @@ If there are Chinese comments in the code,please add at the beginning:
#!/usr/bin/python #!/usr/bin/python
#coding=utf-8 #coding=utf-8


1,The dataset structure of the single-dataset in this example
数据集结构是:
MnistDataset_torch.zip MnistDataset_torch.zip
├── test ├── test
└── train └── train


预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl

''' '''






+ 1
- 0
npu_mnist_example/README.md View File

@@ -85,6 +85,7 @@ upload_output()


- 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释 - 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释
- 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释 - 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释
- 训练任务在每个epoch结束后就上传文件,可参考[train_epoch_upload.py](./train_epoch_upload.py)的代码注释
- 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释 - 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释
- 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像,具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释 - 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像,具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释
- 继续训练示例参考示例中[train_continue.py](./train_continue.py)的代码注释 - 继续训练示例参考示例中[train_continue.py](./train_continue.py)的代码注释


+ 0
- 3
npu_mnist_example/read_imagenet.py View File

@@ -39,9 +39,6 @@ import mindspore.dataset.vision.c_transforms as transforms
from c2net.context import upload_output from c2net.context import upload_output


parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example') parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example')
parser.add_argument('--train_url',
help='output folder to save/load',
default= '/cache/output/')


if __name__ == "__main__": if __name__ == "__main__":
args, unknown = parser.parse_known_args() args, unknown = parser.parse_known_args()


+ 4
- 0
npu_mnist_example/train.py View File

@@ -11,6 +11,10 @@
├── train-images-idx3-ubyte ├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte └── train-labels-idx1-ubyte
模型文件夹结构是:
Mindspore_MNIST_Example_Model
├── checkpoint_lenet-1_1875.ckpt
使用注意事项: 使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用c2net的python sdk包 2、用户需要调用c2net的python sdk包


+ 115
- 0
npu_mnist_example/train_epoch_upload.py View File

@@ -0,0 +1,115 @@


"""
示例选用的数据集是MnistDataset_mindspore.zip
数据集结构是:
MnistDataset_mindspore.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte

模型文件夹结构是:
Mindspore_MNIST_Example_Model
├── checkpoint_lenet-1_1875.ckpt
使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用c2net的python sdk包
"""

import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.train.callback import Callback
#导入c2net包
from c2net.context import prepare, upload_output

class EnvToOpenIEpochEnd(Callback):
    """Training callback that calls ``upload_output()`` at the end of every epoch."""

    def epoch_end(self, run_context):
        # run_context is passed in by the training loop; it is not used here.
        upload_output()


# Command-line interface for the LeNet training example.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

# Backend to run on; only the two values listed in `choices` are accepted.
parser.add_argument(
    '--device_target',
    choices=['Ascend', 'CPU'],
    default="Ascend",
    type=str,
    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU',
)

# Number of epochs to train for.
parser.add_argument(
    '--epoch_size',
    default=5,
    type=int,
    help='Training epochs.',
)

if __name__ == "__main__":
    # Use parse_known_args() so that extra platform-injected arguments
    # (e.g. --ckpt_url) are ignored instead of raising an error.
    args, unknown = parser.parse_known_args()

    # Initialise c2net: brings the dataset and the pretrained model into the container.
    c2net_context = prepare()
    # Dataset location.
    MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
    # Pretrained model location.
    Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"
    # Output location.
    output_path = c2net_context.output_path

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

    # Build the training dataset, network, loss and optimizer.
    ds_train = create_dataset(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size)
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

    # Start from the pretrained checkpoint.
    load_param_into_net(
        network,
        load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))

    # amp_level="O2" is only passed when the target device is Ascend.
    model_kwargs = {"metrics": {"accuracy"}}
    if args.device_target == "Ascend":
        model_kwargs["amp_level"] = "O2"
    model = Model(network, net_loss, net_opt, **model_kwargs)

    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    # Checkpoints are written into c2net_context.output_path.
    outputDirectory = output_path + "/"
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=outputDirectory,
                                 config=config_ck)

    print("============== Starting Training ==============")
    # The config value is the baseline; a truthy --epoch_size overrides it.
    epoch_size = cfg['epoch_size']
    if args.epoch_size:
        epoch_size = args.epoch_size
        print('epoch_size is: ', epoch_size)

    # set callback functions
    callback = [time_cb, LossMonitor()]

    # RANK_ID may be absent (e.g. a CPU or single-process run); treat that as
    # rank 0 instead of crashing on int(None).
    local_rank = int(os.getenv('RANK_ID') or '0')

    # Optional: manually upload the training results to the OpenI (Qizhi)
    # platform at the end of every epoch. Note that this uses a lot of memory;
    # only enable it for tasks that specifically need manual uploads.
    uploadOutput = EnvToOpenIEpochEnd()
    callback.append(uploadOutput)

    # for data parallel, only save checkpoint on rank 0
    if local_rank == 0:
        callback.append(ckpoint_cb)

    model.train(epoch_size, ds_train, callbacks=callback)

+ 2
- 6
npu_mnist_example/train_multi_card.py View File

@@ -10,7 +10,7 @@
└── train └── train
├── train-images-idx3-ubyte ├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte └── train-labels-idx1-ubyte
使用注意事项: 使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用c2net的python sdk包 2、用户需要调用c2net的python sdk包
@@ -19,13 +19,11 @@
import os import os
import argparse import argparse
from config import mnist_cfg as cfg from config import mnist_cfg as cfg
from dataset import create_dataset
from dataset_distributed import create_dataset_parallel from dataset_distributed import create_dataset_parallel
from lenet import LeNet5 from lenet import LeNet5
import mindspore.nn as nn import mindspore.nn as nn
from mindspore import context from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model from mindspore.train import Model
from mindspore.context import ParallelMode from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank from mindspore.communication.management import init, get_rank
@@ -64,15 +62,13 @@ if __name__ == "__main__":
c2net_context = prepare() c2net_context = prepare()
#获取数据集路径 #获取数据集路径
MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
#获取预训练模型路径
Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"

output_path = c2net_context.output_path output_path = c2net_context.output_path
ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size)
network = LeNet5(cfg.num_classes) network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
#load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))
if args.device_target != "Ascend": if args.device_target != "Ascend":
model = Model(network, model = Model(network,
net_loss, net_loss,


Loading…
Cancel
Save