Browse Source

Merge pull request 'liuzx' (#3) from liuzx into master

Reviewed-on: https://openi.pcl.ac.cn/OpenIOSSG/OpenI_Cloudbrain_Example/pulls/3
liuzx-patch-1
liuzx 2 years ago
parent
commit
d4e58ad271
12 changed files with 168 additions and 53 deletions
  1. +2
    -3
      gcu_mnist_example/README.md
  2. +5
    -13
      gcu_mnist_example/train.py
  3. +2
    -2
      gpgpu_mnist_example/README.md
  4. +6
    -1
      gpgpu_mnist_example/inference.py
  5. +4
    -5
      gpu_mnist_example/README.md
  6. +6
    -1
      gpu_mnist_example/inference.py
  7. +6
    -2
      gpu_mnist_example/train.py
  8. +3
    -2
      npu_mnist_example/README.md
  9. +0
    -3
      npu_mnist_example/read_imagenet.py
  10. +4
    -0
      npu_mnist_example/train.py
  11. +115
    -0
      npu_mnist_example/train_epoch_upload.py
  12. +15
    -21
      npu_mnist_example/train_multi_card.py

+ 2
- 3
gcu_mnist_example/README.md View File

@@ -1,4 +1,3 @@

# 如何在启智平台上进行模型调试和训练—GCU_手写数字识别示例

## 一 ,数据集及预训练模型准备
@@ -39,9 +38,9 @@ c2net_context = prepare()
##### 2,获取代码路径

```
code_path = c2net_context.code_path +"/" +"项目名"
code_path = c2net_context.code_path +"/" +"项目名".lower()
在本示例中代码路径为:
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
```

##### 3,获取数据集路径


+ 5
- 13
gcu_mnist_example/train.py View File

@@ -9,19 +9,11 @@ If there are Chinese comments in the code,please add at the beginning:
数据集结构是:
MnistDataset_torch.zip
├── test
│ ├── MNIST/processed/test.pt
│ └── MNIST/processed/training.pt
│ ├── MNIST/raw/train-images-idx3-ubyte
│ └── MNIST/raw/train-labels-idx1-ubyte
│ ├── MNIST/raw/t10k-images-idx3-ubyte
│ └── MNIST/raw/t10k-labels-idx1-ubyte
├── train
│ ├── MNIST/processed/test.pt
│ └── MNIST/processed/training.pt
│ ├── MNIST/raw/train-images-idx3-ubyte
│ └── MNIST/raw/train-labels-idx1-ubyte
│ ├── MNIST/raw/t10k-images-idx3-ubyte
│ └── MNIST/raw/t10k-labels-idx1-ubyte
└── train

预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl

'''



+ 2
- 2
gpgpu_mnist_example/README.md View File

@@ -38,9 +38,9 @@ c2net_context = prepare()
##### 2,获取代码路径

```
code_path = c2net_context.code_path +"/" +"项目名"
code_path = c2net_context.code_path +"/" + "项目名".lower()
在本示例中代码路径为:
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
```

##### 3,获取数据集路径


+ 6
- 1
gpgpu_mnist_example/inference.py View File

@@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning:
#!/usr/bin/python
#coding=utf-8

1,The dataset structure of the single-dataset in this example
示例选用的数据集是MnistDataset_torch.zip
数据集结构是:
MnistDataset_torch.zip
├── test
└── train

预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl

'''
from model import Model
import numpy as np


+ 4
- 5
gpu_mnist_example/README.md View File

@@ -1,4 +1,3 @@

# 如何在启智平台上进行模型调试和训练—GPU_手写数字识别示例

## 一 ,数据集及预训练模型准备
@@ -12,9 +11,9 @@

> MnistDataset_torch.zip
>
> ├── test
> ├── test
>
> └── train
> └── train
>

##### 2,预训练模型说明:
@@ -39,9 +38,9 @@ c2net_context = prepare()
##### 2,获取代码路径

```
code_path = c2net_context.code_path +"/" +"项目名"
code_path = c2net_context.code_path +"/" +"项目名".lower()
在本示例中代码路径为:
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
```

##### 3,获取数据集路径


+ 6
- 1
gpu_mnist_example/inference.py View File

@@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning:
#!/usr/bin/python
#coding=utf-8

1,The dataset structure of the single-dataset in this example
示例选用的数据集是MnistDataset_torch.zip
数据集结构是:
MnistDataset_torch.zip
├── test
└── train

预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl

'''
from model import Model
import numpy as np


+ 6
- 2
gpu_mnist_example/train.py View File

@@ -5,11 +5,15 @@ If there are Chinese comments in the code,please add at the beginning:
#!/usr/bin/python
#coding=utf-8

1,The dataset structure of the single-dataset in this example
数据集结构是:
MnistDataset_torch.zip
├── test
└── train

预训练模型文件夹结构是:
Torch_MNIST_Example_Model
├── mnist_epoch1_0.76.pkl

'''


@@ -105,7 +109,7 @@ if __name__ == '__main__':
start_epoch = 0
print('无保存模型,将从头开始训练!')
for epoch in range(start_epoch+1, epochs):
for epoch in range(start_epoch+1, epochs+1):
train(model, train_loader, epoch)
test(model, test_loader, test_dataset)
# 将模型保存到c2net_context.output_path


+ 3
- 2
npu_mnist_example/README.md View File

@@ -40,9 +40,9 @@ c2net_context = prepare()
##### 2,获取代码路径

```
code_path = c2net_context.code_path +"/" +"项目名"
code_path = c2net_context.code_path +"/" + "项目名".lower()
在本示例中代码路径为:
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
```

##### 3,获取数据集路径
@@ -85,6 +85,7 @@ upload_output()

- 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释
- 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释
- 训练任务在每个epoch结束后就上传文件,可参考[train_epoch_upload.py](./train_epoch_upload.py)的代码注释
- 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释
- 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像,具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释
- 继续训练示例参考示例中[train_continue.py](./train_continue.py)的代码注释


+ 0
- 3
npu_mnist_example/read_imagenet.py View File

@@ -39,9 +39,6 @@ import mindspore.dataset.vision.c_transforms as transforms
from c2net.context import upload_output

parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example')
parser.add_argument('--train_url',
help='output folder to save/load',
default= '/cache/output/')

if __name__ == "__main__":
args, unknown = parser.parse_known_args()


+ 4
- 0
npu_mnist_example/train.py View File

@@ -11,6 +11,10 @@
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
模型文件夹结构是:
Mindspore_MNIST_Example_Model
├── checkpoint_lenet-1_1875.ckpt
使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用c2net的python sdk包


+ 115
- 0
npu_mnist_example/train_epoch_upload.py View File

@@ -0,0 +1,115 @@


"""
示例选用的数据集是MnistDataset_mindspore.zip
数据集结构是:
MnistDataset_mindspore.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte

模型文件夹结构是:
Mindspore_MNIST_Example_Model
├── checkpoint_lenet-1_1875.ckpt
使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用c2net的python sdk包
"""

import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.train.callback import Callback
#导入c2net包
from c2net.context import prepare, upload_output

class EnvToOpenIEpochEnd(Callback):
    """Training callback that pushes the job's output directory to OpenI after every epoch."""

    def epoch_end(self, run_context):
        # The run context itself is not inspected; reaching the epoch boundary
        # is the only trigger needed to upload c2net output to the platform.
        upload_output()


# Command-line interface for the example.  The call site uses
# parse_known_args(), so extra platform-injected flags (e.g. --ckpt_url)
# are tolerated rather than raising an error.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--device_target',
                    type=str,
                    default="Ascend",
                    choices=['Ascend', 'CPU'],
                    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')
parser.add_argument('--epoch_size',
                    type=int,
                    default=5,
                    help='Training epochs.')

if __name__ == "__main__":
###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
args, unknown = parser.parse_known_args()
#初始化导入数据集和预训练模型到容器内
c2net_context = prepare()
#获取数据集路径
MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
#获取预训练模型路径
Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"
#获取输出路径
output_path = c2net_context.output_path
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
#使用数据集的方式
ds_train = create_dataset(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size)
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))
if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy"})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy"},
amp_level="O2")

config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#将模型保存到c2net_context.output_path
outputDirectory = output_path + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)
print("============== Starting Training ==============")
epoch_size = cfg['epoch_size']
if (args.epoch_size):
epoch_size = args.epoch_size
print('epoch_size is: ', epoch_size)

# set callback functions
callback =[time_cb,LossMonitor()]
local_rank=int(os.getenv('RANK_ID'))
#非必选,每个epoch结束后,都手动上传训练结果到启智平台,注意这样使用会占用很多内存,只有在部分特殊需要手动上传的任务才需要使用
uploadOutput = EnvToOpenIEpochEnd()
callback.append(uploadOutput)
# for data parallel, only save checkpoint on rank 0
if local_rank==0 :
callback.append(ckpoint_cb)
model.train(epoch_size,ds_train,callbacks=callback)

+ 15
- 21
npu_mnist_example/train_multi_card.py View File

@@ -10,7 +10,7 @@
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
使用注意事项:
1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
2、用户需要调用c2net的python sdk包
@@ -24,12 +24,10 @@ from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank
import time
#导入openi包
from c2net.context import prepare, upload_output


@@ -53,39 +51,35 @@ if __name__ == "__main__":
device_num = int(os.getenv('RANK_SIZE'))
#使用多卡时
# set device_id and init for multi-card training
# set device_id and init for multi-card training
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
init()
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data
local_rank=int(os.getenv('RANK_ID'))
if local_rank%8==0:
#初始化导入数据集和预训练模型到容器内
#初始化导入数据集和预训练模型到容器内,并行任务先让0卡拷贝数据,并用一个缓存文件标记0卡已prepare完成
if local_rank == 0:
c2net_context = prepare()
#获取数据集路径
MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
#获取预训练模型路径
Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"
output_path = c2net_context.output_path
#Set a cache file to determine whether the data has been copied to obs.
#If this file exists during multi-card training, there is no need to copy the dataset multiple times.
f = open("/cache/download_input.txt", 'w')
f = open("/cache/prepare_completed.txt", 'w')
f.close()
try:
if os.path.exists("/cache/download_input.txt"):
print("download_input succeed")
if os.path.exists("/cache/prepare_completed.txt"):
print("prepare completed!")
except Exception as e:
print("download_input failed")
while not os.path.exists("/cache/download_input.txt"):
time.sleep(1)
ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size)
print("prepare failed")
while not os.path.exists("/cache/prepare_completed.txt"):
time.sleep(1)
c2net_context = prepare()
#获取数据集路径
MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"

output_path = c2net_context.output_path
ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size)
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))
if args.device_target != "Ascend":
model = Model(network,
net_loss,


Loading…
Cancel
Save