diff --git a/gcu_mnist_example/README.md b/gcu_mnist_example/README.md index 6812796..3c9cb80 100644 --- a/gcu_mnist_example/README.md +++ b/gcu_mnist_example/README.md @@ -1,4 +1,3 @@ - # 如何在启智平台上进行模型调试和训练—GCU_手写数字识别示例 ## 一 ,数据集及预训练模型准备 @@ -39,9 +38,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" +"项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 diff --git a/gcu_mnist_example/train.py b/gcu_mnist_example/train.py index aa385bd..dd461bc 100644 --- a/gcu_mnist_example/train.py +++ b/gcu_mnist_example/train.py @@ -9,19 +9,11 @@ If there are Chinese comments in the code,please add at the beginning: 数据集结构是: MnistDataset_torch.zip ├── test - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte - ├── train - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte + └── train + +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl ''' diff --git a/gpgpu_mnist_example/README.md b/gpgpu_mnist_example/README.md index 271f8e1..5713e2f 100644 --- a/gpgpu_mnist_example/README.md +++ b/gpgpu_mnist_example/README.md @@ -38,9 +38,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" + "项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 diff --git a/gpgpu_mnist_example/inference.py b/gpgpu_mnist_example/inference.py index ee99215..cb2d6c4 100644 --- a/gpgpu_mnist_example/inference.py +++ b/gpgpu_mnist_example/inference.py @@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +示例选用的数据集是MnistDataset_torch.zip +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' from model import Model import numpy as np diff --git a/gpu_mnist_example/README.md b/gpu_mnist_example/README.md index cf1982e..579a3db 100644 --- a/gpu_mnist_example/README.md +++ b/gpu_mnist_example/README.md @@ -1,4 +1,3 @@ - # 如何在启智平台上进行模型调试和训练—GPU_手写数字识别示例 ## 一 ,数据集及预训练模型准备 @@ -12,9 +11,9 @@ > MnistDataset_torch.zip > - > ├── test + > ├── test > - > └── train + > └── train > ##### 2,预训练模型说明: @@ -39,9 +38,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" +"项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 diff --git a/gpu_mnist_example/inference.py b/gpu_mnist_example/inference.py index 9ddaf72..08457d8 100644 --- a/gpu_mnist_example/inference.py +++ b/gpu_mnist_example/inference.py @@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +示例选用的数据集是MnistDataset_torch.zip +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' from model import Model import numpy as np diff --git a/gpu_mnist_example/train.py b/gpu_mnist_example/train.py index 9319ff1..e71eaf2 100644 --- a/gpu_mnist_example/train.py +++ b/gpu_mnist_example/train.py @@ -5,11 +5,15 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' @@ -105,7 +109,7 @@ if __name__ == '__main__': start_epoch = 0 print('无保存模型,将从头开始训练!') - for epoch in range(start_epoch+1, epochs): + for epoch in range(start_epoch+1, epochs+1): train(model, train_loader, epoch) test(model, test_loader, test_dataset) # 将模型保存到c2net_context.output_path diff --git a/npu_mnist_example/README.md b/npu_mnist_example/README.md index 1ecb27f..408d883 100644 --- a/npu_mnist_example/README.md +++ b/npu_mnist_example/README.md @@ -40,9 +40,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" + "项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 @@ -85,6 +85,7 @@ upload_output() - 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释 - 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释 +- 训练任务在每个epoch结束后就上传文件,可参考[train_epoch_upload.py](./train_epoch_upload.py)的代码注释 - 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释 - 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像,具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释 - 继续训练示例参考示例中[train_continue.py](./train_continue.py)的代码注释 diff --git a/npu_mnist_example/read_imagenet.py b/npu_mnist_example/read_imagenet.py index eba4f24..923418c 100644 --- a/npu_mnist_example/read_imagenet.py +++ b/npu_mnist_example/read_imagenet.py @@ -39,9 +39,6 @@ import mindspore.dataset.vision.c_transforms as transforms from c2net.context import upload_output parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example') -parser.add_argument('--train_url', - help='output folder to save/load', - default= '/cache/output/') if __name__ == "__main__": args, unknown = parser.parse_known_args() diff --git a/npu_mnist_example/train.py b/npu_mnist_example/train.py index 1fd1e4c..74adb65 100644 --- a/npu_mnist_example/train.py +++ b/npu_mnist_example/train.py @@ -11,6 +11,10 @@ ├── train-images-idx3-ubyte └── train-labels-idx1-ubyte +模型文件夹结构是: +Mindspore_MNIST_Example_Model +├── checkpoint_lenet-1_1875.ckpt + 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 diff --git a/npu_mnist_example/train_epoch_upload.py b/npu_mnist_example/train_epoch_upload.py new file mode 100644 index 0000000..fcbfc00 --- /dev/null +++ b/npu_mnist_example/train_epoch_upload.py @@ -0,0 +1,115 @@ + + +""" +示例选用的数据集是MnistDataset_mindspore.zip +数据集结构是: + MnistDataset_mindspore.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + +模型文件夹结构是: +Mindspore_MNIST_Example_Model +├── checkpoint_lenet-1_1875.ckpt + +使用注意事项: +1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 +2、用户需要调用c2net的python sdk包 +""" + +import os +import argparse +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.train.callback import Callback +#导入c2net包 +from c2net.context import prepare, upload_output + +class EnvToOpenIEpochEnd(Callback): + """ + upload output to openi when epoch end + """ + def epoch_end(self,run_context): + upload_output() + + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 + args, unknown = parser.parse_known_args() + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" + #获取预训练模型路径 + Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" + #获取输出路径 + output_path = c2net_context.output_path + + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + #使用数据集的方式 + ds_train = create_dataset(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #将模型保存到c2net_context.output_path + outputDirectory = output_path + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + # set callback functions + callback =[time_cb,LossMonitor()] + local_rank=int(os.getenv('RANK_ID')) + #非必选,每个epoch结束后,都手动上传训练结果到启智平台,注意这样使用会占用很多内存,只有在部分特殊需要手动上传的任务才需要使用 + uploadOutput = EnvToOpenIEpochEnd() + callback.append(uploadOutput) + # for data parallel, only save checkpoint on rank 0 + if local_rank==0 : + callback.append(ckpoint_cb) + + model.train(epoch_size,ds_train,callbacks=callback) diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py index ffa6491..ffa7ea4 100644 --- a/npu_mnist_example/train_multi_card.py +++ b/npu_mnist_example/train_multi_card.py @@ -10,7 +10,7 @@ └── train ├── train-images-idx3-ubyte └── train-labels-idx1-ubyte - + 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 @@ -24,12 +24,10 @@ from lenet import LeNet5 import mindspore.nn as nn from mindspore import context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore import load_checkpoint, load_param_into_net from mindspore.train import Model from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank import time -#导入openi包 from c2net.context import prepare, upload_output @@ -53,39 +51,35 @@ if __name__ == "__main__": device_num = int(os.getenv('RANK_SIZE')) #使用多卡时 - # set device_id and init for multi-card training + # set device_id and init for multi-card training context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) init() #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data local_rank=int(os.getenv('RANK_ID')) - if local_rank%8==0: - #初始化导入数据集和预训练模型到容器内 + #初始化导入数据集和预训练模型到容器内,并行任务先让0卡拷贝数据,并用一个缓存文件标记0卡已prepare完成 + if local_rank == 0: c2net_context = prepare() - #获取数据集路径 - MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" - #获取预训练模型路径 - Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" - output_path = c2net_context.output_path - #Set a cache file to determine whether the data has been copied to obs. - #If this file exists during multi-card training, there is no need to copy the dataset multiple times. - f = open("/cache/download_input.txt", 'w') + f = open("/cache/prepare_completed.txt", 'w') f.close() try: - if os.path.exists("/cache/download_input.txt"): - print("download_input succeed") + if os.path.exists("/cache/prepare_completed.txt"): + print("prepare completed!") except Exception as e: - print("download_input failed") - while not os.path.exists("/cache/download_input.txt"): - time.sleep(1) - ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) + print("prepare failed") + while not os.path.exists("/cache/prepare_completed.txt"): + time.sleep(1) + c2net_context = prepare() + #获取数据集路径 + MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" + output_path = c2net_context.output_path + ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) if args.device_target != "Ascend": model = Model(network, net_loss,