From 22007f209e2c6b7189c4ef6c23cf433c0f93375f Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 24 Jan 2024 09:19:36 +0800 Subject: [PATCH 1/5] update readme --- gcu_mnist_example/README.md | 5 ++--- gpgpu_mnist_example/README.md | 4 ++-- gpu_mnist_example/README.md | 9 ++++----- npu_mnist_example/README.md | 4 ++-- npu_mnist_example/train_multi_card.py | 7 +++++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/gcu_mnist_example/README.md b/gcu_mnist_example/README.md index 6812796..3c9cb80 100644 --- a/gcu_mnist_example/README.md +++ b/gcu_mnist_example/README.md @@ -1,4 +1,3 @@ - # 如何在启智平台上进行模型调试和训练—GCU_手写数字识别示例 ## 一 ,数据集及预训练模型准备 @@ -39,9 +38,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" +"项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 diff --git a/gpgpu_mnist_example/README.md b/gpgpu_mnist_example/README.md index 271f8e1..5713e2f 100644 --- a/gpgpu_mnist_example/README.md +++ b/gpgpu_mnist_example/README.md @@ -38,9 +38,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" + "项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 diff --git a/gpu_mnist_example/README.md b/gpu_mnist_example/README.md index cf1982e..579a3db 100644 --- a/gpu_mnist_example/README.md +++ b/gpu_mnist_example/README.md @@ -1,4 +1,3 @@ - # 如何在启智平台上进行模型调试和训练—GPU_手写数字识别示例 ## 一 ,数据集及预训练模型准备 @@ -12,9 +11,9 @@ > MnistDataset_torch.zip > - > ├── test + > ├── test > - > └── train + > └── train > ##### 2,预训练模型说明: @@ -39,9 +38,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" +"项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 diff --git a/npu_mnist_example/README.md b/npu_mnist_example/README.md index 1ecb27f..439424f 100644 --- a/npu_mnist_example/README.md +++ b/npu_mnist_example/README.md @@ -40,9 +40,9 @@ c2net_context = prepare() ##### 2,获取代码路径 ``` -code_path = c2net_context.code_path +"/" +"项目名" +code_path = c2net_context.code_path +"/" + "项目名".lower() 在本示例中代码路径为: -code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example" +code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower() ``` ##### 3,获取数据集路径 diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py index ffa6491..2741550 100644 --- a/npu_mnist_example/train_multi_card.py +++ b/npu_mnist_example/train_multi_card.py @@ -50,10 +50,13 @@ parser.add_argument('--epoch_size', if __name__ == "__main__": ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 args, unknown = parser.parse_known_args() - + MnistDataset_mindspore_path = '' + Mindspore_MNIST_Example_Model_path = '' + output_path = '' + device_num = int(os.getenv('RANK_SIZE')) #使用多卡时 - # set device_id and init for multi-card training + # set device_id and init for multi-card training context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) From 4fc0d713a8cc791a30e68f1ef262c7b2fc9fd48b Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 25 Jan 2024 09:07:21 +0800 Subject: [PATCH 2/5] update --- npu_mnist_example/train_multi_card.py | 38 ++++++++------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py index 2741550..d53a7e8 100644 --- a/npu_mnist_example/train_multi_card.py +++ b/npu_mnist_example/train_multi_card.py @@ -19,6 +19,7 @@ import os import argparse from config import mnist_cfg as cfg +from dataset import create_dataset from dataset_distributed import create_dataset_parallel from lenet import LeNet5 import mindspore.nn as nn @@ -29,7 +30,6 @@ from mindspore.train import Model from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank import time -#导入openi包 from c2net.context import prepare, upload_output @@ -50,10 +50,7 @@ parser.add_argument('--epoch_size', if __name__ == "__main__": ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 args, unknown = parser.parse_known_args() - MnistDataset_mindspore_path = '' - Mindspore_MNIST_Example_Model_path = '' - output_path = '' - + device_num = int(os.getenv('RANK_SIZE')) #使用多卡时 # set device_id and init for multi-card training @@ -63,32 +60,19 @@ if __name__ == "__main__": init() #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data local_rank=int(os.getenv('RANK_ID')) - if local_rank%8==0: - #初始化导入数据集和预训练模型到容器内 - c2net_context = prepare() - #获取数据集路径 - MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" - #获取预训练模型路径 - Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" - output_path = c2net_context.output_path - #Set a cache file to determine whether the data has been copied to obs. - #If this file exists during multi-card training, there is no need to copy the dataset multiple times. - f = open("/cache/download_input.txt", 'w') - f.close() - try: - if os.path.exists("/cache/download_input.txt"): - print("download_input succeed") - except Exception as e: - print("download_input failed") - while not os.path.exists("/cache/download_input.txt"): - time.sleep(1) - ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) - + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" + #获取预训练模型路径 + Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" + output_path = c2net_context.output_path + ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) + #load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) if args.device_target != "Ascend": model = Model(network, net_loss, From cc4be88033ce4f91d59d3dcd49a01fa0a4f9503d Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 31 Jan 2024 17:12:05 +0800 Subject: [PATCH 3/5] fix bug --- gcu_mnist_example/train.py | 18 ++-- gpgpu_mnist_example/inference.py | 7 +- gpu_mnist_example/inference.py | 7 +- gpu_mnist_example/train.py | 6 +- npu_mnist_example/README.md | 1 + npu_mnist_example/read_imagenet.py | 3 - npu_mnist_example/train.py | 4 + npu_mnist_example/train_epoch_upload.py | 115 ++++++++++++++++++++++++ npu_mnist_example/train_multi_card.py | 8 +- 9 files changed, 144 insertions(+), 25 deletions(-) create mode 100644 npu_mnist_example/train_epoch_upload.py diff --git a/gcu_mnist_example/train.py b/gcu_mnist_example/train.py index aa385bd..dd461bc 100644 --- a/gcu_mnist_example/train.py +++ b/gcu_mnist_example/train.py @@ -9,19 +9,11 @@ If there are Chinese comments in the code,please add at the beginning: 数据集结构是: MnistDataset_torch.zip ├── test - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte - ├── train - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte + └── train + +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl ''' diff --git a/gpgpu_mnist_example/inference.py b/gpgpu_mnist_example/inference.py index ee99215..cb2d6c4 100644 --- a/gpgpu_mnist_example/inference.py +++ b/gpgpu_mnist_example/inference.py @@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +示例选用的数据集是MnistDataset_torch.zip +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' from model import Model import numpy as np diff --git a/gpu_mnist_example/inference.py b/gpu_mnist_example/inference.py index 9ddaf72..08457d8 100644 --- a/gpu_mnist_example/inference.py +++ b/gpu_mnist_example/inference.py @@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +示例选用的数据集是MnistDataset_torch.zip +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' from model import Model import numpy as np diff --git a/gpu_mnist_example/train.py b/gpu_mnist_example/train.py index 9319ff1..9db7665 100644 --- a/gpu_mnist_example/train.py +++ b/gpu_mnist_example/train.py @@ -5,11 +5,15 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' diff --git a/npu_mnist_example/README.md b/npu_mnist_example/README.md index 439424f..408d883 100644 --- a/npu_mnist_example/README.md +++ b/npu_mnist_example/README.md @@ -85,6 +85,7 @@ upload_output() - 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释 - 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释 +- 训练任务在每个epoch结束后就上传文件,可参考[train_epoch_upload.py](./train_epoch_upload.py)的代码注释 - 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释 - 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像,具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释 - 继续训练示例参考示例中[train_continue.py](./train_continue.py)的代码注释 diff --git a/npu_mnist_example/read_imagenet.py b/npu_mnist_example/read_imagenet.py index eba4f24..923418c 100644 --- a/npu_mnist_example/read_imagenet.py +++ b/npu_mnist_example/read_imagenet.py @@ -39,9 +39,6 @@ import mindspore.dataset.vision.c_transforms as transforms from c2net.context import upload_output parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example') -parser.add_argument('--train_url', - help='output folder to save/load', - default= '/cache/output/') if __name__ == "__main__": args, unknown = parser.parse_known_args() diff --git a/npu_mnist_example/train.py b/npu_mnist_example/train.py index 1fd1e4c..74adb65 100644 --- a/npu_mnist_example/train.py +++ b/npu_mnist_example/train.py @@ -11,6 +11,10 @@ ├── train-images-idx3-ubyte └── train-labels-idx1-ubyte +模型文件夹结构是: +Mindspore_MNIST_Example_Model +├── checkpoint_lenet-1_1875.ckpt + 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 diff --git a/npu_mnist_example/train_epoch_upload.py b/npu_mnist_example/train_epoch_upload.py new file mode 100644 index 0000000..fcbfc00 --- /dev/null +++ b/npu_mnist_example/train_epoch_upload.py @@ -0,0 +1,115 @@ + + +""" +示例选用的数据集是MnistDataset_mindspore.zip +数据集结构是: + MnistDataset_mindspore.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + +模型文件夹结构是: +Mindspore_MNIST_Example_Model +├── checkpoint_lenet-1_1875.ckpt + +使用注意事项: +1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 +2、用户需要调用c2net的python sdk包 +""" + +import os +import argparse +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.train.callback import Callback +#导入c2net包 +from c2net.context import prepare, upload_output + +class EnvToOpenIEpochEnd(Callback): + """ + upload output to openi when epoch end + """ + def epoch_end(self,run_context): + upload_output() + + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 + args, unknown = parser.parse_known_args() + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" + #获取预训练模型路径 + Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" + #获取输出路径 + output_path = c2net_context.output_path + + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + #使用数据集的方式 + ds_train = create_dataset(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #将模型保存到c2net_context.output_path + outputDirectory = output_path + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + # set callback functions + callback =[time_cb,LossMonitor()] + local_rank=int(os.getenv('RANK_ID')) + #非必选,每个epoch结束后,都手动上传训练结果到启智平台,注意这样使用会占用很多内存,只有在部分特殊需要手动上传的任务才需要使用 + uploadOutput = EnvToOpenIEpochEnd() + callback.append(uploadOutput) + # for data parallel, only save checkpoint on rank 0 + if local_rank==0 : + callback.append(ckpoint_cb) + + model.train(epoch_size,ds_train,callbacks=callback) diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py index d53a7e8..8e1a8bc 100644 --- a/npu_mnist_example/train_multi_card.py +++ b/npu_mnist_example/train_multi_card.py @@ -10,7 +10,7 @@ └── train ├── train-images-idx3-ubyte └── train-labels-idx1-ubyte - + 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 @@ -19,13 +19,11 @@ import os import argparse from config import mnist_cfg as cfg -from dataset import create_dataset from dataset_distributed import create_dataset_parallel from lenet import LeNet5 import mindspore.nn as nn from mindspore import context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore import load_checkpoint, load_param_into_net from mindspore.train import Model from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank @@ -64,15 +62,13 @@ if __name__ == "__main__": c2net_context = prepare() #获取数据集路径 MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" - #获取预训练模型路径 - Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" + output_path = c2net_context.output_path ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - #load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) if args.device_target != "Ascend": model = Model(network, net_loss, From 5ec299870393eea06fe7e16a4af4f34408eca74a Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 31 Jan 2024 17:14:31 +0800 Subject: [PATCH 4/5] fix bug --- gpu_mnist_example/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_mnist_example/train.py b/gpu_mnist_example/train.py index 9db7665..e71eaf2 100644 --- a/gpu_mnist_example/train.py +++ b/gpu_mnist_example/train.py @@ -109,7 +109,7 @@ if __name__ == '__main__': start_epoch = 0 print('无保存模型,将从头开始训练!') - for epoch in range(start_epoch+1, epochs): + for epoch in range(start_epoch+1, epochs+1): train(model, train_loader, epoch) test(model, test_loader, test_dataset) # 将模型保存到c2net_context.output_path From 1b41008660e86ffd7545b13087343a60bb75e9bb Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 1 Feb 2024 10:03:58 +0800 Subject: [PATCH 5/5] update train_multi_card example --- npu_mnist_example/train_multi_card.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py index 8e1a8bc..ffa7ea4 100644 --- a/npu_mnist_example/train_multi_card.py +++ b/npu_mnist_example/train_multi_card.py @@ -58,8 +58,19 @@ if __name__ == "__main__": init() #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data local_rank=int(os.getenv('RANK_ID')) - #初始化导入数据集和预训练模型到容器内 - c2net_context = prepare() + #初始化导入数据集和预训练模型到容器内,并行任务先让0卡拷贝数据,并用一个缓存文件标记0卡已prepare完成 + if local_rank == 0: + c2net_context = prepare() + f = open("/cache/prepare_completed.txt", 'w') + f.close() + try: + if os.path.exists("/cache/prepare_completed.txt"): + print("prepare completed!") + except Exception as e: + print("prepare failed") + while not os.path.exists("/cache/prepare_completed.txt"): + time.sleep(1) + c2net_context = prepare() #获取数据集路径 MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"