From cc4be88033ce4f91d59d3dcd49a01fa0a4f9503d Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 31 Jan 2024 17:12:05 +0800 Subject: [PATCH] fix bug --- gcu_mnist_example/train.py | 18 ++-- gpgpu_mnist_example/inference.py | 7 +- gpu_mnist_example/inference.py | 7 +- gpu_mnist_example/train.py | 6 +- npu_mnist_example/README.md | 1 + npu_mnist_example/read_imagenet.py | 3 - npu_mnist_example/train.py | 4 + npu_mnist_example/train_epoch_upload.py | 115 ++++++++++++++++++++++++ npu_mnist_example/train_multi_card.py | 8 +- 9 files changed, 144 insertions(+), 25 deletions(-) create mode 100644 npu_mnist_example/train_epoch_upload.py diff --git a/gcu_mnist_example/train.py b/gcu_mnist_example/train.py index aa385bd..dd461bc 100644 --- a/gcu_mnist_example/train.py +++ b/gcu_mnist_example/train.py @@ -9,19 +9,11 @@ If there are Chinese comments in the code,please add at the beginning: 数据集结构是: MnistDataset_torch.zip ├── test - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte - ├── train - │ ├── MNIST/processed/test.pt - │ └── MNIST/processed/training.pt - │ ├── MNIST/raw/train-images-idx3-ubyte - │ └── MNIST/raw/train-labels-idx1-ubyte - │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte + └── train + +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl ''' diff --git a/gpgpu_mnist_example/inference.py b/gpgpu_mnist_example/inference.py index ee99215..cb2d6c4 100644 --- a/gpgpu_mnist_example/inference.py +++ b/gpgpu_mnist_example/inference.py @@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +示例选用的数据集是MnistDataset_torch.zip +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' from model import Model import numpy as np diff --git a/gpu_mnist_example/inference.py b/gpu_mnist_example/inference.py index 9ddaf72..08457d8 100644 --- a/gpu_mnist_example/inference.py +++ b/gpu_mnist_example/inference.py @@ -5,11 +5,16 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +示例选用的数据集是MnistDataset_torch.zip +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' from model import Model import numpy as np diff --git a/gpu_mnist_example/train.py b/gpu_mnist_example/train.py index 9319ff1..9db7665 100644 --- a/gpu_mnist_example/train.py +++ b/gpu_mnist_example/train.py @@ -5,11 +5,15 @@ If there are Chinese comments in the code,please add at the beginning: #!/usr/bin/python #coding=utf-8 -1,The dataset structure of the single-dataset in this example +数据集结构是: MnistDataset_torch.zip ├── test └── train +预训练模型文件夹结构是: +Torch_MNIST_Example_Model +├── mnist_epoch1_0.76.pkl + ''' diff --git a/npu_mnist_example/README.md b/npu_mnist_example/README.md index 439424f..408d883 100644 --- a/npu_mnist_example/README.md +++ b/npu_mnist_example/README.md @@ -85,6 +85,7 @@ upload_output() - 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释 - 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释 +- 训练任务在每个epoch结束后就上传文件,可参考[train_epoch_upload.py](./train_epoch_upload.py)的代码注释 - 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释 - 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像,具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释 - 继续训练示例参考示例中[train_continue.py](./train_continue.py)的代码注释 diff --git a/npu_mnist_example/read_imagenet.py b/npu_mnist_example/read_imagenet.py index eba4f24..923418c 100644 --- a/npu_mnist_example/read_imagenet.py +++ b/npu_mnist_example/read_imagenet.py @@ -39,9 +39,6 @@ import mindspore.dataset.vision.c_transforms as transforms from c2net.context import upload_output parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example') -parser.add_argument('--train_url', - help='output folder to save/load', - default= '/cache/output/') if __name__ == "__main__": args, unknown = parser.parse_known_args() diff --git a/npu_mnist_example/train.py b/npu_mnist_example/train.py index 1fd1e4c..74adb65 100644 --- a/npu_mnist_example/train.py +++ b/npu_mnist_example/train.py @@ -11,6 +11,10 @@ ├── train-images-idx3-ubyte └── train-labels-idx1-ubyte +模型文件夹结构是: +Mindspore_MNIST_Example_Model +├── checkpoint_lenet-1_1875.ckpt + 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 diff --git a/npu_mnist_example/train_epoch_upload.py b/npu_mnist_example/train_epoch_upload.py new file mode 100644 index 0000000..fcbfc00 --- /dev/null +++ b/npu_mnist_example/train_epoch_upload.py @@ -0,0 +1,115 @@ + + +""" +示例选用的数据集是MnistDataset_mindspore.zip +数据集结构是: + MnistDataset_mindspore.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + +模型文件夹结构是: +Mindspore_MNIST_Example_Model +├── checkpoint_lenet-1_1875.ckpt + +使用注意事项: +1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 +2、用户需要调用c2net的python sdk包 +""" + +import os +import argparse +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.train.callback import Callback +#导入c2net包 +from c2net.context import prepare, upload_output + +class EnvToOpenIEpochEnd(Callback): + """ + upload output to openi when epoch end + """ + def epoch_end(self,run_context): + upload_output() + + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 + args, unknown = parser.parse_known_args() + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" + #获取预训练模型路径 + Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" + #获取输出路径 + output_path = c2net_context.output_path + + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + #使用数据集的方式 + ds_train = create_dataset(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #将模型保存到c2net_context.output_path + outputDirectory = output_path + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + # set callback functions + callback =[time_cb,LossMonitor()] + local_rank=int(os.getenv('RANK_ID')) + #非必选,每个epoch结束后,都手动上传训练结果到启智平台,注意这样使用会占用很多内存,只有在部分特殊需要手动上传的任务才需要使用 + uploadOutput = EnvToOpenIEpochEnd() + callback.append(uploadOutput) + # for data parallel, only save checkpoint on rank 0 + if local_rank==0 : + callback.append(ckpoint_cb) + + model.train(epoch_size,ds_train,callbacks=callback) diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py index d53a7e8..8e1a8bc 100644 --- a/npu_mnist_example/train_multi_card.py +++ b/npu_mnist_example/train_multi_card.py @@ -10,7 +10,7 @@ └── train ├── train-images-idx3-ubyte └── train-labels-idx1-ubyte - + 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 @@ -19,13 +19,11 @@ import os import argparse from config import mnist_cfg as cfg -from dataset import create_dataset from dataset_distributed import create_dataset_parallel from lenet import LeNet5 import mindspore.nn as nn from mindspore import context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore import load_checkpoint, load_param_into_net from mindspore.train import Model from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank @@ -64,15 +62,13 @@ if __name__ == "__main__": c2net_context = prepare() #获取数据集路径 MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" - #获取预训练模型路径 - Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model" + output_path = c2net_context.output_path ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - #load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt"))) if args.device_target != "Ascend": model = Model(network, net_loss,