| @@ -1,40 +1,33 @@ | |||
| # 如何在启智平台上进行模型调试和训练 | |||
| ## 1. openi的使用方法 | |||
| ## 1. c2net的sdk使用方法 | |||
| 安装openi包 | |||
| 安装c2net-beta包 | |||
| ``` | |||
| pip install -U openi | |||
| ``` | |||
| 若是在智算训练任务中无网络,可以使用以下指令安装 | |||
| ```text | |||
| import os | |||
| os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) | |||
| pip install -U c2net-beta | |||
| ``` | |||
| 使用openi包 | |||
| 使用c2net的SDK方式 | |||
| ``` | |||
| #导入包 | |||
| from openi.context import prepare, upload_openi | |||
| from c2net.context import prepare, upload_output | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| c2net_context = prepare() | |||
| #获取数据集路径,预训练模型路径,输出路径 | |||
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| output_path = openi_context.output_path | |||
| dataset_path = c2net_context.dataset_path | |||
| pretrain_model_path = c2net_context.pretrain_model_path | |||
| output_path = c2net_context.output_path | |||
| #回传结果到openi:只有训练任务才能回传,调试任务即使回传也不支持下载 | |||
| upload_openi() | |||
| upload_output() | |||
| ``` | |||
| ## 2. 手写数字识别示例 | |||
| * GPU示例请参考[gpu_mnist_example](./gpu_mnist_example/README.md) | |||
| * NPU示例请参考[npu_mnist_example](./npu_mnist_example/README.md) | |||
| * GCU示例请参考[gcu_mnist_example](./gcu_mnist_example/README.md) | |||
| @@ -1,3 +1,4 @@ | |||
| <<<<<<< HEAD | |||
| # 如何在启智平台上进行模型训练—GCU示例 | |||
| ## 1.启智集群和智算集群的GCU训练样例 | |||
| @@ -46,5 +47,125 @@ upload_openi 将训练镜像的输出结果拷贝回启智平台 | |||
| ### 3.2 解决参数报错问题: | |||
| 请在代码中加入 `args, unknown = parser.parse_known_args()`,可忽略掉 `--ckpt_url`、`--multi_data_url` 等参数引起的报错问题 | |||
| ======= | |||
| # 如何在启智平台上进行模型训练 - GCU版本 | |||
| - 启智集群单数据集的训练,启智集群多数据集的训练,智算集群的单数据集训练,这3个的训练使用方式不同,请注意区分。数据加载方式、模型定义逻辑大致同[手写数字识别GPU版本_PytorchExample](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GPU)项目: | |||
| - 智算集群单数据集的训练示例请参考示例中[train_for_c2net.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/train_for_c2net.py)的代码注释 | |||
| - 智算集群中单/多数据集使用方式: | |||
| 如本示例中数据集MnistDataset_torch.zip的使用方式是:数据集位于/tmp/dataset/下 | |||
| ## 1 概述 | |||
| - 本项目以LeNet5-MNIST-PyTorch为例,简要介绍如何在启智AI协同平台上使用GCU集群+Pytorch完成训练任务,旨在为AI开发者提供启智训练示例。 | |||
| - 用户可以直接使用提供的[MNIST数据集](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/datasets?type=0), 和代码文件创建自己的训练任务。 | |||
| ## 2 准备工作 | |||
| - 启智平台使用准备,本项目需要用户创建启智平台账户,克隆代码到自己的账户,上传数据集,具体操作方法可以通过访问[OpenI_Learning](https://openi.pcl.ac.cn/zeizei/OpenI_Learning)项目学习小白训练营系列课程进行学习。 | |||
| ### 2.1 数据准备 | |||
| #### 数据集获取 | |||
| - 如果你需要试运行本示例,则无需再次上传数据集,因为示例中的数据集MnistDataset_torch.zip已经设置为公开数据集,可以直接引用。 | |||
| - 数据文件说明 | |||
| - MNISTData数据集是由10类28∗28的灰度图片组成,训练数据集包含60000张图片,测试数据集包含10000张图片。 | |||
| #### 数据集上传 | |||
| 使用GCU进行训练,使用的框架为Pytorch,上传和使用数据集的格式和GPU保持一致,可传到数据集-GPU界面。(此步骤在本示例中不需要,可直接选择公开数据集MnistDataset_torch.zip) | |||
| ### 2.2 执行脚本准备 | |||
| #### 示例代码 | |||
| - 示例代码可从本仓库中下载,[代码下载](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU) | |||
| - 代码文件说明 | |||
| - [train_for_c2net.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/train_for_c2net.py),用于智算网络训练的脚本文件。 | |||
| - [model.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/model.py),使用的训练网络,在单/多数据集训练,智算网络训练中使用到。 | |||
| #### 【重点】GCU-Pytorch代码适配 | |||
| - GCU初始化 | |||
| ``` | |||
| def is_torch_dtu_available(): | |||
| if importlib.util.find_spec("torch_dtu") is None: | |||
| return False | |||
| if importlib.util.find_spec("torch_dtu.core") is None: | |||
| return False | |||
| return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None | |||
| if is_torch_dtu_available(): | |||
| import torch_dtu | |||
| import torch_dtu.distributed as dist | |||
| import torch_dtu.core.dtu_model as dm | |||
| from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP | |||
| ``` | |||
| - device计算设备指定 | |||
| ``` | |||
| if is_torch_dtu_available(): | |||
| device = dm.dtu_device() | |||
| else: | |||
| device = torch.device("cpu") | |||
| ``` | |||
| - 优化器更新接口 | |||
| ``` | |||
| sgd = SGD(model.parameters(), lr=1e-1) | |||
| for _epoch in range(epoch): | |||
| loss.backward() | |||
| if is_torch_dtu_available(): | |||
| dm.optimizer_step(sgd, barrier=True) | |||
| else: | |||
| sgd.step() | |||
| ``` | |||
| ## 3 创建训练任务 | |||
| 准备好数据和执行脚本以后,需要创建训练任务来运行GCU-Pytorch脚本。首次使用的用户可参考本示例代码。 | |||
| 启动脚本选择train_for_c2net.py | |||
| ## 4 查看运行结果 | |||
| ### 4.1 在训练作业界面可以查看运行日志 | |||
| 目前训练任务的日志在代码中print输出,参考示例train_for_c2net.py代码相关print | |||
| ### 4.2 训练结束后可以下载模型文件 | |||
| >>>>>>> origin/liuzx | |||
| ## 对于示例代码有任何问题,欢迎在本项目中提issue。 | |||
| @@ -5,9 +5,9 @@ If there are Chinese comments in the code,please add at the beginning: | |||
| #!/usr/bin/python | |||
| #coding=utf-8 | |||
| 示例选用的数据集是MnistDataset.zip | |||
| 示例选用的数据集是MnistDataset_torch.zip | |||
| 数据集结构是: | |||
| MnistDataset.zip | |||
| MnistDataset_torch.zip | |||
| ├── test | |||
| │ ├── MNIST/processed/test.pt | |||
| │ └── MNIST/processed/training.pt | |||
| @@ -21,17 +21,10 @@ If there are Chinese comments in the code,please add at the beginning: | |||
| │ ├── MNIST/raw/train-images-idx3-ubyte | |||
| │ └── MNIST/raw/train-labels-idx1-ubyte | |||
| │ ├── MNIST/raw/t10k-images-idx3-ubyte | |||
| │ └── MNIST/raw/t10k-labels-idx1-ubyte | |||
| 示例选用的预训练模型文件夹为MNIST_PytorchExample_GPU_test34_model_7f9j,模型文件为:mnist_epoch1_0.70.pkl | |||
| │ └── MNIST/raw/t10k-labels-idx1-ubyte | |||
| ''' | |||
| import os | |||
| os.system("pip uninstall openi-test") | |||
| os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) | |||
| import torch | |||
| from model import Model | |||
| import numpy as np | |||
| @@ -41,7 +34,9 @@ from torch.optim import SGD | |||
| from torch.utils.data import DataLoader | |||
| from torchvision.transforms import ToTensor | |||
| import argparse | |||
| from openi.context import prepare, upload_openi | |||
| import os | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| import importlib.util | |||
| @@ -57,30 +52,16 @@ parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') | |||
| parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') | |||
| if __name__ == '__main__': | |||
| #获取参数并忽略超参数报错 | |||
| args, unknown = parser.parse_known_args() | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| #获取数据集路径,预训练模型路径,输出路径 | |||
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| output_path = openi_context.output_path | |||
| dataset_path_A = dataset_path + "/MnistDataset" | |||
| pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j" | |||
| print("dataset_path:") | |||
| os.listdir(dataset_path) | |||
| print("pretrain_model_path:") | |||
| os.listdir(pretrain_model_path) | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| print("output_path:") | |||
| os.listdir(output_path) | |||
| # load DPU envs-xx.sh | |||
| DTU_FLAG = True | |||
| if is_torch_dtu_available(): | |||
| @@ -100,10 +81,11 @@ if __name__ == '__main__': | |||
| # 参数声明 | |||
| model = Model().to(device) | |||
| optimizer = SGD(model.parameters(), lr=1e-1) | |||
| args, unknown = parser.parse_known_args() | |||
| #log output | |||
| batch_size = args.batch_size | |||
| train_dataset = mnist.MNIST(root=dataset_path_A + "/train", train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=dataset_path_A + "/test", train=False, transform=ToTensor(),download=False) | |||
| train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False) | |||
| train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||
| test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||
| model = Model().to(device) | |||
| @@ -113,8 +95,8 @@ if __name__ == '__main__': | |||
| print('epoch_size is:{}'.format(epochs)) | |||
| # 如果有保存的模型,则加载模型,并在其基础上继续训练 | |||
| if os.path.exists(pretrain_model_path_A+"/mnist_epoch1_0.70.pkl"): | |||
| checkpoint = torch.load(pretrain_model_path_A+"/mnist_epoch1_0.70.pkl") | |||
| if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")): | |||
| checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")) | |||
| model.load_state_dict(checkpoint['model']) | |||
| optimizer.load_state_dict(checkpoint['optimizer']) | |||
| start_epoch = checkpoint['epoch'] | |||
| @@ -155,8 +137,7 @@ if __name__ == '__main__': | |||
| correct += np.sum(_.numpy(), axis=-1) | |||
| _sum += _.shape[0] | |||
| print('accuracy: {:.2f}'.format(correct / _sum)) | |||
| #The model output location is placed under output_path | |||
| #The model output location is placed under /tmp/output | |||
| state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1} | |||
| torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(output_path,_epoch+1, correct / _sum)) | |||
| print('test:') | |||
| print(os.listdir(output_path)) | |||
| torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum)) | |||
| print(os.listdir('{}'.format(c2net_context.output_path))) | |||
| @@ -24,8 +24,14 @@ from torch.optim import SGD | |||
| from torch.utils.data import DataLoader | |||
| from torchvision.transforms import ToTensor | |||
| import argparse | |||
| <<<<<<< HEAD | |||
| #导入openi包 | |||
| from openi.context import prepare, upload_openi | |||
| ======= | |||
| import os | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| >>>>>>> origin/liuzx | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| @@ -77,13 +83,14 @@ def test(model, test_loader, test_data): | |||
| if __name__ == '__main__': | |||
| args, unknown = parser.parse_known_args() | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| #获取数据集路径,预训练模型路径,输出路径 | |||
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| output_path = openi_context.output_path | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875" | |||
| MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| print("dataset_path:") | |||
| print(os.listdir(dataset_path)) | |||
| @@ -101,14 +108,24 @@ if __name__ == '__main__': | |||
| device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||
| batch_size = args.batch_size | |||
| epochs = args.epoch_size | |||
| <<<<<<< HEAD | |||
| train_dataset = mnist.MNIST(root=os.path.join(dataset_path + "/MnistDataset_torch", "train"), train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=os.path.join(dataset_path+ "/MnistDataset_torch", "test"), train=False, transform=ToTensor(),download=False) | |||
| ======= | |||
| train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False) | |||
| >>>>>>> origin/liuzx | |||
| train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||
| test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||
| #如果有保存的模型,则加载模型,并在其基础上继续训练 | |||
| <<<<<<< HEAD | |||
| if os.path.exists(os.path.join(pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j", "mnist_epoch1_0.70.pkl")): | |||
| checkpoint = torch.load(os.path.join(pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j", "mnist_epoch1_0.70.pkl")) | |||
| ======= | |||
| if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")): | |||
| checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")) | |||
| >>>>>>> origin/liuzx | |||
| model.load_state_dict(checkpoint['model']) | |||
| optimizer.load_state_dict(checkpoint['optimizer']) | |||
| start_epoch = checkpoint['epoch'] | |||
| @@ -120,8 +137,10 @@ if __name__ == '__main__': | |||
| for epoch in range(start_epoch+1, epochs): | |||
| train(model, train_loader, epoch) | |||
| test(model, test_loader, test_dataset) | |||
| # 保存模型 | |||
| # 将模型保存到c2net_context.output_path | |||
| state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch} | |||
| torch.save(state, '{}/mnist_epoch{}.pkl'.format(output_path, epoch)) | |||
| torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch)) | |||
| #回传结果 | |||
| upload_output() | |||
| @@ -13,7 +13,7 @@ | |||
| 使用注意事项: | |||
| 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| 2、用户需要调用openi的python sdk包 | |||
| 2、用户需要调用c2net的python sdk包 | |||
| """ | |||
| import os | |||
| @@ -22,17 +22,15 @@ os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from dataset_distributed import create_dataset_parallel | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.context import ParallelMode | |||
| from mindspore.communication.management import init, get_rank | |||
| import time | |||
| #导入openi包 | |||
| from openi.context import prepare, upload_openi | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| @@ -51,56 +49,22 @@ parser.add_argument('--epoch_size', | |||
| if __name__ == "__main__": | |||
| ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| args, unknown = parser.parse_known_args() | |||
| data_dir = '' | |||
| pretrain_dir = '' | |||
| train_dir = '' | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| device_num = int(os.getenv('RANK_SIZE')) | |||
| #使用单卡时 | |||
| if device_num == 1: | |||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| data_dir = openi_context.dataset_path | |||
| pretrain_dir = openi_context.pretrain_model_path | |||
| train_dir = openi_context.output_path | |||
| #使用数据集的方式 | |||
| ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||
| #使用多卡时 | |||
| if device_num > 1: | |||
| # set device_id and init for multi-card training | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||
| init() | |||
| #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||
| local_rank=int(os.getenv('RANK_ID')) | |||
| if local_rank%8==0: | |||
| ###初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| data_dir = openi_context.dataset_path | |||
| pretrain_dir = openi_context.pretrain_model_path | |||
| train_dir = openi_context.output_path | |||
| #Set a cache file to determine whether the data has been copied to obs. | |||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||
| f = open("/cache/download_input.txt", 'w') | |||
| f.close() | |||
| try: | |||
| if os.path.exists("/cache/download_input.txt"): | |||
| print("download_input succeed") | |||
| except Exception as e: | |||
| print("download_input failed") | |||
| while not os.path.exists("/cache/download_input.txt"): | |||
| time.sleep(1) | |||
| ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||
| #使用数据集的方式 | |||
| ds_train = create_dataset(os.path.join(mnistdata_path + "/MNISTData", "train"), cfg.batch_size) | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| @@ -116,12 +80,8 @@ if __name__ == "__main__": | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||
| # In this example, get_rank() is added to distinguish different paths. | |||
| if device_num == 1: | |||
| outputDirectory = train_dir + "/" | |||
| if device_num > 1: | |||
| outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||
| #将模型保存到c2net_context.output_path | |||
| outputDirectory = c2net_context.output_path + "/" | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=outputDirectory, | |||
| config=config_ck) | |||
| @@ -132,5 +92,5 @@ if __name__ == "__main__": | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()]) | |||
| ###上传训练结果到启智平台,注意必须将要输出的模型存储在openi_context.output_path | |||
| upload_openi() | |||
| ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path | |||
| upload_output() | |||
| @@ -0,0 +1,117 @@ | |||
| """ | |||
| 示例选用的数据集是MNISTData.zip | |||
| 数据集结构是: | |||
| MNISTData.zip | |||
| ├── test | |||
| │ ├── t10k-images-idx3-ubyte | |||
| │ └── t10k-labels-idx1-ubyte | |||
| └── train | |||
| ├── train-images-idx3-ubyte | |||
| └── train-labels-idx1-ubyte | |||
| 使用注意事项: | |||
| 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| 2、用户需要调用c2net的python sdk包 | |||
| """ | |||
| import os | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset_distributed import create_dataset_parallel | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.context import ParallelMode | |||
| from mindspore.communication.management import init, get_rank | |||
| import time | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| if __name__ == "__main__": | |||
| ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| args, unknown = parser.parse_known_args() | |||
| device_num = int(os.getenv('RANK_SIZE')) | |||
| #使用多卡时 | |||
| # set device_id and init for multi-card training | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||
| init() | |||
| #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||
| local_rank=int(os.getenv('RANK_ID')) | |||
| if local_rank%8==0: | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| #Set a cache file to determine whether the data has been copied to obs. | |||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||
| f = open("/cache/download_input.txt", 'w') | |||
| f.close() | |||
| try: | |||
| if os.path.exists("/cache/download_input.txt"): | |||
| print("download_input succeed") | |||
| except Exception as e: | |||
| print("download_input failed") | |||
| while not os.path.exists("/cache/download_input.txt"): | |||
| time.sleep(1) | |||
| ds_train = create_dataset_parallel(os.path.join(mnistdata_path, "train"), cfg.batch_size) | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy"}) | |||
| else: | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy"}, | |||
| amp_level="O2") | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||
| # In this example, get_rank() is added to distinguish different paths. | |||
| outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/" | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=outputDirectory, | |||
| config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| epoch_size = cfg['epoch_size'] | |||
| if (args.epoch_size): | |||
| epoch_size = args.epoch_size | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()]) | |||
| ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path | |||
| upload_output() | |||