diff --git a/README.md b/README.md
index 453ce47..fd89f54 100644
--- a/README.md
+++ b/README.md
@@ -1,40 +1,33 @@
- # 如何在启智平台上进行模型调试和训练
-## 1. openi的使用方法
+## 1. c2net的sdk使用方法
 
-安装openi包
+安装c2net-beta包
 
 ```
-pip install -U openi
-```
-
-若是在智算训练任务中无网络,可以使用以下指令安装
-
-```text
-import os
-os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))
+pip install -U c2net-beta
 ```
 
-使用openi包
+使用c2net的SDK方式
 
 ```
 #导入包
-from openi.context import prepare, upload_openi
+from c2net.context import prepare, upload_output
 
 #初始化导入数据集和预训练模型到容器内
-openi_context = prepare()
+c2net_context = prepare()
 
 #获取数据集路径,预训练模型路径,输出路径
-dataset_path = openi_context.dataset_path
-pretrain_model_path = openi_context.pretrain_model_path
-output_path = openi_context.output_path
+dataset_path = c2net_context.dataset_path
+pretrain_model_path = c2net_context.pretrain_model_path
+output_path = c2net_context.output_path
 
 #回传结果到openi,训练任务才能回传,调试任务回传后也是不支持下载
-upload_openi()
+upload_output()
 ```
 
 ## 2. 手写数字识别示例
 
 * GPU示例请参考[gpu_mnist_example](./gpu_mnist_example/README.md)
 * NPU示例请参考[npu_mnist_example](./npu_mnist_example/README.md)
+* GCU示例请参考[gcu_mnist_example](./gcu_mnist_example/README.md)
diff --git a/gcu_mnist_example/README.md b/gcu_mnist_example/README.md
index 3b9351e..c741b59 100644
--- a/gcu_mnist_example/README.md
+++ b/gcu_mnist_example/README.md
@@ -46,5 +47,125 @@ upload_openi 将训练镜像的输出结果拷贝回启智平台
 ### 3.2 解决参数报错问题:
 请在代码中加入 `args, unknown = parser.parse_known_args()`,可忽略掉 `--ckpt_url`, `--multi_data_url`等参数报错问题
+
+# 如何在启智平台上进行模型训练 - GCU版本
+
+- 启智集群单数据集训练、启智集群多数据集训练、智算集群单数据集训练这三种训练的使用方式不同,请注意区分。数据加载方式、模型定义逻辑大致同[手写数字识别GPU版本_PytorchExample](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GPU)项目:
+
+  - 智算集群单数据集的训练示例请参考示例中[train_for_c2net.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/train_for_c2net.py)的代码注释
+- 智算集群中单/多数据集使用方式:
+
+  如本示例中数据集MnistDataset_torch.zip的使用方式是:数据集位于/tmp/dataset/下
+
+## 1 概述
+
+- 本项目以LeNet5-MNIST-PyTorch为例,简要介绍如何在启智AI协同平台上使用GCU集群+Pytorch完成训练任务,旨在为AI开发者提供启智训练示例。
+- 用户可以直接使用提供的[MNIST数据集](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/datasets?type=0)和代码文件创建自己的训练任务。
+
+## 2 准备工作
+
+- 启智平台使用准备:本项目需要用户创建启智平台账户、克隆代码到自己的账户、上传数据集,具体操作方法可以访问[OpenI_Learning](https://openi.pcl.ac.cn/zeizei/OpenI_Learning)项目,学习小白训练营系列课程。
+
+### 2.1 数据准备
+
+#### 数据集获取
+
+- 如果你需要试运行本示例,则无需再次上传数据集,因为示例中的数据集MnistDataset_torch.zip已经设置为公开数据集,可以直接引用。
+- 数据文件说明
+
+  - MNISTData数据集由10类28∗28的灰度图片组成,训练数据集包含60000张图片,测试数据集包含10000张图片。
+
+#### 数据集上传
+
+使用GCU进行训练,使用的框架为Pytorch,上传和使用数据集的格式与GPU保持一致,可上传到数据集-GPU界面。(此步骤在本示例中不需要,可直接选择公开数据集MnistDataset_torch.zip)
+
+### 2.2 执行脚本准备
+
+#### 示例代码
+
+- 示例代码可从本仓库中下载:[代码下载](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU)
+- 代码文件说明
+
+  - [train_for_c2net.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/train_for_c2net.py),用于智算网络训练的脚本文件。
+  - [model.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/model.py),使用的训练网络,在单/多数据集训练和智算网络训练中都会用到。
+
+#### 【重点】GCU-Pytorch代码适配
+
+需要适配的代码主要有以下三处,三处合并后的完整示例见本节末尾:
+
+- GCU初始化
+
+  ```
+  import importlib.util
+
+  def is_torch_dtu_available():
+      if importlib.util.find_spec("torch_dtu") is None:
+          return False
+      if importlib.util.find_spec("torch_dtu.core") is None:
+          return False
+      return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None
+
+  if is_torch_dtu_available():
+      import torch_dtu
+      import torch_dtu.distributed as dist
+      import torch_dtu.core.dtu_model as dm
+      from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
+  ```
+- device计算设备指定
+
+  ```
+  if is_torch_dtu_available():
+      device = dm.dtu_device()
+  else:
+      device = torch.device("cpu")
+  ```
+- 优化器更新接口
+
+  ```
+  sgd = SGD(model.parameters(), lr=1e-1)
+  for _epoch in range(epoch):
+      loss.backward()
+      if is_torch_dtu_available():
+          dm.optimizer_step(sgd, barrier=True)
+      else:
+          sgd.step()
+  ```
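+
+下面把上述三处适配串成一个最小训练循环示意(非本仓库已有代码,仅供参考:其中 Model 来自本示例的 model.py,train_loader 假定为已按前文方式构建好的 DataLoader,损失函数以交叉熵为例;若环境中没有 torch_dtu,会自动回退到 CPU 执行):
+
+```
+import importlib.util
+import torch
+from torch.optim import SGD
+from model import Model
+
+def is_torch_dtu_available():
+    if importlib.util.find_spec("torch_dtu") is None:
+        return False
+    if importlib.util.find_spec("torch_dtu.core") is None:
+        return False
+    return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None
+
+#device计算设备指定
+if is_torch_dtu_available():
+    import torch_dtu.core.dtu_model as dm
+    device = dm.dtu_device()
+else:
+    device = torch.device("cpu")
+
+model = Model().to(device)
+sgd = SGD(model.parameters(), lr=1e-1)
+loss_fn = torch.nn.CrossEntropyLoss()
+
+#优化器更新:GCU 上用 dm.optimizer_step 代替 sgd.step()
+for images, labels in train_loader:
+    sgd.zero_grad()
+    loss = loss_fn(model(images.to(device)), labels.to(device))
+    loss.backward()
+    if is_torch_dtu_available():
+        dm.optimizer_step(sgd, barrier=True)
+    else:
+        sgd.step()
+```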
+
+## 3 创建训练任务
+
+准备好数据和执行脚本以后,需要创建训练任务来运行GCU-Pytorch脚本。首次使用的用户可参考本示例代码。
+
+启动脚本选择train_for_c2net.py。
+
+## 4 查看运行结果
+
+### 4.1 在训练作业界面可以查看运行日志
+
+目前训练任务的日志通过代码中的print输出,可参考示例train_for_c2net.py中相关的print语句。
+
+### 4.2 训练结束后可以下载模型文件
 
 ## 对于示例代码有任何问题,欢迎在本项目中提issue。
diff --git a/gcu_mnist_example/train_gcu.py b/gcu_mnist_example/train_gcu.py
index 51f4c27..b49cc5d 100644
--- a/gcu_mnist_example/train_gcu.py
+++ b/gcu_mnist_example/train_gcu.py
@@ -5,9 +5,9 @@ If there are Chinese comments in the code,please add at the beginning:
 #!/usr/bin/python
 #coding=utf-8
 
-示例选用的数据集是MnistDataset.zip
+示例选用的数据集是MnistDataset_torch.zip
 数据集结构是:
-    MnistDataset.zip
+    MnistDataset_torch.zip
     ├── test
     │   ├── MNIST/processed/test.pt
     │   └── MNIST/processed/training.pt
@@ -21,17 +21,10 @@ If there are Chinese comments in the code,please add at the beginning:
     │   ├── MNIST/raw/train-images-idx3-ubyte
     │   └── MNIST/raw/train-labels-idx1-ubyte
     │   ├── MNIST/raw/t10k-images-idx3-ubyte
-    │   └── MNIST/raw/t10k-labels-idx1-ubyte
-
-
-
-示例选用的预训练模型文件夹为MNIST_PytorchExample_GPU_test34_model_7f9j,模型文件为:mnist_epoch1_0.70.pkl
+    │   └── MNIST/raw/t10k-labels-idx1-ubyte
 '''
 
-import os
-os.system("pip uninstall openi-test")
-os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))
 import torch
 from model import Model
 import numpy as np
@@ -41,7 +34,9 @@ from torch.optim import SGD
 from torch.utils.data import DataLoader
 from torchvision.transforms import ToTensor
 import argparse
-from openi.context import prepare, upload_openi
+import os
+#导入c2net包
+from c2net.context import prepare, upload_output
 
 import importlib.util
 
@@ -57,30 +52,16 @@ parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
 parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
 parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')
-
 if __name__ == '__main__':
-    #获取参数并忽略超参数报错
     args, unknown = parser.parse_known_args()
-
-    #初始化导入数据集和预训练模型到容器内
-    openi_context = prepare()
-
-    #获取数据集路径,预训练模型路径,输出路径
-    dataset_path = openi_context.dataset_path
-    pretrain_model_path = openi_context.pretrain_model_path
-    output_path = openi_context.output_path
-
-    dataset_path_A = dataset_path + "/MnistDataset"
-    pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j"
-    print("dataset_path:")
-    os.listdir(dataset_path)
-
-    print("pretrain_model_path:")
-    os.listdir(pretrain_model_path)
+    #初始化导入数据集和预训练模型到容器内
+    c2net_context = prepare()
+    #获取数据集路径
+    MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
+    #获取预训练模型路径
+    mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
 
-    print("output_path:")
-    os.listdir(output_path)
     # load DPU envs-xx.sh
     DTU_FLAG = True
     if is_torch_dtu_available():
@@ -100,10 +81,11 @@ if __name__ == '__main__':
     # 参数声明
     model = Model().to(device)
     optimizer = SGD(model.parameters(), lr=1e-1)
     #log output
     batch_size = args.batch_size
-    train_dataset = mnist.MNIST(root=dataset_path_A + "/train", train=True, transform=ToTensor(),download=False)
-    test_dataset = mnist.MNIST(root=dataset_path_A + "/test", train=False, transform=ToTensor(),download=False)
+    train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False)
+    test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False)
     train_loader = DataLoader(train_dataset, batch_size=batch_size)
     test_loader = DataLoader(test_dataset, batch_size=batch_size)
     model = Model().to(device)
@@ -113,8 +95,8 @@ if __name__ == '__main__':
     print('epoch_size is:{}'.format(epochs))
 
     # 如果有保存的模型,则加载模型,并在其基础上继续训练
-    if os.path.exists(pretrain_model_path_A+"/mnist_epoch1_0.70.pkl"):
-        checkpoint = torch.load(pretrain_model_path_A+"/mnist_epoch1_0.70.pkl")
+    if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")):
+        checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl"))
         model.load_state_dict(checkpoint['model'])
         optimizer.load_state_dict(checkpoint['optimizer'])
         start_epoch = checkpoint['epoch']
@@ -155,8 +137,7 @@ if __name__ == '__main__':
             correct += np.sum(_.numpy(), axis=-1)
             _sum += _.shape[0]
         print('accuracy: {:.2f}'.format(correct / _sum))
-        #The model output location is placed under output_path
+        #The model output location is placed under /tmp/output
         state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1}
-        torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(output_path,_epoch+1, correct / _sum))
-        print('test:')
-        print(os.listdir(output_path))
\ No newline at end of file
+        torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum))
+        print(os.listdir('{}'.format(c2net_context.output_path)))
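+
+    #(补充示例,非原仓库代码)如果是训练任务且希望把输出结果回传到启智平台,
+    #可以在训练循环结束后调用已在上方导入的 upload_output();调试任务可省略这一步
+    upload_output()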
diff --git a/gpu_mnist_example/train_gpu.py b/gpu_mnist_example/train_gpu.py
index 3abe746..4a94d95 100644
--- a/gpu_mnist_example/train_gpu.py
+++ b/gpu_mnist_example/train_gpu.py
@@ -24,8 +24,14 @@ from torch.optim import SGD
 from torch.utils.data import DataLoader
 from torchvision.transforms import ToTensor
 import argparse
-#导入openi包
-from openi.context import prepare, upload_openi
+import os
+#导入c2net包
+from c2net.context import prepare, upload_output
 
 # Training settings
 parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
@@ -77,13 +83,14 @@ def test(model, test_loader, test_data):
 
 if __name__ == '__main__':
     args, unknown = parser.parse_known_args()
-    #初始化导入数据集和预训练模型到容器内
-    openi_context = prepare()
-    #获取数据集路径,预训练模型路径,输出路径
-    dataset_path = openi_context.dataset_path
-    pretrain_model_path = openi_context.pretrain_model_path
-    output_path = openi_context.output_path
+    #初始化导入数据集和预训练模型到容器内
+    c2net_context = prepare()
+    #获取数据集路径,预训练模型路径,输出路径
+    dataset_path = c2net_context.dataset_path
+    pretrain_model_path = c2net_context.pretrain_model_path
+    output_path = c2net_context.output_path
+    #拼接本示例使用的数据集与预训练模型的具体路径
+    checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875"
+    MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
+    mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
 
     print("dataset_path:")
     print(os.listdir(dataset_path))
@@ -101,14 +108,24 @@ if __name__ == '__main__':
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     batch_size = args.batch_size
     epochs = args.epoch_size
-    train_dataset = mnist.MNIST(root=os.path.join(dataset_path + "/MnistDataset_torch", "train"), train=True, transform=ToTensor(),download=False)
-    test_dataset = mnist.MNIST(root=os.path.join(dataset_path+ "/MnistDataset_torch", "test"), train=False, transform=ToTensor(),download=False)
+    train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False)
+    test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False)
     train_loader = DataLoader(train_dataset, batch_size=batch_size)
     test_loader = DataLoader(test_dataset, batch_size=batch_size)
 
     #如果有保存的模型,则加载模型,并在其基础上继续训练
-    if os.path.exists(os.path.join(pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j", "mnist_epoch1_0.70.pkl")):
-        checkpoint = torch.load(os.path.join(pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j", "mnist_epoch1_0.70.pkl"))
+    if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")):
+        checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl"))
         model.load_state_dict(checkpoint['model'])
         optimizer.load_state_dict(checkpoint['optimizer'])
         start_epoch = checkpoint['epoch']
@@ -120,8 +137,10 @@ if __name__ == '__main__':
     for epoch in range(start_epoch+1, epochs):
         train(model, train_loader, epoch)
         test(model, test_loader, test_dataset)
-        # 保存模型
+        # 将模型保存到c2net_context.output_path
         state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
-        torch.save(state, '{}/mnist_epoch{}.pkl'.format(output_path, epoch))
+        torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch))
+
+    #回传结果
+    upload_output()
diff --git a/npu_mnist_example/train_npu.py b/npu_mnist_example/train_npu.py
index 59d4608..c1c776c 100644
--- a/npu_mnist_example/train_npu.py
+++ b/npu_mnist_example/train_npu.py
@@ -13,7 +13,7 @@
 
 使用注意事项:
 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
-2、用户需要调用openi的python sdk包
+2、用户需要调用c2net的python sdk包
 """
 
 import os
@@ -22,17 +22,15 @@ os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))
 import argparse
 from config import mnist_cfg as cfg
 from dataset import create_dataset
-from dataset_distributed import create_dataset_parallel
 from lenet import LeNet5
 import mindspore.nn as nn
 from mindspore import context
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore import load_checkpoint, load_param_into_net
 from mindspore.train import Model
-from mindspore.context import ParallelMode
-from mindspore.communication.management import init, get_rank
 import time
-#导入openi包
-from openi.context import prepare, upload_openi
+#导入c2net包
+from c2net.context import prepare, upload_output
 
 
 parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
@@ -51,56 +49,22 @@ parser.add_argument('--epoch_size',
 if __name__ == "__main__":
     ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
     args, unknown = parser.parse_known_args()
-    data_dir = ''
-    pretrain_dir = ''
-    train_dir = ''
+    #初始化导入数据集和预训练模型到容器内
+    c2net_context = prepare()
+    #获取数据集路径
+    mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData"
+    #获取预训练模型路径
+    mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
 
     device_num = int(os.getenv('RANK_SIZE'))
-    #使用单卡时
-    if device_num == 1:
-        context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
-        #初始化导入数据集和预训练模型到容器内
-        openi_context = prepare()
-        data_dir = openi_context.dataset_path
-        pretrain_dir = openi_context.pretrain_model_path
-        train_dir = openi_context.output_path
-        #使用数据集的方式
-        ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size)
-    #使用多卡时
-    if device_num > 1:
-        # set device_id and init for multi-card training
-        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
-        init()
-        #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data
-        local_rank=int(os.getenv('RANK_ID'))
-        if local_rank%8==0:
-            ###初始化导入数据集和预训练模型到容器内
-            openi_context = prepare()
-            #初始化导入数据集和预训练模型到容器内
-            openi_context = prepare()
-            data_dir = openi_context.dataset_path
-            pretrain_dir = openi_context.pretrain_model_path
-            train_dir = openi_context.output_path
-            #Set a cache file to determine whether the data has been copied to obs.
-            #If this file exists during multi-card training, there is no need to copy the dataset multiple times.
-            f = open("/cache/download_input.txt", 'w')
-            f.close()
-            try:
-                if os.path.exists("/cache/download_input.txt"):
-                    print("download_input succeed")
-            except Exception as e:
-                print("download_input failed")
-        while not os.path.exists("/cache/download_input.txt"):
-            time.sleep(1)
-        ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size)
-
+    context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
+    #使用数据集的方式
+    ds_train = create_dataset(os.path.join(mnistdata_path, "train"), cfg.batch_size)
     network = LeNet5(cfg.num_classes)
     net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
     net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
     time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
-
+    load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt")))
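+    #(补充注释)上一行直接从所选预训练模型目录加载 checkpoint_lenet-1_1875.ckpt;
+    #若训练任务没有选择该预训练模型,可先用 os.path.exists 判断文件是否存在,再决定是否加载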
     if args.device_target != "Ascend":
         model = Model(network,
                       net_loss,
@@ -116,12 +80,8 @@ if __name__ == "__main__":
     config_ck = CheckpointConfig(
         save_checkpoint_steps=cfg.save_checkpoint_steps,
         keep_checkpoint_max=cfg.keep_checkpoint_max)
-    #Note that this method saves the model file on each card. You need to specify the save path on each card.
-    # In this example, get_rank() is added to distinguish different paths.
-    if device_num == 1:
-        outputDirectory = train_dir + "/"
-    if device_num > 1:
-        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    #将模型保存到c2net_context.output_path
+    outputDirectory = c2net_context.output_path + "/"
     ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                  directory=outputDirectory,
                                  config=config_ck)
@@ -132,5 +92,5 @@
     print('epoch_size is: ', epoch_size)
     model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()])
 
-    ###上传训练结果到启智平台,注意必须将要输出的模型存储在openi_context.output_path
-    upload_openi()
\ No newline at end of file
+    ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path
+    upload_output()
\ No newline at end of file
diff --git a/npu_mnist_example/train_npu_multi_card.py b/npu_mnist_example/train_npu_multi_card.py
new file mode 100644
index 0000000..3b6df62
--- /dev/null
+++ b/npu_mnist_example/train_npu_multi_card.py
@@ -0,0 +1,117 @@
+
+
+"""
+示例选用的数据集是MNISTData.zip
+数据集结构是:
+    MNISTData.zip
+    ├── test
+    │   ├── t10k-images-idx3-ubyte
+    │   └── t10k-labels-idx1-ubyte
+    └── train
+        ├── train-images-idx3-ubyte
+        └── train-labels-idx1-ubyte
+
+使用注意事项:
+1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
+2、用户需要调用c2net的python sdk包
+"""
+
+import os
+import argparse
+from config import mnist_cfg as cfg
+from dataset_distributed import create_dataset_parallel
+from lenet import LeNet5
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore import load_checkpoint, load_param_into_net
+from mindspore.train import Model
+from mindspore.context import ParallelMode
+from mindspore.communication.management import init, get_rank
+import time
+#导入c2net包
+from c2net.context import prepare, upload_output
+
+
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+if __name__ == "__main__":
+    ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题
+    args, unknown = parser.parse_known_args()
+
+    device_num = int(os.getenv('RANK_SIZE'))
+    #使用多卡时
+    # set device_id and init for multi-card training
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
+    context.reset_auto_parallel_context()
+    context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
+    init()
+    #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data
+    local_rank=int(os.getenv('RANK_ID'))
+    if local_rank%8==0:
+        #初始化导入数据集和预训练模型到容器内
+        c2net_context = prepare()
+        #获取数据集路径
+        mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData"
+        #获取预训练模型路径
+        mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
+        #Set a cache file to determine whether the data has been copied to obs.
+        #If this file exists during multi-card training, there is no need to copy the dataset multiple times.
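+        #(补充注释)标记文件由0号卡在 prepare() 拷贝完数据后创建,其余卡在下方的 while 循环中轮询等待;
+        #注意 c2net_context、mnistdata_path 等变量同样只在 local_rank%8==0 的进程中被赋值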
+        f = open("/cache/download_input.txt", 'w')
+        f.close()
+        try:
+            if os.path.exists("/cache/download_input.txt"):
+                print("download_input succeed")
+        except Exception as e:
+            print("download_input failed")
+    while not os.path.exists("/cache/download_input.txt"):
+        time.sleep(1)
+    ds_train = create_dataset_parallel(os.path.join(mnistdata_path, "train"), cfg.batch_size)
+
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+    load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt")))
+    if args.device_target != "Ascend":
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy"})
+    else:
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy"},
+                      amp_level="O2")
+
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=cfg.save_checkpoint_steps,
+        keep_checkpoint_max=cfg.keep_checkpoint_max)
+    #Note that this method saves the model file on each card. You need to specify the save path on each card.
+    # In this example, get_rank() is added to distinguish different paths.
+    outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/"
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                 directory=outputDirectory,
+                                 config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+    model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()])
+
+    ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path
+    upload_output()
\ No newline at end of file