From faab1b0896483ba5135df7891411b4625e2a2561 Mon Sep 17 00:00:00 2001
From: liuzx
Date: Tue, 31 Oct 2023 18:16:02 +0800
Subject: [PATCH 1/3] add gcu example

---
 gcu_mnist_example/README.md    | 120 +++++++++++++++++++++++++
 gcu_mnist_example/model.py     |  35 ++++++++
 gcu_mnist_example/train_gcu.py | 160 +++++++++++++++++++++++++++++++++
 3 files changed, 315 insertions(+)
 create mode 100644 gcu_mnist_example/README.md
 create mode 100644 gcu_mnist_example/model.py
 create mode 100644 gcu_mnist_example/train_gcu.py

diff --git a/gcu_mnist_example/README.md b/gcu_mnist_example/README.md
new file mode 100644
index 0000000..ca723b4
--- /dev/null
+++ b/gcu_mnist_example/README.md
@@ -0,0 +1,120 @@
+# How to train a model on the OpenI platform - GCU version
+
+- Single-dataset training on the OpenI cluster, multi-dataset training on the OpenI cluster, and single-dataset training on the intelligent computing (C2Net) cluster are used in three different ways; please note the difference. The data loading and model definition logic is largely the same as in the [GPU version of the handwritten-digit-recognition PyTorch example](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GPU) project:
+
+  - For single-dataset training on the C2Net cluster, see the code comments in [train_for_c2net.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/train_for_c2net.py)
+- How single/multiple datasets are used on the C2Net cluster:
+
+  For example, the dataset MnistDataset_torch.zip in this example is used as follows: the dataset is placed under /tmp/dataset/
+
+## 1 Overview
+
+- This project uses LeNet5-MNIST-PyTorch as an example to briefly show how to complete a training task with a GCU cluster and PyTorch on the OpenI AI collaboration platform, and is intended as a training example for AI developers.
+- Users can create their own training tasks directly with the provided [MNIST dataset](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/datasets?type=0) and code files.
+
+## 2 Preparation
+
+- To get ready to use the OpenI platform, you need an OpenI account, a clone of the code under your own account, and an uploaded dataset. For detailed instructions, see the beginner training camp course series in the [OpenI_Learning](https://openi.pcl.ac.cn/zeizei/OpenI_Learning) project.
+
+### 2.1 Data preparation
+
+#### Getting the dataset
+
+- If you just want to do a trial run of this example, you do not need to upload the dataset again: the example dataset MnistDataset_torch.zip is already a public dataset and can be referenced directly.
+- About the data files
+
+  - The MNIST dataset consists of 28×28 grayscale images in 10 classes; the training set contains 60,000 images and the test set contains 10,000 images.
+
+#### Uploading the dataset
+
+Training on GCU uses the PyTorch framework. Datasets are uploaded and used in the same format as for GPU and can be uploaded on the Dataset - GPU page. (This step is not needed for this example; simply select the public dataset MnistDataset_torch.zip.)
+
+### 2.2 Preparing the training script
+
+#### Example code
+
+- The example code can be downloaded from this repository: [code download](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU)
+- About the code files
+
+  - [train_for_c2net.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/train_for_c2net.py), the training script for the intelligent computing network (C2Net).
+  - [model.py](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GCU/src/branch/master/model.py), the network used for training; it is used in single/multi-dataset training and in C2Net training.
+
+#### [Key point] Adapting PyTorch code for GCU
+
+- GCU initialization
+
+  ```
+  def is_torch_dtu_available():
+      if importlib.util.find_spec("torch_dtu") is None:
+          return False
+      if importlib.util.find_spec("torch_dtu.core") is None:
+          return False
+      return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None
+
+  if is_torch_dtu_available():
+      import torch_dtu
+      import torch_dtu.distributed as dist
+      import torch_dtu.core.dtu_model as dm
+      from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
+  ```
+- Specifying the compute device
+
+  ```
+  if is_torch_dtu_available():
+      device = dm.dtu_device()
+  else:
+      device = torch.device("cpu")
+  ```
+- Optimizer update interface
+
+  ```
+  sgd = SGD(model.parameters(), lr=1e-1)
+  for _epoch in range(epoch):
+      loss.backward()
+      if is_torch_dtu_available():
+          dm.optimizer_step(sgd, barrier=True)
+      else:
+          sgd.step()
+  ```
+
+## 3 Creating a training task
+
+Once the data and the training script are ready, create a training task to run the GCU-PyTorch script. First-time users can refer to the example code in this repository.
+
+Select train_for_c2net.py as the startup script.
+
+## 4 Viewing the results
+
+### 4.1 The run log can be viewed on the training job page
+
+The training task currently logs through print statements in the code; see the related print calls in the example train_for_c2net.py.
+
+### 4.2 The model files can be downloaded after training finishes
+
+## For any questions about the example code, feel free to open an issue in this project.
diff --git a/gcu_mnist_example/model.py b/gcu_mnist_example/model.py
new file mode 100644
index 0000000..157bad6
--- /dev/null
+++ b/gcu_mnist_example/model.py
@@ -0,0 +1,35 @@
+from torch.nn import Module
+from torch import nn
+
+
+class Model(Module):
+    def __init__(self):
+        super(Model, self).__init__()
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.relu1 = nn.ReLU()
+        self.pool1 = nn.MaxPool2d(2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.relu2 = nn.ReLU()
+        self.pool2 = nn.MaxPool2d(2)
+        self.fc1 = nn.Linear(256, 120)
+        self.relu3 = nn.ReLU()
+        self.fc2 = nn.Linear(120, 84)
+        self.relu4 = nn.ReLU()
+        self.fc3 = nn.Linear(84, 10)
+        self.relu5 = nn.ReLU()
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.relu1(y)
+        y = self.pool1(y)
+        y = self.conv2(y)
+        y = self.relu2(y)
+        y = self.pool2(y)
+        y = y.view(y.shape[0], -1)
+        y = self.fc1(y)
+        y = self.relu3(y)
+        y = self.fc2(y)
+        y = self.relu4(y)
+        y = self.fc3(y)
+        y = self.relu5(y)
+        return y
\ No newline at end of file
diff --git a/gcu_mnist_example/train_gcu.py b/gcu_mnist_example/train_gcu.py
new file mode 100644
index 0000000..5572bb7
--- /dev/null
+++ b/gcu_mnist_example/train_gcu.py
@@ -0,0 +1,160 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code,please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+
+示例选用的数据集是MnistDataset_torch.zip
+数据集结构是:
+    MnistDataset_torch.zip
+    ├── test
+    │   ├── MNIST/processed/test.pt
+    │   └── MNIST/processed/training.pt
+    │   ├── MNIST/raw/train-images-idx3-ubyte
+    │   └── MNIST/raw/train-labels-idx1-ubyte
+    │   ├── MNIST/raw/t10k-images-idx3-ubyte
+    │   └── MNIST/raw/t10k-labels-idx1-ubyte
+    ├── train
+    │   ├── MNIST/processed/test.pt
+    │   └── MNIST/processed/training.pt
+    │   ├── MNIST/raw/train-images-idx3-ubyte
+    │   └── MNIST/raw/train-labels-idx1-ubyte
+    │   ├── MNIST/raw/t10k-images-idx3-ubyte
+    │   └── MNIST/raw/t10k-labels-idx1-ubyte
+
+
+
+示例选用的预训练模型文件为:mnist_epoch1_0.86.pkl
+
+
+代码会自动放置在/tmp/code目录下。
+数据集在界面选择后,会自动放置在/tmp/dataset目录下。
+预训练模型文件在界面选择后,会自动放置在/tmp/pretrainmodel目录下。
+输出的模型文件也需要放置在/tmp/output目录下,平台会自动下载/tmp/output目录下的文件。
+如果选用了多数据集,则应在/tmp/dataset后带上数据集名称,比如/tmp/dataset/MnistDataset_torch/train
+'''
+
+import torch
+from model import Model
+import numpy as np
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import os
+#导入openi包
+from openi.context import prepare, upload_openi
+
+import importlib.util
+
+def is_torch_dtu_available():
+    if importlib.util.find_spec("torch_dtu") is None:
+        return False
+    if importlib.util.find_spec("torch_dtu.core") is None:
+        return False
+    return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+#The dataset location is placed under /dataset
+parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset')
+parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset')
+
+parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
+parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')
+parser.add_argument('--ckpt_url', default="", help='pretrain model path')
+parser.add_argument('--pretrainmodel', default="/tmp/pretrainmodel/mnist_epoch1_0.86.pkl", help='pretrain model path')
+
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #初始化导入数据集和预训练模型到容器内
+    openi_context = prepare()
+
+    #获取数据集路径,预训练模型路径,输出路径
+    dataset_path = openi_context.dataset_path
+    pretrain_model_path = openi_context.pretrain_model_path
+    output_path = openi_context.output_path
+
+    # load DPU envs-xx.sh
+    DTU_FLAG = True
+    if is_torch_dtu_available():
+        import torch_dtu
+        import torch_dtu.distributed as dist
+        import torch_dtu.core.dtu_model as dm
+        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
+        print('dtu is available: True')
+        device = dm.dtu_device()
+        DTU_FLAG = True
+    else:
+        print('dtu is available: False')
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        DTU_FLAG = False
+
+
+    # 参数声明
+    model = Model().to(device)
+    optimizer = SGD(model.parameters(), lr=1e-1)
+    args, unknown = parser.parse_known_args()
+    #log output
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=os.path.join(dataset_path, "train"), train=True, transform=ToTensor(),download=False)
+    test_dataset = mnist.MNIST(root=os.path.join(dataset_path, "test"), train=False, transform=ToTensor(),download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epochs = args.epoch_size
+    print('epoch_size is:{}'.format(epochs))
+
+    # 如果有保存的模型,则加载模型,并在其基础上继续训练
+    if os.path.exists(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")):
+        checkpoint = torch.load(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl"))
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = checkpoint['epoch']
+        print('加载 epoch {} 权重成功!'.format(start_epoch))
+    else:
+        start_epoch = 0
+        print('无保存模型,将从头开始训练!')
+
+    for _epoch in range(start_epoch, epochs):
+        print('the {} epoch_size begin'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            label_np = np.zeros((train_label.shape[0], 10))
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+            if idx % 10 == 0:
+                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            loss.backward()
+            if DTU_FLAG:
+                dm.optimizer_step(sgd, barrier=True)
+            else:
+                sgd.step()
+
+
+        correct = 0
+        _sum = 0
+        model.eval()
+        for idx, (test_x, test_label) in enumerate(test_loader):
+            test_x = test_x
+            test_label = test_label
+            predict_y = model(test_x.to(device).float()).detach()
+            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+            label_np = test_label.numpy()
+            _ = predict_ys == test_label
+            correct += np.sum(_.numpy(), axis=-1)
+            _sum += _.shape[0]
+        print('accuracy: {:.2f}'.format(correct / _sum))
+        #The model output location is placed under /tmp/output
+        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1}
+        torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(output_path, _epoch+1, correct / _sum))
+        print(os.listdir('{}'.format(output_path)))
\ No newline at end of file

From fddd1915bc65799983840520cebdc9b951150ce6 Mon Sep 17 00:00:00 2001
From: liuzx
Date: Thu, 4 Jan 2024 10:49:09 +0800
Subject: [PATCH 2/3] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BB=A3=E7=A0=81?=
 =?UTF-8?q?=EF=BC=8C=E6=9B=B4=E6=8D=A2=E5=8C=85=E5=90=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 gcu_mnist_example/train_gcu.py            |  47 +++------
 gpu_mnist_example/train_gpu.py            |  31 +++---
 npu_mnist_example/train_npu.py            |  79 ++++-----------
 npu_mnist_example/train_npu_multi_card.py | 117 ++++++++++++++++++++++
 4 files changed, 167 insertions(+), 107
deletions(-) create mode 100644 npu_mnist_example/train_npu_multi_card.py diff --git a/gcu_mnist_example/train_gcu.py b/gcu_mnist_example/train_gcu.py index 5572bb7..ee1867e 100644 --- a/gcu_mnist_example/train_gcu.py +++ b/gcu_mnist_example/train_gcu.py @@ -21,18 +21,8 @@ If there are Chinese comments in the code,please add at the beginning: │ ├── MNIST/raw/train-images-idx3-ubyte │ └── MNIST/raw/train-labels-idx1-ubyte │ ├── MNIST/raw/t10k-images-idx3-ubyte - │ └── MNIST/raw/t10k-labels-idx1-ubyte - - - -示例选用的预训练模型文件为:mnist_epoch1_0.86.pkl - + │ └── MNIST/raw/t10k-labels-idx1-ubyte -代码会自动放置在/tmp/code目录下。 -数据集在界面选择后,会自动放置在/tmp/dataset目录下。 -预训练模型文件在界面选择后,会自动放置在/tmp/pretrainmodel目录下。 -输出的模型文件也需要放置在/tmp/output目录下,平台会自动下载/tmp/output目录下的文件。 -如果选用了多数据集,则应在/tmp/dataset后带上数据集名称,比如/tmp/dataset/MnistDataset_torch/train ''' import torch @@ -45,8 +35,8 @@ from torch.utils.data import DataLoader from torchvision.transforms import ToTensor import argparse import os -#导入openi包 -from openi.context import prepare, upload_openi +#导入c2net包 +from c2net.context import prepare, upload_output import importlib.util @@ -59,25 +49,18 @@ def is_torch_dtu_available(): # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -#The dataset location is placed under /dataset -parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset') -parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset') - parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') -parser.add_argument('--ckpt_url', default="", help='pretrain model path') -parser.add_argument('--pretrainmodel', default="/tmp/pretrainmodel/mnist_epoch1_0.86.pkl", help='pretrain model path') - if __name__ == '__main__': args, unknown = parser.parse_known_args() - #初始化导入数据集和预训练模型到容器内 - openi_context = prepare() - #获取数据集路径,预训练模型路径,输出路径 - dataset_path = openi_context.dataset_path - pretrain_model_path = openi_context.pretrain_model_path - output_path = openi_context.output_path + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" + #获取预训练模型路径 + mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" # load DPU envs-xx.sh DTU_FLAG = True @@ -101,8 +84,8 @@ if __name__ == '__main__': args, unknown = parser.parse_known_args() #log output batch_size = args.batch_size - train_dataset = mnist.MNIST(root=os.path.join(dataset_path, "train"), train=True, transform=ToTensor(),download=False) - test_dataset = mnist.MNIST(root=os.path.join(dataset_path, "test"), train=False, transform=ToTensor(),download=False) + train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False) train_loader = DataLoader(train_dataset, batch_size=batch_size) test_loader = DataLoader(test_dataset, batch_size=batch_size) model = Model().to(device) @@ -112,8 +95,8 @@ if __name__ == '__main__': print('epoch_size is:{}'.format(epochs)) # 如果有保存的模型,则加载模型,并在其基础上继续训练 - if os.path.exists(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")): - checkpoint = torch.load(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")) + if 
os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")): + checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) start_epoch = checkpoint['epoch'] @@ -156,5 +139,5 @@ if __name__ == '__main__': print('accuracy: {:.2f}'.format(correct / _sum)) #The model output location is placed under /tmp/output state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1} - torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(output_path, _epoch+1, correct / _sum)) - print(os.listdir('{}'.format(output_path))) \ No newline at end of file + torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum)) + print(os.listdir('{}'.format(c2net_context.output_path))) \ No newline at end of file diff --git a/gpu_mnist_example/train_gpu.py b/gpu_mnist_example/train_gpu.py index f163b42..8636714 100644 --- a/gpu_mnist_example/train_gpu.py +++ b/gpu_mnist_example/train_gpu.py @@ -23,8 +23,8 @@ from torch.utils.data import DataLoader from torchvision.transforms import ToTensor import argparse import os -#导入openi包 -from openi.context import prepare, upload_openi +#导入c2net包 +from c2net.context import prepare, upload_output # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') @@ -76,27 +76,28 @@ def test(model, test_loader, test_data): if __name__ == '__main__': args, unknown = parser.parse_known_args() - #初始化导入数据集和预训练模型到容器内 - openi_context = prepare() - #获取数据集路径,预训练模型路径,输出路径 - dataset_path = openi_context.dataset_path - pretrain_model_path = openi_context.pretrain_model_path - output_path = openi_context.output_path + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875" + MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" + #获取预训练模型路径 + mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" #log output print('cuda is available:{}'.format(torch.cuda.is_available())) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") batch_size = args.batch_size epochs = args.epoch_size - train_dataset = mnist.MNIST(root=os.path.join(dataset_path, "train"), train=True, transform=ToTensor(),download=False) - test_dataset = mnist.MNIST(root=os.path.join(dataset_path, "test"), train=False, transform=ToTensor(),download=False) + train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False) train_loader = DataLoader(train_dataset, batch_size=batch_size) test_loader = DataLoader(test_dataset, batch_size=batch_size) #如果有保存的模型,则加载模型,并在其基础上继续训练 - if os.path.exists(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")): - checkpoint = torch.load(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")) + if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")): + checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) start_epoch = checkpoint['epoch'] @@ -108,8 +109,10 @@ if __name__ == '__main__': for epoch 
in range(start_epoch+1, epochs): train(model, train_loader, epoch) test(model, test_loader, test_dataset) - # 保存模型 + # 将模型保存到c2net_context.output_path state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch} - torch.save(state, '{}/mnist_epoch{}.pkl'.format(output_path, epoch)) + torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch)) + #回传结果 + upload_output() diff --git a/npu_mnist_example/train_npu.py b/npu_mnist_example/train_npu.py index 194f230..186acb9 100644 --- a/npu_mnist_example/train_npu.py +++ b/npu_mnist_example/train_npu.py @@ -13,24 +13,22 @@ 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 -2、用户需要调用openi的python sdk包 +2、用户需要调用c2net的python sdk包 """ import os import argparse from config import mnist_cfg as cfg from dataset import create_dataset -from dataset_distributed import create_dataset_parallel from lenet import LeNet5 import mindspore.nn as nn from mindspore import context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net from mindspore.train import Model -from mindspore.context import ParallelMode -from mindspore.communication.management import init, get_rank import time -#导入openi包 -from openi.context import prepare, upload_openi +#导入c2net包 +from c2net.context import prepare, upload_output parser = argparse.ArgumentParser(description='MindSpore Lenet Example') @@ -50,59 +48,22 @@ parser.add_argument('--epoch_size', if __name__ == "__main__": ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 args, unknown = parser.parse_known_args() - data_dir = '' - pretrain_dir = '' - train_dir = '' - - #回传结果到openi - upload_openi() + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" + #获取预训练模型路径 + mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" device_num = int(os.getenv('RANK_SIZE')) - #使用单卡时 - if device_num == 1: - context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) - #初始化导入数据集和预训练模型到容器内 - openi_context = prepare() - data_dir = openi_context.dataset_path - pretrain_dir = openi_context.pretrain_model_path - train_dir = openi_context.output_path - #使用数据集的方式 - ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) - #使用多卡时 - if device_num > 1: - # set device_id and init for multi-card training - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) - init() - #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data - local_rank=int(os.getenv('RANK_ID')) - if local_rank%8==0: - ###初始化导入数据集和预训练模型到容器内 - openi_context = prepare() - #初始化导入数据集和预训练模型到容器内 - openi_context = prepare() - data_dir = openi_context.dataset_path - pretrain_dir = openi_context.pretrain_model_path - train_dir = openi_context.output_path - #Set a cache file to determine whether the data has been copied to obs. - #If this file exists during multi-card training, there is no need to copy the dataset multiple times. 
- f = open("/cache/download_input.txt", 'w') - f.close() - try: - if os.path.exists("/cache/download_input.txt"): - print("download_input succeed") - except Exception as e: - print("download_input failed") - while not os.path.exists("/cache/download_input.txt"): - time.sleep(1) - ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) - + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + #使用数据集的方式 + ds_train = create_dataset(os.path.join(mnistdata_path + "/MNISTData", "train"), cfg.batch_size) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - + load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) if args.device_target != "Ascend": model = Model(network, net_loss, @@ -118,12 +79,8 @@ if __name__ == "__main__": config_ck = CheckpointConfig( save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) - #Note that this method saves the model file on each card. You need to specify the save path on each card. - # In this example, get_rank() is added to distinguish different paths. - if device_num == 1: - outputDirectory = train_dir + "/" - if device_num > 1: - outputDirectory = train_dir + "/" + str(get_rank()) + "/" + #将模型保存到c2net_context.output_path + outputDirectory = c2net_context.output_path + "/" ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=outputDirectory, config=config_ck) @@ -134,5 +91,5 @@ if __name__ == "__main__": print('epoch_size is: ', epoch_size) model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()]) - ###上传训练结果到启智平台,注意必须将要输出的模型存储在openi_context.output_path - upload_openi() \ No newline at end of file + ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path + upload_output() \ No newline at end of file diff --git a/npu_mnist_example/train_npu_multi_card.py b/npu_mnist_example/train_npu_multi_card.py new file mode 100644 index 0000000..3b6df62 --- /dev/null +++ b/npu_mnist_example/train_npu_multi_card.py @@ -0,0 +1,117 @@ + + +""" +示例选用的数据集是MNISTData.zip +数据集结构是: + MNISTData.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + +使用注意事项: +1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 +2、用户需要调用c2net的python sdk包 +""" + +import os +import argparse +from config import mnist_cfg as cfg +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time +#导入openi包 +from c2net.context import prepare, upload_output + + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + 
default=5, + help='Training epochs.') + +if __name__ == "__main__": + ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 + args, unknown = parser.parse_known_args() + + device_num = int(os.getenv('RANK_SIZE')) + #使用多卡时 + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" + #获取预训练模型路径 + mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + ds_train = create_dataset_parallel(os.path.join(mnistdata_path, "train"), cfg.batch_size) + + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy"}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. + outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()]) + + ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path + upload_output() \ No newline at end of file From c58dee47fb748b021888f58f0f538c7798199268 Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 4 Jan 2024 11:04:15 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E6=9B=B4=E6=96=B0readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 453ce47..fd89f54 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,33 @@ - # 如何在启智平台上进行模型调试和训练 -## 1. 
openi的使用方法
+## 1. c2net的sdk使用方法
 
-安装openi包
+安装c2net-beta包
 
 ```
-pip install -U openi
+pip install -U c2net-beta
 ```
 
-若是在智算训练任务中无网络,可以使用以下指令安装
-
-```text
-import os
-os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))
-```
-
-使用openi包
+使用c2net的SDK方式
 
 ```
 #导入包
-from openi.context import prepare, upload_openi
+from c2net.context import prepare, upload_output
 
 #初始化导入数据集和预训练模型到容器内
-openi_context = prepare()
+c2net_context = prepare()
 
 #获取数据集路径,预训练模型路径,输出路径
-dataset_path = openi_context.dataset_path
-pretrain_model_path = openi_context.pretrain_model_path
-output_path = openi_context.output_path
+dataset_path = c2net_context.dataset_path
+pretrain_model_path = c2net_context.pretrain_model_path
+output_path = c2net_context.output_path
 
 #回传结果到openi,训练任务才能回传,调试任务回传后也是不支持下载
-upload_openi()
+upload_output()
 ```
 
 ## 2. 手写数字识别示例
 
 * GPU示例请参考[gpu_mnist_example](./gpu_mnist_example/README.md)
 * NPU示例请参考[npu_mnist_example](./npu_mnist_example/README.md)
+* GCU示例请参考[gcu_mnist_example](./gcu_mnist_example/README.md)
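
For reference, below is a minimal sketch that ties together the pattern the patches above converge on: the c2net `prepare()`/`upload_output()` flow from the updated README combined with the GCU (torch_dtu) fallback used in train_gcu.py. It is not part of the patch series; it assumes the platform container provides the c2net and (optionally) torch_dtu packages, that `model.py` from gcu_mnist_example is importable, and that a dataset named `MnistDataset_torch` is attached to the task. Only the c2net and torch_dtu calls that appear in the patches are used; the dataset name, output file name, and single-epoch loop are illustrative.

```python
# -*- coding: utf-8 -*-
"""Minimal sketch: c2net prepare()/upload_output() plus the GCU/DTU fallback.
Assumptions: c2net and model.py are available in the container; torch_dtu is optional;
a dataset named MnistDataset_torch was attached to the training task."""
import importlib.util
import os

import torch
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.datasets import mnist
from torchvision.transforms import ToTensor

from c2net.context import prepare, upload_output
from model import Model  # LeNet-5 definition from gcu_mnist_example/model.py


def is_torch_dtu_available():
    # Same probe as train_gcu.py: torch_dtu.core.dtu_model must be importable.
    if importlib.util.find_spec("torch_dtu") is None:
        return False
    if importlib.util.find_spec("torch_dtu.core") is None:
        return False
    return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None


if __name__ == "__main__":
    c2net_context = prepare()  # stages datasets / pretrained models into the container
    data_root = os.path.join(c2net_context.dataset_path, "MnistDataset_torch")  # assumed dataset name

    use_dtu = is_torch_dtu_available()
    if use_dtu:
        import torch_dtu.core.dtu_model as dm
        device = dm.dtu_device()
    else:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()

    train_loader = DataLoader(
        mnist.MNIST(root=os.path.join(data_root, "train"), train=True,
                    transform=ToTensor(), download=False),
        batch_size=256)

    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        sgd.zero_grad()
        loss = cost(model(x.float()), y.long())
        loss.backward()
        if use_dtu:
            dm.optimizer_step(sgd, barrier=True)  # DTU-aware optimizer step with device sync
        else:
            sgd.step()

    # Only files written under output_path are collected by the platform.
    torch.save({"model": model.state_dict()},
               os.path.join(c2net_context.output_path, "mnist_sketch.pkl"))
    upload_output()  # push results back to the platform (training tasks only)
```

Keeping `is_torch_dtu_available()` as the single switch means the same script runs unchanged on a GCU cluster, a GPU debug image, or plain CPU, which is the design choice the GCU example relies on.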