| @@ -21,18 +21,8 @@ If there are Chinese comments in the code,please add at the beginning: | |||
| │ ├── MNIST/raw/train-images-idx3-ubyte | |||
| │ └── MNIST/raw/train-labels-idx1-ubyte | |||
| │ ├── MNIST/raw/t10k-images-idx3-ubyte | |||
| │ └── MNIST/raw/t10k-labels-idx1-ubyte | |||
| 示例选用的预训练模型文件为:mnist_epoch1_0.86.pkl | |||
| │ └── MNIST/raw/t10k-labels-idx1-ubyte | |||
| 代码会自动放置在/tmp/code目录下。 | |||
| 数据集在界面选择后,会自动放置在/tmp/dataset目录下。 | |||
| 预训练模型文件在界面选择后,会自动放置在/tmp/pretrainmodel目录下。 | |||
| 输出的模型文件也需要放置在/tmp/output目录下,平台会自动下载/tmp/output目录下的文件。 | |||
| 如果选用了多数据集,则应在/tmp/dataset后带上数据集名称,比如/tmp/dataset/MnistDataset_torch/train | |||
| ''' | |||
| import torch | |||
| @@ -45,8 +35,8 @@ from torch.utils.data import DataLoader | |||
| from torchvision.transforms import ToTensor | |||
| import argparse | |||
| import os | |||
| #导入openi包 | |||
| from openi.context import prepare, upload_openi | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| import importlib.util | |||
| @@ -59,25 +49,18 @@ def is_torch_dtu_available(): | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| #The dataset location is placed under /dataset | |||
| parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset') | |||
| parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset') | |||
| parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') | |||
| parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') | |||
| parser.add_argument('--ckpt_url', default="", help='pretrain model path') | |||
| parser.add_argument('--pretrainmodel', default="/tmp/pretrainmodel/mnist_epoch1_0.86.pkl", help='pretrain model path') | |||
| if __name__ == '__main__': | |||
| args, unknown = parser.parse_known_args() | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| #获取数据集路径,预训练模型路径,输出路径 | |||
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| output_path = openi_context.output_path | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| # load DTU envs-xx.sh | |||
| DTU_FLAG = True | |||
| @@ -101,8 +84,8 @@ if __name__ == '__main__': | |||
| args, unknown = parser.parse_known_args() | |||
| #log output | |||
| batch_size = args.batch_size | |||
| train_dataset = mnist.MNIST(root=os.path.join(dataset_path, "train"), train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=os.path.join(dataset_path, "test"), train=False, transform=ToTensor(),download=False) | |||
| train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False) | |||
| train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||
| test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||
| model = Model().to(device) | |||
| @@ -112,8 +95,8 @@ if __name__ == '__main__': | |||
| print('epoch_size is:{}'.format(epochs)) | |||
| # 如果有保存的模型,则加载模型,并在其基础上继续训练 | |||
| if os.path.exists(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")): | |||
| checkpoint = torch.load(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")) | |||
| if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")): | |||
| checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")) | |||
| model.load_state_dict(checkpoint['model']) | |||
| optimizer.load_state_dict(checkpoint['optimizer']) | |||
| start_epoch = checkpoint['epoch'] | |||
| @@ -156,5 +139,5 @@ if __name__ == '__main__': | |||
| print('accuracy: {:.2f}'.format(correct / _sum)) | |||
| #The model output is saved under c2net_context.output_path | |||
| state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1} | |||
| torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(output_path, _epoch+1, correct / _sum)) | |||
| print(os.listdir('{}'.format(output_path))) | |||
| torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum)) | |||
| print(os.listdir('{}'.format(c2net_context.output_path))) | |||
| @@ -23,8 +23,8 @@ from torch.utils.data import DataLoader | |||
| from torchvision.transforms import ToTensor | |||
| import argparse | |||
| import os | |||
| #导入openi包 | |||
| from openi.context import prepare, upload_openi | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| @@ -76,27 +76,28 @@ def test(model, test_loader, test_data): | |||
| if __name__ == '__main__': | |||
| args, unknown = parser.parse_known_args() | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| #获取数据集路径,预训练模型路径,输出路径 | |||
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| output_path = openi_context.output_path | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875" | |||
| MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| #log output | |||
| print('cuda is available:{}'.format(torch.cuda.is_available())) | |||
| device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||
| batch_size = args.batch_size | |||
| epochs = args.epoch_size | |||
| train_dataset = mnist.MNIST(root=os.path.join(dataset_path, "train"), train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=os.path.join(dataset_path, "test"), train=False, transform=ToTensor(),download=False) | |||
| train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "train"), train=True, transform=ToTensor(),download=False) | |||
| test_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch, "test"), train=False, transform=ToTensor(),download=False) | |||
| train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||
| test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||
| #如果有保存的模型,则加载模型,并在其基础上继续训练 | |||
| if os.path.exists(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")): | |||
| checkpoint = torch.load(os.path.join(pretrain_model_path, "mnist_epoch1_0.76.pkl")) | |||
| if os.path.exists(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")): | |||
| checkpoint = torch.load(os.path.join(mnist_example_test2_model_djts_path, "mnist_epoch1_0.76.pkl")) | |||
| model.load_state_dict(checkpoint['model']) | |||
| optimizer.load_state_dict(checkpoint['optimizer']) | |||
| start_epoch = checkpoint['epoch'] | |||
| @@ -108,8 +109,10 @@ if __name__ == '__main__': | |||
| for epoch in range(start_epoch+1, epochs): | |||
| train(model, train_loader, epoch) | |||
| test(model, test_loader, test_dataset) | |||
| # 保存模型 | |||
| # 将模型保存到c2net_context.output_path | |||
| state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch} | |||
| torch.save(state, '{}/mnist_epoch{}.pkl'.format(output_path, epoch)) | |||
| torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch)) | |||
| #回传结果 | |||
| upload_output() | |||
| @@ -13,24 +13,22 @@ | |||
| 使用注意事项: | |||
| 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| 2、用户需要调用openi的python sdk包 | |||
| 2、用户需要调用c2net的python sdk包 | |||
| """ | |||
| import os | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from dataset_distributed import create_dataset_parallel | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.context import ParallelMode | |||
| from mindspore.communication.management import init, get_rank | |||
| import time | |||
| #导入openi包 | |||
| from openi.context import prepare, upload_openi | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| @@ -50,59 +48,22 @@ parser.add_argument('--epoch_size', | |||
| if __name__ == "__main__": | |||
| ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| args, unknown = parser.parse_known_args() | |||
| data_dir = '' | |||
| pretrain_dir = '' | |||
| train_dir = '' | |||
| #回传结果到openi | |||
| upload_openi() | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| device_num = int(os.getenv('RANK_SIZE')) | |||
| #使用单卡时 | |||
| if device_num == 1: | |||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| data_dir = openi_context.dataset_path | |||
| pretrain_dir = openi_context.pretrain_model_path | |||
| train_dir = openi_context.output_path | |||
| #使用数据集的方式 | |||
| ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||
| #使用多卡时 | |||
| if device_num > 1: | |||
| # set device_id and init for multi-card training | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||
| init() | |||
| #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||
| local_rank=int(os.getenv('RANK_ID')) | |||
| if local_rank%8==0: | |||
| ###初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| openi_context = prepare() | |||
| data_dir = openi_context.dataset_path | |||
| pretrain_dir = openi_context.pretrain_model_path | |||
| train_dir = openi_context.output_path | |||
| #Set a cache file to determine whether the data has been copied to obs. | |||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||
| f = open("/cache/download_input.txt", 'w') | |||
| f.close() | |||
| try: | |||
| if os.path.exists("/cache/download_input.txt"): | |||
| print("download_input succeed") | |||
| except Exception as e: | |||
| print("download_input failed") | |||
| while not os.path.exists("/cache/download_input.txt"): | |||
| time.sleep(1) | |||
| ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||
| #使用数据集的方式 | |||
| ds_train = create_dataset(os.path.join(mnistdata_path + "/MNISTData", "train"), cfg.batch_size) | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| @@ -118,12 +79,8 @@ if __name__ == "__main__": | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||
| # In this example, get_rank() is added to distinguish different paths. | |||
| if device_num == 1: | |||
| outputDirectory = train_dir + "/" | |||
| if device_num > 1: | |||
| outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||
| #将模型保存到c2net_context.output_path | |||
| outputDirectory = c2net_context.output_path + "/" | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=outputDirectory, | |||
| config=config_ck) | |||
| @@ -134,5 +91,5 @@ if __name__ == "__main__": | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()]) | |||
| ###上传训练结果到启智平台,注意必须将要输出的模型存储在openi_context.output_path | |||
| upload_openi() | |||
| ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path | |||
| upload_output() | |||
| @@ -0,0 +1,117 @@ | |||
| """ | |||
| 示例选用的数据集是MNISTData.zip | |||
| 数据集结构是: | |||
| MNISTData.zip | |||
| ├── test | |||
| │ ├── t10k-images-idx3-ubyte | |||
| │ └── t10k-labels-idx1-ubyte | |||
| └── train | |||
| ├── train-images-idx3-ubyte | |||
| └── train-labels-idx1-ubyte | |||
| 使用注意事项: | |||
| 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| 2、用户需要调用c2net的python sdk包 | |||
| """ | |||
| import os | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset_distributed import create_dataset_parallel | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.context import ParallelMode | |||
| from mindspore.communication.management import init, get_rank | |||
| import time | |||
| #导入c2net包 | |||
| from c2net.context import prepare, upload_output | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| if __name__ == "__main__": | |||
| ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 | |||
| args, unknown = parser.parse_known_args() | |||
| device_num = int(os.getenv('RANK_SIZE')) | |||
| #使用多卡时 | |||
| # set device_id and init for multi-card training | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||
| init() | |||
| #The data does not need to be copied by every card: only cards whose RANK_ID is a multiple of 8 (local_rank % 8 == 0) call prepare() | |||
| local_rank=int(os.getenv('RANK_ID')) | |||
| if local_rank%8==0: | |||
| #初始化导入数据集和预训练模型到容器内 | |||
| c2net_context = prepare() | |||
| #获取数据集路径 | |||
| mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| #Create a marker file once prepare() has returned, to signal that the data has been copied. | |||
| #During multi-card training, the other cards wait for this file to appear instead of copying the dataset again. | |||
| f = open("/cache/download_input.txt", 'w') | |||
| f.close() | |||
| try: | |||
| if os.path.exists("/cache/download_input.txt"): | |||
| print("download_input succeed") | |||
| except Exception as e: | |||
| print("download_input failed") | |||
| while not os.path.exists("/cache/download_input.txt"): | |||
| time.sleep(1) | |||
| ds_train = create_dataset_parallel(os.path.join(mnistdata_path, "train"), cfg.batch_size) | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy"}) | |||
| else: | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy"}, | |||
| amp_level="O2") | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||
| # In this example, get_rank() is added to distinguish different paths. | |||
| outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/" | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=outputDirectory, | |||
| config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| epoch_size = cfg['epoch_size'] | |||
| if (args.epoch_size): | |||
| epoch_size = args.epoch_size | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, ds_train,callbacks=[time_cb, ckpoint_cb,LossMonitor()]) | |||
| ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path | |||
| upload_output() | |||