| @@ -0,0 +1,149 @@ | |||||
| """ | |||||
| ######################## train lenet example ######################## | |||||
| train lenet and get network model files(.ckpt) | |||||
| """ | |||||
| #!/usr/bin/python | |||||
| #coding=utf-8 | |||||
| import os | |||||
| import argparse | |||||
| import moxing as mox | |||||
| from config import mnist_cfg as cfg | |||||
| from dataset import create_dataset | |||||
| from dataset_distributed import create_dataset_parallel | |||||
| from lenet import LeNet5 | |||||
| import json | |||||
| import mindspore.nn as nn | |||||
| from mindspore import context | |||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
| from mindspore.train import Model | |||||
| from mindspore.nn.metrics import Accuracy | |||||
| from mindspore import load_checkpoint, load_param_into_net | |||||
| from mindspore.context import ParallelMode | |||||
| from mindspore.communication.management import init, get_rank | |||||
| import time | |||||
| ### Copy multiple datasets from obs to training image ### | |||||
| def MultiObsToEnv(multi_data_url, data_dir): | |||||
| #--multi_data_url is json data, need to do json parsing for multi_data_url | |||||
| multi_data_json = json.loads(multi_data_url) | |||||
| for i in range(len(multi_data_json)): | |||||
| path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
| file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
| if not os.path.exists(file_path): | |||||
| os.makedirs(file_path) | |||||
| try: | |||||
| mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
| print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
| #unzip dataset | |||||
| os.system("unzip -d %s %s" % (file_path, path)) | |||||
| except Exception as e: | |||||
| print('moxing download {} to {} failed: '.format( | |||||
| multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
| #Set a cache file to determine whether the data has been copied to obs. | |||||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
| f = open("/cache/download_input.txt", 'w') | |||||
| f.close() | |||||
| try: | |||||
| if os.path.exists("/cache/download_input.txt"): | |||||
| print("download_input succeed") | |||||
| except Exception as e: | |||||
| print("download_input failed") | |||||
| return | |||||
| def DownloadFromQizhi(multi_data_url, data_dir): | |||||
| device_num = int(os.getenv('RANK_SIZE')) | |||||
| if device_num == 1: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
| if device_num > 1: | |||||
| # set device_id and init for multi-card training | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
| init() | |||||
| #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
| local_rank=int(os.getenv('RANK_ID')) | |||||
| if local_rank%8==0: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| #If the cache file does not exist, it means that the copy data has not been completed, | |||||
| #and Wait for 0th card to finish copying data | |||||
| while not os.path.exists("/cache/download_input.txt"): | |||||
| time.sleep(1) | |||||
| return | |||||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
| ### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, | |||||
| ### otherwise an error will be reported. | |||||
| ### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
| ### because they are predefined in the background, you only need to define them in your code. | |||||
| parser.add_argument('--multi_data_url', | |||||
| help='dataset path in obs') | |||||
| parser.add_argument('--ckpt_url', | |||||
| help='pre_train_model path in obs') | |||||
| parser.add_argument( | |||||
| '--device_target', | |||||
| type=str, | |||||
| default="Ascend", | |||||
| choices=['Ascend', 'CPU'], | |||||
| help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||||
| parser.add_argument('--epoch_size', | |||||
| type=int, | |||||
| default=5, | |||||
| help='Training epochs.') | |||||
| if __name__ == "__main__": | |||||
| args, unknown = parser.parse_known_args() | |||||
| data_dir = '/cache/dataset' | |||||
| train_dir = '/cache/output' | |||||
| if not os.path.exists(data_dir): | |||||
| os.makedirs(data_dir) | |||||
| if not os.path.exists(train_dir): | |||||
| os.makedirs(train_dir) | |||||
| ###Initialize and copy data to training image | |||||
| DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
| ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" | |||||
| device_num = int(os.getenv('RANK_SIZE')) | |||||
| if device_num == 1: | |||||
| ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
| if device_num > 1: | |||||
| ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
| if ds_train.get_dataset_size() == 0: | |||||
| raise ValueError( | |||||
| "Please check dataset size > 0 and batch_size <= dataset size") | |||||
| network = LeNet5(cfg.num_classes) | |||||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
| if args.device_target != "Ascend": | |||||
| model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||||
| else: | |||||
| model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
| # In this example, get_rank() is added to distinguish different paths. | |||||
| if device_num == 1: | |||||
| outputDirectory = train_dir + "/" | |||||
| if device_num > 1: | |||||
| outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
| directory=outputDirectory, | |||||
| config=config_ck) | |||||
| print("============== Starting Training ==============") | |||||
| epoch_size = cfg['epoch_size'] | |||||
| if (args.epoch_size): | |||||
| epoch_size = args.epoch_size | |||||
| print('epoch_size is: ', epoch_size) | |||||
| model.train(epoch_size, | |||||
| ds_train, | |||||
| callbacks=[time_cb, ckpoint_cb, | |||||
| LossMonitor()]) | |||||
| @@ -0,0 +1,197 @@ | |||||
| """ | |||||
| ######################## multi-dataset train lenet example ######################## | |||||
| This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset | |||||
| training tutorial train.py. This example cannot be used for a single dataset! | |||||
| """ | |||||
| """ | |||||
| ######################## Instructions for using the training environment ######################## | |||||
| 1、(1)The structure of the dataset uploaded for multi-dataset training in this example | |||||
| MNISTData.zip | |||||
| ├── test | |||||
| └── train | |||||
| checkpoint_lenet-1_1875.zip | |||||
| ├── checkpoint_lenet-1_1875.ckpt | |||||
| (2)The dataset structure in the training image for multiple datasets in this example | |||||
| workroot | |||||
| ├── MNISTData | |||||
| | ├── test | |||||
| | └── train | |||||
| └── checkpoint_lenet-1_1875 | |||||
| ├── checkpoint_lenet-1_1875.ckpt | |||||
| 2、Multi-dataset training requires predefined functions | |||||
| (1)Copy multi-dataset from obs to training image | |||||
| function MultiObsToEnv(multi_data_url, data_dir) | |||||
| (2)Copy the output to obs | |||||
| function EnvToObs(train_dir, obs_train_url) | |||||
| (2)Download the input from Qizhi And Init | |||||
| function DownloadFromQizhi(multi_data_url, data_dir) | |||||
| (2)Upload the output to Qizhi | |||||
| function UploadToQizhi(train_dir, obs_train_url) | |||||
| 3、4 parameters need to be defined | |||||
| --data_url is the first dataset you selected on the Qizhi platform | |||||
| --multi_data_url is the multi-dataset you selected on the Qizhi platform | |||||
| --data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset task, | |||||
| otherwise an error will be reported. | |||||
| There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
| because they are predefined in the background, you only need to define them in your code | |||||
| 4、How the dataset is used | |||||
| Multi-datasets use multi_data_url as input, data_dir + dataset name + file or folder name in the dataset as the | |||||
| calling path of the dataset in the training image. | |||||
| For example, the calling path of the train folder in the MNIST_Data dataset in this example is | |||||
| data_dir + "/MNIST_Data" +"/train" | |||||
| For details, please refer to the following sample code. | |||||
| """ | |||||
| import os | |||||
| import argparse | |||||
| import moxing as mox | |||||
| from config import mnist_cfg as cfg | |||||
| from dataset import create_dataset | |||||
| from dataset_distributed import create_dataset_parallel | |||||
| from lenet import LeNet5 | |||||
| import json | |||||
| import mindspore.nn as nn | |||||
| from mindspore import context | |||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
| from mindspore.train import Model | |||||
| from mindspore.nn.metrics import Accuracy | |||||
| from mindspore import load_checkpoint, load_param_into_net | |||||
| from mindspore.context import ParallelMode | |||||
| from mindspore.communication.management import init, get_rank | |||||
| import time | |||||
| ### Copy multiple datasets from obs to training image ### | |||||
| def MultiObsToEnv(multi_data_url, data_dir): | |||||
| #--multi_data_url is json data, need to do json parsing for multi_data_url | |||||
| multi_data_json = json.loads(multi_data_url) | |||||
| for i in range(len(multi_data_json)): | |||||
| path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
| file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
| if not os.path.exists(file_path): | |||||
| os.makedirs(file_path) | |||||
| try: | |||||
| mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
| print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
| #unzip dataset | |||||
| os.system("unzip -d %s %s" % (file_path, path)) | |||||
| except Exception as e: | |||||
| print('moxing download {} to {} failed: '.format( | |||||
| multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
| #Set a cache file to determine whether the data has been copied to obs. | |||||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
| f = open("/cache/download_input.txt", 'w') | |||||
| f.close() | |||||
| try: | |||||
| if os.path.exists("/cache/download_input.txt"): | |||||
| print("download_input succeed") | |||||
| except Exception as e: | |||||
| print("download_input failed") | |||||
| return | |||||
| def DownloadFromQizhi(multi_data_url, data_dir): | |||||
| device_num = int(os.getenv('RANK_SIZE')) | |||||
| if device_num == 1: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
| if device_num > 1: | |||||
| # set device_id and init for multi-card training | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
| init() | |||||
| #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
| local_rank=int(os.getenv('RANK_ID')) | |||||
| if local_rank%8==0: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| #If the cache file does not exist, it means that the copy data has not been completed, | |||||
| #and Wait for 0th card to finish copying data | |||||
| while not os.path.exists("/cache/download_input.txt"): | |||||
| time.sleep(1) | |||||
| return | |||||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
| ### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, | |||||
| ### otherwise an error will be reported. | |||||
| ### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
| ### because they are predefined in the background, you only need to define them in your code. | |||||
| parser.add_argument('--multi_data_url', | |||||
| help='dataset path in obs') | |||||
| parser.add_argument('--ckpt_url', | |||||
| help='pre_train_model path in obs') | |||||
| parser.add_argument( | |||||
| '--device_target', | |||||
| type=str, | |||||
| default="Ascend", | |||||
| choices=['Ascend', 'CPU'], | |||||
| help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||||
| parser.add_argument('--epoch_size', | |||||
| type=int, | |||||
| default=5, | |||||
| help='Training epochs.') | |||||
| if __name__ == "__main__": | |||||
| args, unknown = parser.parse_known_args() | |||||
| data_dir = '/cache/dataset' | |||||
| train_dir = '/cache/output' | |||||
| if not os.path.exists(data_dir): | |||||
| os.makedirs(data_dir) | |||||
| if not os.path.exists(train_dir): | |||||
| os.makedirs(train_dir) | |||||
| ###Initialize and copy data to training image | |||||
| DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
| ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" | |||||
| device_num = int(os.getenv('RANK_SIZE')) | |||||
| if device_num == 1: | |||||
| ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
| if device_num > 1: | |||||
| ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
| if ds_train.get_dataset_size() == 0: | |||||
| raise ValueError( | |||||
| "Please check dataset size > 0 and batch_size <= dataset size") | |||||
| network = LeNet5(cfg.num_classes) | |||||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
| ###The dataset path is used here:data_dir + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" | |||||
| load_param_into_net(network, load_checkpoint(os.path.join(data_dir + "/checkpoint_lenet-1_1875", | |||||
| "checkpoint_lenet-1_1875.ckpt"))) | |||||
| if args.device_target != "Ascend": | |||||
| model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||||
| else: | |||||
| model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
| # In this example, get_rank() is added to distinguish different paths. | |||||
| if device_num == 1: | |||||
| outputDirectory = train_dir + "/" | |||||
| if device_num > 1: | |||||
| outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
| directory=outputDirectory, | |||||
| config=config_ck) | |||||
| print("============== Starting Training ==============") | |||||
| epoch_size = cfg['epoch_size'] | |||||
| if (args.epoch_size): | |||||
| epoch_size = args.epoch_size | |||||
| print('epoch_size is: ', epoch_size) | |||||
| model.train(epoch_size, | |||||
| ds_train, | |||||
| callbacks=[time_cb, ckpoint_cb, | |||||
| LossMonitor()]) | |||||
| @@ -0,0 +1,233 @@ | |||||
| """ | |||||
| ######################## single-dataset train lenet example ######################## | |||||
| This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training | |||||
| tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! | |||||
| ######################## Instructions for using the training environment ######################## | |||||
| The image of the debugging environment and the image of the training environment are two different images, | |||||
| and the working local directories are different. In the training task, you need to pay attention to the following points. | |||||
| 1、(1)The structure of the dataset uploaded for single dataset training in this example | |||||
| MNISTData.zip | |||||
| ├── test | |||||
| └── train | |||||
| 2、Single dataset training requires predefined functions | |||||
| (1)Copy single dataset from obs to training image | |||||
| function ObsToEnv(obs_data_url, data_dir) | |||||
| (2)Copy the output to obs | |||||
| function EnvToObs(train_dir, obs_train_url) | |||||
| (3)Download the input from Qizhi And Init | |||||
| function DownloadFromQizhi(obs_data_url, data_dir) | |||||
| (4)Upload the output to Qizhi | |||||
| function UploadToQizhi(train_dir, obs_train_url) | |||||
| (5)Copy ckpt file from obs to training image. | |||||
| function ObsUrlToEnv(obs_ckpt_url, ckpt_url) | |||||
| 3、3 parameters need to be defined | |||||
| --data_url is the dataset you selected on the Qizhi platform | |||||
| --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, | |||||
| otherwise an error will be reported. | |||||
| There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
| because they are predefined in the background, you only need to define them in your code. | |||||
| 4、How the dataset is used | |||||
| A single dataset uses data_url as the input, and data_dir (ie:'/cache/data') as the calling method | |||||
| of the dataset in the image. | |||||
| For details, please refer to the following sample code. | |||||
| 5、How to load the checkpoint file | |||||
| The checkpoint file is loaded by the ckpt_url parameter | |||||
| """ | |||||
| import os | |||||
| import argparse | |||||
| import moxing as mox | |||||
| from config import mnist_cfg as cfg | |||||
| from dataset import create_dataset | |||||
| from dataset_distributed import create_dataset_parallel | |||||
| from lenet import LeNet5 | |||||
| import mindspore.nn as nn | |||||
| from mindspore import context | |||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
| from mindspore import load_checkpoint, load_param_into_net | |||||
| from mindspore.train import Model | |||||
| from mindspore.nn.metrics import Accuracy | |||||
| from mindspore.context import ParallelMode | |||||
| from mindspore.communication.management import init, get_rank | |||||
| import mindspore.ops as ops | |||||
| import time | |||||
| import json | |||||
| #from upload import UploadOutput | |||||
| ### Copy single dataset from obs to training image### | |||||
| def ObsToEnv(obs_data_url, data_dir): | |||||
| try: | |||||
| mox.file.copy_parallel(obs_data_url, data_dir) | |||||
| print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||||
| except Exception as e: | |||||
| print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||||
| #Set a cache file to determine whether the data has been copied to obs. | |||||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
| f = open("/cache/download_input.txt", 'w') | |||||
| f.close() | |||||
| try: | |||||
| if os.path.exists("/cache/download_input.txt"): | |||||
| print("download_input succeed") | |||||
| except Exception as e: | |||||
| print("download_input failed") | |||||
| return | |||||
| ### Copy ckpt file from obs to training image### | |||||
| ### To operate on folders, use mox.file.copy_parallel. If copying a file. | |||||
| ### Please use mox.file.copy to operate the file, this operation is to operate the file | |||||
| def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||||
| try: | |||||
| mox.file.copy(obs_ckpt_url, ckpt_url) | |||||
| print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) | |||||
| except Exception as e: | |||||
| print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) | |||||
| return | |||||
| ### Copy multiple datasets from obs to training image ### | |||||
| def MultiObsToEnv(multi_data_url, data_dir): | |||||
| #--multi_data_url is json data, need to do json parsing for multi_data_url | |||||
| multi_data_json = json.loads(multi_data_url) | |||||
| for i in range(len(multi_data_json)): | |||||
| path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
| file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
| if not os.path.exists(file_path): | |||||
| os.makedirs(file_path) | |||||
| try: | |||||
| mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
| print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
| #unzip dataset | |||||
| os.system("unzip -d %s %s" % (file_path, path)) | |||||
| except Exception as e: | |||||
| print('moxing download {} to {} failed: '.format( | |||||
| multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
| #Set a cache file to determine whether the data has been copied to obs. | |||||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
| f = open("/cache/download_input.txt", 'w') | |||||
| f.close() | |||||
| try: | |||||
| if os.path.exists("/cache/download_input.txt"): | |||||
| print("download_input succeed") | |||||
| except Exception as e: | |||||
| print("download_input failed") | |||||
| return | |||||
| def DownloadFromQizhi(multi_data_url, data_dir): | |||||
| device_num = int(os.getenv('RANK_SIZE')) | |||||
| if device_num == 1: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
| if device_num > 1: | |||||
| # set device_id and init for multi-card training | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
| init() | |||||
| #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
| local_rank=int(os.getenv('RANK_ID')) | |||||
| if local_rank%8==0: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| #If the cache file does not exist, it means that the copy data has not been completed, | |||||
| #and Wait for 0th card to finish copying data | |||||
| while not os.path.exists("/cache/download_input.txt"): | |||||
| time.sleep(1) | |||||
| return | |||||
| ### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, | |||||
| ### otherwise an error will be reported. | |||||
| ###There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
| ###because they are predefined in the background, you only need to define them in your code. | |||||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
| parser.add_argument('--multi_data_url', | |||||
| help='dataset path in obs') | |||||
| parser.add_argument('--ckpt_url', | |||||
| help='pre_train_model path in obs') | |||||
| parser.add_argument( | |||||
| '--device_target', | |||||
| type=str, | |||||
| default="Ascend", | |||||
| choices=['Ascend', 'CPU'], | |||||
| help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||||
| parser.add_argument('--epoch_size', | |||||
| type=int, | |||||
| default=5, | |||||
| help='Training epochs.') | |||||
| if __name__ == "__main__": | |||||
| args, unknown = parser.parse_known_args() | |||||
| data_dir = '/cache/dataset' | |||||
| train_dir = '/cache/output' | |||||
| ckpt_url = '/cache/checkpoint.ckpt' | |||||
| if not os.path.exists(data_dir): | |||||
| os.makedirs(data_dir) | |||||
| if not os.path.exists(train_dir): | |||||
| os.makedirs(train_dir) | |||||
| ###Initialize and copy data to training image | |||||
| ###Copy ckpt file from obs to training image | |||||
| ObsUrlToEnv(args.ckpt_url, ckpt_url) | |||||
| ###Copy data from obs to training image | |||||
| DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
| ###The dataset path is used here:data_dir +"/train" | |||||
| device_num = int(os.getenv('RANK_SIZE')) | |||||
| if device_num == 1: | |||||
| ds_train = create_dataset(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) | |||||
| if device_num > 1: | |||||
| ds_train = create_dataset_parallel(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) | |||||
| if ds_train.get_dataset_size() == 0: | |||||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||||
| network = LeNet5(cfg.num_classes) | |||||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
| ###The ckpt path is used here:ckpt_url | |||||
| print('-------ckpt_url is:', args.ckpt_url) | |||||
| load_param_into_net(network, load_checkpoint(ckpt_url)) | |||||
| if args.device_target != "Ascend": | |||||
| model = Model(network, | |||||
| net_loss, | |||||
| net_opt, | |||||
| metrics={"accuracy": Accuracy()}) | |||||
| else: | |||||
| model = Model(network, | |||||
| net_loss, | |||||
| net_opt, | |||||
| metrics={"accuracy": Accuracy()}, | |||||
| amp_level="O2") | |||||
| config_ck = CheckpointConfig( | |||||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
| # In this example, get_rank() is added to distinguish different paths. | |||||
| if device_num == 1: | |||||
| outputDirectory = train_dir + "/" | |||||
| if device_num > 1: | |||||
| outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
| directory=outputDirectory, | |||||
| config=config_ck) | |||||
| print("============== Starting Training ==============") | |||||
| epoch_size = cfg['epoch_size'] | |||||
| if (args.epoch_size): | |||||
| epoch_size = args.epoch_size | |||||
| print('epoch_size is: ', epoch_size) | |||||
| model.train(epoch_size, | |||||
| ds_train, | |||||
| callbacks=[time_cb, ckpoint_cb, | |||||
| LossMonitor()]) | |||||
| @@ -0,0 +1,114 @@ | |||||
| """ | |||||
| ######################## train lenet example ######################## | |||||
| train lenet and get network model files(.ckpt) | |||||
| """ | |||||
| #!/usr/bin/python | |||||
| #coding=utf-8 | |||||
| import os | |||||
| import argparse | |||||
| import moxing as mox | |||||
| from config import mnist_cfg as cfg | |||||
| from dataset import create_dataset | |||||
| from dataset_distributed import create_dataset_parallel | |||||
| from lenet import LeNet5 | |||||
| import json | |||||
| import mindspore.nn as nn | |||||
| from mindspore import context | |||||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
| from mindspore.train import Model | |||||
| from mindspore.nn.metrics import Accuracy | |||||
| from mindspore import load_checkpoint, load_param_into_net | |||||
| from mindspore.context import ParallelMode | |||||
| from mindspore.communication.management import init, get_rank | |||||
| import time | |||||
| ### Copy multiple datasets from obs to training image ### | |||||
| def MultiObsToEnv(multi_data_url, data_dir): | |||||
| #--multi_data_url is json data, need to do json parsing for multi_data_url | |||||
| multi_data_json = json.loads(multi_data_url) | |||||
| for i in range(len(multi_data_json)): | |||||
| path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
| file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
| if not os.path.exists(file_path): | |||||
| os.makedirs(file_path) | |||||
| try: | |||||
| mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
| print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
| #unzip dataset | |||||
| os.system("unzip -d %s %s" % (file_path, path)) | |||||
| except Exception as e: | |||||
| print('moxing download {} to {} failed: '.format( | |||||
| multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
| #Set a cache file to determine whether the data has been copied to obs. | |||||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
| f = open("/cache/download_input.txt", 'w') | |||||
| f.close() | |||||
| try: | |||||
| if os.path.exists("/cache/download_input.txt"): | |||||
| print("download_input succeed") | |||||
| except Exception as e: | |||||
| print("download_input failed") | |||||
| return | |||||
| def DownloadFromQizhi(multi_data_url, data_dir): | |||||
| device_num = int(os.getenv('RANK_SIZE')) | |||||
| if device_num == 1: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
| if device_num > 1: | |||||
| # set device_id and init for multi-card training | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
| init() | |||||
| #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
| local_rank=int(os.getenv('RANK_ID')) | |||||
| if local_rank%8==0: | |||||
| MultiObsToEnv(multi_data_url,data_dir) | |||||
| #If the cache file does not exist, it means that the copy data has not been completed, | |||||
| #and Wait for 0th card to finish copying data | |||||
| while not os.path.exists("/cache/download_input.txt"): | |||||
| time.sleep(1) | |||||
| return | |||||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
| ### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, | |||||
| ### otherwise an error will be reported. | |||||
| ### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
| ### because they are predefined in the background, you only need to define them in your code. | |||||
| parser.add_argument('--multi_data_url', | |||||
| help='dataset path in obs') | |||||
| parser.add_argument('--ckpt_url', | |||||
| help='pre_train_model path in obs') | |||||
| parser.add_argument( | |||||
| '--device_target', | |||||
| type=str, | |||||
| default="Ascend", | |||||
| choices=['Ascend', 'CPU'], | |||||
| help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||||
| parser.add_argument('--epoch_size', | |||||
| type=int, | |||||
| default=5, | |||||
| help='Training epochs.') | |||||
| if __name__ == "__main__": | |||||
| args, unknown = parser.parse_known_args() | |||||
| data_dir = '/cache/dataset' | |||||
| train_dir = '/cache/output' | |||||
| if not os.path.exists(data_dir): | |||||
| os.makedirs(data_dir) | |||||
| if not os.path.exists(train_dir): | |||||
| os.makedirs(train_dir) | |||||
| ###Initialize and copy data to training image | |||||
| DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
| print("--------start ls:") | |||||
| os.system("cd /cache/dataset; ls -al") | |||||
| print("--------end ls-----------") | |||||
| @@ -0,0 +1,33 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """ | |||||
| network config setting, will be used in train.py | |||||
| """ | |||||
| from easydict import EasyDict as edict | |||||
| mnist_cfg = edict({ | |||||
| 'num_classes': 10, | |||||
| 'lr': 0.01, | |||||
| 'momentum': 0.9, | |||||
| 'epoch_size': 10, | |||||
| 'batch_size': 32, | |||||
| 'buffer_size': 1000, | |||||
| 'image_height': 32, | |||||
| 'image_width': 32, | |||||
| 'save_checkpoint_steps': 1875, | |||||
| 'keep_checkpoint_max': 150, | |||||
| 'air_name': "lenet", | |||||
| }) | |||||
| @@ -0,0 +1,60 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """ | |||||
| Produce the dataset | |||||
| """ | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.vision.c_transforms as CV | |||||
| import mindspore.dataset.transforms.c_transforms as C | |||||
| from mindspore.dataset.vision import Inter | |||||
| from mindspore.common import dtype as mstype | |||||
| def create_dataset(data_path, batch_size=32, repeat_size=1, | |||||
| num_parallel_workers=1): | |||||
| """ | |||||
| create dataset for train or test | |||||
| """ | |||||
| # define dataset | |||||
| mnist_ds = ds.MnistDataset(data_path) | |||||
| resize_height, resize_width = 32, 32 | |||||
| rescale = 1.0 / 255.0 | |||||
| shift = 0.0 | |||||
| rescale_nml = 1 / 0.3081 | |||||
| shift_nml = -1 * 0.1307 / 0.3081 | |||||
| # define map operations | |||||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
| rescale_op = CV.Rescale(rescale, shift) | |||||
| hwc2chw_op = CV.HWC2CHW() | |||||
| type_cast_op = C.TypeCast(mstype.int32) | |||||
| # apply map operations on images | |||||
| mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| # apply DatasetOps | |||||
| buffer_size = 10000 | |||||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||||
| return mnist_ds | |||||
| @@ -0,0 +1,55 @@ | |||||
| """ | |||||
| Produce the dataset: | |||||
| 与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取: | |||||
| get_rank:获取当前设备在集群中的ID。 | |||||
| get_group_size:获取集群数量。 | |||||
| """ | |||||
| import mindspore.dataset as ds | |||||
| import mindspore.dataset.vision.c_transforms as CV | |||||
| import mindspore.dataset.transforms.c_transforms as C | |||||
| from mindspore.dataset.vision import Inter | |||||
| from mindspore.common import dtype as mstype | |||||
| from mindspore.communication.management import init, get_rank, get_group_size | |||||
| def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, | |||||
| num_parallel_workers=1, shard_id=0, num_shards=8): | |||||
| """ | |||||
| create dataset for train or test | |||||
| """ | |||||
| resize_height, resize_width = 32, 32 | |||||
| rescale = 1.0 / 255.0 | |||||
| shift = 0.0 | |||||
| rescale_nml = 1 / 0.3081 | |||||
| shift_nml = -1 * 0.1307 / 0.3081 | |||||
| # get shard_id and num_shards.Get the ID of the current device in the cluster And Get the number of clusters. | |||||
| shard_id = get_rank() | |||||
| num_shards = get_group_size() | |||||
| # define dataset | |||||
| mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) | |||||
| # define map operations | |||||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
| rescale_op = CV.Rescale(rescale, shift) | |||||
| hwc2chw_op = CV.HWC2CHW() | |||||
| type_cast_op = C.TypeCast(mstype.int32) | |||||
| # apply map operations on images | |||||
| mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
| # apply DatasetOps | |||||
| buffer_size = 10000 | |||||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||||
| return mnist_ds | |||||
| @@ -0,0 +1,60 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """LeNet.""" | |||||
| import mindspore.nn as nn | |||||
| from mindspore.common.initializer import Normal | |||||
| class LeNet5(nn.Cell): | |||||
| """ | |||||
| Lenet network | |||||
| Args: | |||||
| num_class (int): Number of classes. Default: 10. | |||||
| num_channel (int): Number of channels. Default: 1. | |||||
| Returns: | |||||
| Tensor, output tensor | |||||
| Examples: | |||||
| >>> LeNet(num_class=10) | |||||
| """ | |||||
| def __init__(self, num_class=10, num_channel=1, include_top=True): | |||||
| super(LeNet5, self).__init__() | |||||
| self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') | |||||
| self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') | |||||
| self.relu = nn.ReLU() | |||||
| self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) | |||||
| self.include_top = include_top | |||||
| if self.include_top: | |||||
| self.flatten = nn.Flatten() | |||||
| self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) | |||||
| self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) | |||||
| self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) | |||||
| def construct(self, x): | |||||
| x = self.conv1(x) | |||||
| x = self.relu(x) | |||||
| x = self.max_pool2d(x) | |||||
| x = self.conv2(x) | |||||
| x = self.relu(x) | |||||
| x = self.max_pool2d(x) | |||||
| if not self.include_top: | |||||
| return x | |||||
| x = self.flatten(x) | |||||
| x = self.relu(self.fc1(x)) | |||||
| x = self.relu(self.fc2(x)) | |||||
| x = self.fc3(x) | |||||
| return x | |||||