| @@ -1,33 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """ | |||
| Network configuration settings for LeNet on MNIST; used by train.py and the other training scripts. | |||
| """ | |||
| from easydict import EasyDict as edict | |||
| mnist_cfg = edict({ | |||
| 'num_classes': 10, | |||
| 'lr': 0.01, | |||
| 'momentum': 0.9, | |||
| 'epoch_size': 10, | |||
| 'batch_size': 32, | |||
| 'buffer_size': 1000, | |||
| 'image_height': 32, | |||
| 'image_width': 32, | |||
| 'save_checkpoint_steps': 1875, | |||
| 'keep_checkpoint_max': 150, | |||
| 'air_name': "lenet", | |||
| }) | |||
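| A brief usage sketch (an illustrative addition, not part of the original config.py): since mnist_cfg is an EasyDict, | |||
| its values can be read either as attributes or as dictionary keys, which is how the training scripts below consume it. | |||
| from config import mnist_cfg as cfg | |||
| # Attribute-style access, as used in the training scripts. | |||
| print(cfg.lr, cfg.momentum, cfg.batch_size) | |||
| # Dictionary-style access also works, e.g. cfg['epoch_size']. | |||
| print(cfg['epoch_size'], cfg.air_name) | |||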
| @@ -1,60 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """ | |||
| Produce the dataset | |||
| """ | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as CV | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore.dataset.vision import Inter | |||
| from mindspore.common import dtype as mstype | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, | |||
| num_parallel_workers=1): | |||
| """ | |||
| Create the MNIST dataset for training or testing. | |||
| """ | |||
| # define dataset | |||
| mnist_ds = ds.MnistDataset(data_path) | |||
| resize_height, resize_width = 32, 32 | |||
| rescale = 1.0 / 255.0 | |||
| shift = 0.0 | |||
| rescale_nml = 1 / 0.3081 | |||
| shift_nml = -1 * 0.1307 / 0.3081 | |||
| # define map operations | |||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||
| rescale_op = CV.Rescale(rescale, shift) | |||
| hwc2chw_op = CV.HWC2CHW() | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| # apply map operations on images | |||
| mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| # apply DatasetOps | |||
| buffer_size = 10000 | |||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||
| return mnist_ds | |||
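| A minimal usage sketch for create_dataset (an illustrative addition; the ./MNIST_Data/train path is an assumption | |||
| matching the dataset layout described in the training scripts below): | |||
| from dataset import create_dataset | |||
| ds_train = create_dataset("./MNIST_Data/train", batch_size=32) | |||
| print("number of batches:", ds_train.get_dataset_size()) | |||
| # Each element is a dict holding a float32 "image" tensor of shape (32, 1, 32, 32) and an int32 "label" of shape (32,). | |||
| batch = next(ds_train.create_dict_iterator()) | |||
| print(batch["image"].shape, batch["label"].shape) | |||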
| @@ -1,55 +0,0 @@ | |||
| """ | |||
| Produce the dataset (distributed version): | |||
| Unlike the single-device version, the dataset interface must be given the num_shards and shard_id parameters, | |||
| which correspond to the number of devices and the logical index of the current device. They are best obtained through the HCCL interfaces: | |||
| get_rank: obtains the ID of the current device in the cluster. | |||
| get_group_size: obtains the number of devices in the cluster. | |||
| """ | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.vision.c_transforms as CV | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore.dataset.vision import Inter | |||
| from mindspore.common import dtype as mstype | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, | |||
| num_parallel_workers=1, shard_id=0, num_shards=8): | |||
| """ | |||
| Create the sharded MNIST dataset for distributed training or testing. | |||
| """ | |||
| resize_height, resize_width = 32, 32 | |||
| rescale = 1.0 / 255.0 | |||
| shift = 0.0 | |||
| rescale_nml = 1 / 0.3081 | |||
| shift_nml = -1 * 0.1307 / 0.3081 | |||
| # Get shard_id and num_shards: the ID of the current device in the cluster and the total number of devices. | |||
| # Note that these calls override the shard_id and num_shards arguments passed in above. | |||
| shard_id = get_rank() | |||
| num_shards = get_group_size() | |||
| # define dataset | |||
| mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) | |||
| # define map operations | |||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||
| rescale_op = CV.Rescale(rescale, shift) | |||
| hwc2chw_op = CV.HWC2CHW() | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| # apply map operations on images | |||
| mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||
| # apply DatasetOps | |||
| buffer_size = 10000 | |||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||
| return mnist_ds | |||
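| A hedged launch sketch for create_dataset_parallel: because it calls get_rank() and get_group_size(), init() must run | |||
| first and the script has to be started by a distributed launcher (e.g. a multi-card Ascend job); the path is an | |||
| assumption following the workroot convention used elsewhere in this repository. | |||
| from mindspore import context | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| from dataset_distributed import create_dataset_parallel | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") | |||
| init()  # fails outside a distributed environment (e.g. without an HCCL rank table) | |||
| ds_train = create_dataset_parallel("/home/work/user-job-dir/data/train", batch_size=32) | |||
| print("rank {} of {} sees {} batches".format(get_rank(), get_group_size(), ds_train.get_dataset_size())) | |||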
| @@ -1,202 +0,0 @@ | |||
| """ | |||
| ######################## single-dataset inference lenet example ######################## | |||
| This example is a single-dataset inference tutorial. | |||
| ######################## Instructions for using the inference environment ######################## | |||
| The image of the debugging environment and the image of the inference environment are two different images, | |||
| and the working local directories are different. In the inference task, you need to pay attention to the following points. | |||
| 1. (1) The structure of the dataset uploaded for single-dataset inference in this example | |||
| MNISTData.zip | |||
| ├── test | |||
| │ ├── t10k-images-idx3-ubyte | |||
| │ └── t10k-labels-idx1-ubyte | |||
| └── train | |||
| ├── train-images-idx3-ubyte | |||
| └── train-labels-idx1-ubyte | |||
| (2) The directory layout of the dataset inside the inference image in this example | |||
| workroot | |||
| ├── data | |||
| | ├── test | |||
| | └── train | |||
| 2. The inference task requires the following predefined functions. | |||
| (1) Determine whether the task runs in the inference environment or the debugging environment. | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' #The inference task uses this parameter to represent the local path of the inference image | |||
| elif environment == 'debug': | |||
| workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| (2)Copy single dataset from obs to inference image. | |||
| def ObsToEnv(obs_data_url, data_dir): | |||
| try: | |||
| mox.file.copy_parallel(obs_data_url, data_dir) | |||
| print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||
| return | |||
| (3)Copy ckpt file from obs to inference image. | |||
| def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||
| try: | |||
| mox.file.copy(obs_ckpt_url, ckpt_url) | |||
| print("Successfully Download {} to {}".format(obs_ckpt_url, | |||
| ckpt_url)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format( | |||
| obs_ckpt_url, ckpt_url) + str(e)) | |||
| return | |||
| (4)Copy the output result to obs. | |||
| def EnvToObs(train_dir, obs_train_url): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||
| return | |||
| 3. Four parameters need to be defined. | |||
| --data_url is the dataset you selected on the Qizhi platform. | |||
| --ckpt_url is the weight file you selected on the Qizhi platform. | |||
| --data_url, --ckpt_url, --result_url and --device_target must be defined first in a single-dataset inference task, | |||
| otherwise an error will be reported. | |||
| There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| because they are predefined in the background, you only need to define them in your code. | |||
| 4. How the dataset is used | |||
| The inference task takes data_url as its input and accesses the dataset inside the image through data_dir | |||
| (i.e. workroot + '/data'). | |||
| For details, please refer to the following sample code. | |||
| """ | |||
| import os | |||
| import argparse | |||
| import moxing as mox | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore import Tensor | |||
| import numpy as np | |||
| from glob import glob | |||
| from dataset import create_dataset | |||
| from config import mnist_cfg as cfg | |||
| from lenet import LeNet5 | |||
| ### Determine whether the task runs in the inference environment or the debugging environment ### | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' | |||
| elif environment == 'debug': | |||
| workroot = '/home/work' | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| ### Copy single dataset from obs to inference image ### | |||
| def ObsToEnv(obs_data_url, data_dir): | |||
| try: | |||
| mox.file.copy_parallel(obs_data_url, data_dir) | |||
| print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||
| return | |||
| ### Copy ckpt file from obs to inference image ### | |||
| ### Use mox.file.copy_parallel for folders and mox.file.copy for single files; | |||
| ### a single checkpoint file is copied here, so mox.file.copy is used. | |||
| def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||
| try: | |||
| mox.file.copy(obs_ckpt_url, ckpt_url) | |||
| print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) | |||
| return | |||
| ### Copy the output result to obs### | |||
| def EnvToObs(train_dir, obs_train_url): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||
| return | |||
| ### --data_url, --ckpt_url, --result_url and --device_target must be defined first in an inference task, | |||
| ### otherwise an error will be reported. | |||
| ### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| ### because they are predefined in the background, you only need to define them in your code. | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument('--data_url', | |||
| type=str, | |||
| default= WorkEnvironment('train') + '/data/', | |||
| help='path where the dataset is saved') | |||
| parser.add_argument('--ckpt_url', | |||
| help='model to save/load', | |||
| default= WorkEnvironment('train') + '/checkpoint.ckpt') | |||
| parser.add_argument('--result_url', | |||
| help='result folder to save/load', | |||
| default= WorkEnvironment('train') + '/result/') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| if __name__ == "__main__": | |||
| args, unknown = parser.parse_known_args() | |||
| ### select the non-debug work environment (the inference job also uses the 'train' workroot) | |||
| environment = 'train' | |||
| workroot = WorkEnvironment(environment) | |||
| ###Initialize the data and result directories in the inference image### | |||
| data_dir = workroot + '/data' | |||
| result_dir = workroot + '/result' | |||
| ckpt_url = workroot + '/checkpoint.ckpt' | |||
| if not os.path.exists(data_dir): | |||
| os.makedirs(data_dir) | |||
| if not os.path.exists(result_dir): | |||
| os.makedirs(result_dir) | |||
| ###Copy dataset from obs to inference image | |||
| obs_data_url = args.data_url | |||
| ObsToEnv(obs_data_url, data_dir) | |||
| ###Copy ckpt file from obs to inference image | |||
| obs_ckpt_url = args.ckpt_url | |||
| ObsUrlToEnv(obs_ckpt_url, ckpt_url) | |||
| ###Set output path result_url | |||
| obs_result_url = args.result_url | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| repeat_size = cfg.epoch_size | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||
| print("============== Starting Testing ==============") | |||
| param_dict = load_checkpoint(ckpt_url) | |||
| load_param_into_net(network, param_dict) | |||
| ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator() | |||
| data = next(ds_test) | |||
| images = data["image"].asnumpy() | |||
| labels = data["label"].asnumpy() | |||
| print('Tensor:', Tensor(data['image'])) | |||
| output = model.predict(Tensor(data['image'])) | |||
| predicted = np.argmax(output.asnumpy(), axis=1) | |||
| print('predicted:', predicted) | |||
| print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') | |||
| filename = 'result.txt' | |||
| file_path = os.path.join(result_dir, filename) | |||
| with open(file_path, 'a+') as file: | |||
| file.write(" {}: {} \n".format("Predicted", predicted[0])) | |||
| ###Copy result data from the local running environment back to obs, | |||
| ###and download it in the inference task corresponding to the Qizhi platform | |||
| EnvToObs(result_dir, obs_result_url) | |||
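| The script above checks only a single sample. A hedged follow-up sketch, reusing the model, cfg and data_dir defined | |||
| above, would report accuracy over the whole test split instead: | |||
| ds_eval = create_dataset(os.path.join(data_dir, "test"), batch_size=cfg.batch_size) | |||
| acc = model.eval(ds_eval, dataset_sink_mode=False) | |||
| print("accuracy over the full test split:", acc) | |||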
| @@ -1,60 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """LeNet.""" | |||
| import mindspore.nn as nn | |||
| from mindspore.common.initializer import Normal | |||
| class LeNet5(nn.Cell): | |||
| """ | |||
| Lenet network | |||
| Args: | |||
| num_class (int): Number of classes. Default: 10. | |||
| num_channel (int): Number of channels. Default: 1. | |||
| Returns: | |||
| Tensor, output tensor | |||
| Examples: | |||
| >>> LeNet5(num_class=10) | |||
| """ | |||
| def __init__(self, num_class=10, num_channel=1, include_top=True): | |||
| super(LeNet5, self).__init__() | |||
| self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') | |||
| self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') | |||
| self.relu = nn.ReLU() | |||
| self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) | |||
| self.include_top = include_top | |||
| if self.include_top: | |||
| self.flatten = nn.Flatten() | |||
| self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) | |||
| self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) | |||
| self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) | |||
| def construct(self, x): | |||
| x = self.conv1(x) | |||
| x = self.relu(x) | |||
| x = self.max_pool2d(x) | |||
| x = self.conv2(x) | |||
| x = self.relu(x) | |||
| x = self.max_pool2d(x) | |||
| if not self.include_top: | |||
| return x | |||
| x = self.flatten(x) | |||
| x = self.relu(self.fc1(x)) | |||
| x = self.relu(self.fc2(x)) | |||
| x = self.fc3(x) | |||
| return x | |||
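| A quick sanity-check sketch for the network above (an illustrative addition); the random input merely mimics the | |||
| 1x32x32 single-channel images produced by dataset.py: | |||
| import numpy as np | |||
| from mindspore import Tensor | |||
| net = LeNet5(num_class=10) | |||
| dummy = Tensor(np.random.rand(1, 1, 32, 32).astype(np.float32)) | |||
| logits = net(dummy) | |||
| print(logits.shape)  # expected: (1, 10) | |||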
| @@ -1,193 +0,0 @@ | |||
| """ | |||
| ######################## single-dataset train lenet example ######################## | |||
| This example is a single-dataset training tutorial. If you are using multiple datasets, please refer to the | |||
| multi-dataset training tutorial train_for_multidataset.py. This example cannot be used with multiple datasets! | |||
| ######################## Instructions for using the training environment ######################## | |||
| The image of the debugging environment and the image of the training environment are two different images, | |||
| and the working local directories are different. In the training task, you need to pay attention to the following points. | |||
| 1. (1) The structure of the dataset uploaded for single-dataset training in this example | |||
| MNISTData.zip | |||
| ├── test | |||
| │ ├── t10k-images-idx3-ubyte | |||
| │ └── t10k-labels-idx1-ubyte | |||
| └── train | |||
| ├── train-images-idx3-ubyte | |||
| └── train-labels-idx1-ubyte | |||
| (2) The directory layout of the dataset inside the training image in this example | |||
| workroot | |||
| ├── data | |||
| | ├── test | |||
| | └── train | |||
| 2. Single-dataset training requires the following predefined functions. | |||
| (1) Determine whether the task runs in the training environment or the debugging environment. | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image | |||
| elif environment == 'debug': | |||
| workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| (2)Copy single dataset from obs to training image. | |||
| def ObsToEnv(obs_data_url, data_dir): | |||
| try: | |||
| mox.file.copy_parallel(obs_data_url, data_dir) | |||
| print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||
| return | |||
| (3)Copy the output model to obs. | |||
| def EnvToObs(train_dir, obs_train_url): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||
| return | |||
| 3. Three parameters need to be defined. | |||
| --data_url is the dataset you selected on the Qizhi platform. | |||
| --data_url, --train_url and --device_target must be defined first in a single-dataset task, | |||
| otherwise an error will be reported. | |||
| There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| because they are predefined in the background, you only need to define them in your code. | |||
| 4. How the dataset is used | |||
| A single-dataset task takes data_url as its input and accesses the dataset inside the image through data_dir | |||
| (i.e. workroot + '/data'). | |||
| For details, please refer to the following sample code. | |||
| """ | |||
| import os | |||
| import argparse | |||
| import moxing as mox | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.common import set_seed | |||
| ### Determine whether the task runs in the training environment or the debugging environment ### | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' | |||
| elif environment == 'debug': | |||
| workroot = '/home/work' | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| ### Copy single dataset from obs to training image### | |||
| def ObsToEnv(obs_data_url, data_dir): | |||
| try: | |||
| mox.file.copy_parallel(obs_data_url, data_dir) | |||
| print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||
| return | |||
| ### Copy the output model to obs### | |||
| def EnvToObs(train_dir, obs_train_url): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||
| return | |||
| ### --data_url, --train_url and --device_target must be defined first in a single-dataset task, | |||
| ### otherwise an error will be reported. | |||
| ###There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| ###because they are predefined in the background, you only need to define them in your code. | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument('--data_url', | |||
| help='path to training/inference dataset folder', | |||
| default= WorkEnvironment('train') + '/data/') | |||
| parser.add_argument('--train_url', | |||
| help='model folder to save/load', | |||
| default= WorkEnvironment('train') + '/model/') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| if __name__ == "__main__": | |||
| args, unknown = parser.parse_known_args() | |||
| ### defining the training environment | |||
| environment = 'train' | |||
| workroot = WorkEnvironment(environment) | |||
| ###Initialize the data and model directories in the training image### | |||
| data_dir = workroot + '/data' | |||
| train_dir = workroot + '/model' | |||
| if not os.path.exists(data_dir): | |||
| os.makedirs(data_dir) | |||
| if not os.path.exists(train_dir): | |||
| os.makedirs(train_dir) | |||
| ### Copy the dataset from obs to the training image ### | |||
| ObsToEnv(args.data_url,data_dir) | |||
| ###Specifies the device CPU or Ascend NPU used for training### | |||
| context.set_context(mode=context.GRAPH_MODE, | |||
| device_target=args.device_target) | |||
| ds_train = create_dataset(os.path.join(data_dir, "train"), | |||
| cfg.batch_size) | |||
| if ds_train.get_dataset_size() == 0: | |||
| raise ValueError( | |||
| "Please check dataset size > 0 and batch_size <= dataset size") | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}) | |||
| else: | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}, | |||
| amp_level="O2") | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=train_dir, | |||
| config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| epoch_size = cfg['epoch_size'] | |||
| if (args.epoch_size): | |||
| epoch_size = args.epoch_size | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, | |||
| ds_train, | |||
| callbacks=[time_cb, ckpoint_cb, | |||
| LossMonitor()]) | |||
| ###Copy the trained model data from the local running environment back to obs, | |||
| ###and download it in the training task corresponding to the Qizhi platform | |||
| EnvToObs(train_dir, args.train_url) | |||
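| A hedged follow-up sketch: after training, the newest checkpoint written to train_dir can be evaluated on the test | |||
| split with the same Model object; the glob pattern is an assumption based on the "checkpoint_lenet" prefix used above. | |||
| from glob import glob | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| ckpt_list = sorted(glob(os.path.join(train_dir, "checkpoint_lenet*.ckpt")), key=os.path.getmtime) | |||
| latest_ckpt = ckpt_list[-1]  # assumes the training run above produced at least one checkpoint | |||
| load_param_into_net(network, load_checkpoint(latest_ckpt)) | |||
| ds_eval = create_dataset(os.path.join(data_dir, "test"), cfg.batch_size) | |||
| print("accuracy of", latest_ckpt, ":", model.eval(ds_eval, dataset_sink_mode=False)) | |||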
| @@ -1,205 +0,0 @@ | |||
| """ | |||
| ######################## distributed data-parallel train lenet example ######################## | |||
| This example is a data-parallel (multi-card) training tutorial for a single dataset. For single-card training please | |||
| refer to train.py; for multiple datasets please refer to the multi-dataset training tutorial train_for_multidataset.py. | |||
| ######################## Instructions for using the training environment ######################## | |||
| The image of the debugging environment and the image of the training environment are two different images, | |||
| and the working local directories are different. In the training task, you need to pay attention to the following points. | |||
| 1. (1) The structure of the dataset uploaded for single-dataset training in this example | |||
| MNISTData.zip | |||
| ├── test | |||
| │ ├── t10k-images-idx3-ubyte | |||
| │ └── t10k-labels-idx1-ubyte | |||
| └── train | |||
| ├── train-images-idx3-ubyte | |||
| └── train-labels-idx1-ubyte | |||
| (2) The directory layout of the dataset inside the training image in this example | |||
| workroot | |||
| ├── data | |||
| | ├── test | |||
| | └── train | |||
| 2. Single-dataset training requires the following predefined functions. | |||
| (1) Determine whether the task runs in the training environment or the debugging environment. | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image | |||
| elif environment == 'debug': | |||
| workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| (2)Copy single dataset from obs to training image. | |||
| def ObsToEnv(obs_data_url, data_dir): | |||
| try: | |||
| mox.file.copy_parallel(obs_data_url, data_dir) | |||
| print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||
| return | |||
| (3)Copy the output model to obs. | |||
| def EnvToObs(train_dir, obs_train_url): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||
| return | |||
| 3. Three parameters need to be defined. | |||
| --data_url is the dataset you selected on the Qizhi platform. | |||
| --data_url, --train_url and --device_target must be defined first in a single-dataset task, | |||
| otherwise an error will be reported. | |||
| There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| because they are predefined in the background, you only need to define them in your code. | |||
| 4. How the dataset is used | |||
| A single-dataset task takes data_url as its input and accesses the dataset inside the image through data_dir | |||
| (i.e. workroot + '/data'). | |||
| For details, please refer to the following sample code. | |||
| """ | |||
| import os | |||
| import argparse | |||
| from dataset_distributed import create_dataset_parallel | |||
| import moxing as mox | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.common import set_seed | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.context import ParallelMode | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| import mindspore.ops as ops | |||
| # set device_id and init | |||
| device_id = int(os.getenv('DEVICE_ID')) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") | |||
| context.set_context(device_id=device_id) | |||
| init() | |||
| ### Determine whether the task runs in the training environment or the debugging environment ### | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' | |||
| elif environment == 'debug': | |||
| workroot = '/home/work' | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| ### Copy single dataset from obs to training image### | |||
| def ObsToEnv(obs_data_url, data_dir): | |||
| try: | |||
| mox.file.copy_parallel(obs_data_url, data_dir) | |||
| print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||
| return | |||
| ### Copy the output model to obs### | |||
| def EnvToObs(train_dir, obs_train_url): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||
| return | |||
| ### --data_url, --train_url and --device_target must be defined first in a single-dataset task, | |||
| ### otherwise an error will be reported. | |||
| ###There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| ###because they are predefined in the background, you only need to define them in your code. | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument('--data_url', | |||
| help='path to training/inference dataset folder', | |||
| default= WorkEnvironment('train') + '/data/') | |||
| parser.add_argument('--train_url', | |||
| help='model folder to save/load', | |||
| default= WorkEnvironment('train') + '/model/') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| set_seed(114514) | |||
| if __name__ == "__main__": | |||
| args = parser.parse_args() | |||
| ### defining the training environment | |||
| environment = 'train' | |||
| workroot = WorkEnvironment(environment) | |||
| ###Initialize the data and model directories in the training image### | |||
| data_dir = workroot + '/data' | |||
| train_dir = workroot + '/model' | |||
| if not os.path.exists(data_dir): | |||
| os.makedirs(data_dir) | |||
| if not os.path.exists(train_dir): | |||
| os.makedirs(train_dir) | |||
| ### Copy the dataset from obs to the training image ### | |||
| ObsToEnv(args.data_url,data_dir) | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) | |||
| ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), | |||
| cfg.batch_size) | |||
| if ds_train.get_dataset_size() == 0: | |||
| raise ValueError( | |||
| "Please check dataset size > 0 and batch_size <= dataset size") | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}) | |||
| else: | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}, | |||
| amp_level="O2") | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| # Note that in data-parallel training each card saves its own checkpoint files, so each card needs its own save path. | |||
| # Here get_rank() is appended to the directory so the paths of different cards do not collide. | |||
| ckpoint_cb = ModelCheckpoint(prefix="data_parallel", | |||
| directory=train_dir + "/" + str(get_rank()) + "/", | |||
| config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| epoch_size = cfg['epoch_size'] | |||
| if (args.epoch_size): | |||
| epoch_size = args.epoch_size | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, | |||
| ds_train, | |||
| callbacks=[time_cb, ckpoint_cb, | |||
| LossMonitor()], dataset_sink_mode=True) | |||
| ###Copy the trained model data from the local running environment back to obs, | |||
| ###and download it in the training task corresponding to the Qizhi platform | |||
| EnvToObs(train_dir, args.train_url) | |||
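| A small hedged sketch of the per-card checkpoint layout produced above; the rank-0-only upload shown in the comment is | |||
| a possible refinement, not part of the original script. | |||
| rank, group = get_rank(), get_group_size() | |||
| print("rank {} of {} wrote its checkpoints under {}".format(rank, group, os.path.join(train_dir, str(rank)))) | |||
| # A possible refinement: guard the upload so that only rank 0 copies train_dir back, which avoids every card | |||
| # uploading the same directory to train_url concurrently: | |||
| # if rank == 0: | |||
| #     EnvToObs(train_dir, args.train_url) | |||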
| @@ -1,92 +0,0 @@ | |||
| """ | |||
| ######################## train lenet example ######################## | |||
| train lenet and get network model files(.ckpt) | |||
| Training on the intelligent computing network currently supports single-dataset training only and does not require | |||
| the obs copy step; it only needs the two paths below to be defined and used directly: | |||
| train_dir = '/cache/output' #The location of the output | |||
| data_dir = '/cache/dataset' #The location of the dataset | |||
| """ | |||
| #!/usr/bin/python | |||
| #coding=utf-8 | |||
| import os | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.common import set_seed | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| set_seed(1) | |||
| if __name__ == "__main__": | |||
| args, unknown = parser.parse_known_args() | |||
| print('args:') | |||
| print(args) | |||
| ###define two parameters and then call it directly### | |||
| train_dir = '/cache/output' | |||
| data_dir = '/cache/dataset' | |||
| ###Specifies the device CPU or Ascend NPU used for training### | |||
| context.set_context(mode=context.GRAPH_MODE, | |||
| device_target=args.device_target) | |||
| ds_train = create_dataset(os.path.join(data_dir, "train"), | |||
| cfg.batch_size) | |||
| if ds_train.get_dataset_size() == 0: | |||
| raise ValueError( | |||
| "Please check dataset size > 0 and batch_size <= dataset size") | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}) | |||
| else: | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}, | |||
| amp_level="O2") | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=train_dir, | |||
| config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| epoch_size = cfg['epoch_size'] | |||
| if (args.epoch_size): | |||
| epoch_size = args.epoch_size | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, | |||
| ds_train, | |||
| callbacks=[time_cb, ckpoint_cb, | |||
| LossMonitor()]) | |||
| print("============== Finish Training ==============") | |||
| @@ -1,237 +0,0 @@ | |||
| """ | |||
| ######################## multi-dataset train lenet example ######################## | |||
| This example is a multi-dataset training tutorial. If you are using a single dataset, please refer to the | |||
| single-dataset training tutorial train.py. This example cannot be used with a single dataset! | |||
| """ | |||
| """ | |||
| ######################## Instructions for using the training environment ######################## | |||
| 1. (1) The structure of the datasets uploaded for multi-dataset training in this example | |||
| MNISTData.zip | |||
| ├── test | |||
| │ ├── t10k-images-idx3-ubyte | |||
| │ └── t10k-labels-idx1-ubyte | |||
| └── train | |||
| ├── train-images-idx3-ubyte | |||
| └── train-labels-idx1-ubyte | |||
| checkpoint_lenet-1_1875.zip | |||
| ├── checkpoint_lenet-1_1875.ckpt | |||
| (2) The directory layout of the datasets inside the training image in this example | |||
| workroot | |||
| ├── MNISTData | |||
| | ├── test | |||
| | └── train | |||
| └── checkpoint_lenet-1_1875 | |||
| ├── checkpoint_lenet-1_1875.ckpt | |||
| 2. Multi-dataset training requires the following predefined functions. | |||
| (1) Determine whether the task runs in the training environment or the debugging environment. | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image | |||
| elif environment == 'debug': | |||
| workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| (2)Copy multiple datasets from obs to training image | |||
| def MultiObsToEnv(multi_data_url, workroot): | |||
| multi_data_json = json.loads(multi_data_url) #Parse multi_data_url | |||
| for i in range(len(multi_data_json)): | |||
| path = workroot + "/" + multi_data_json[i]["dataset_name"] | |||
| if not os.path.exists(path): | |||
| os.makedirs(path) | |||
| try: | |||
| mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||
| print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], | |||
| path)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format( | |||
| multi_data_json[i]["dataset_url"], path) + str(e)) | |||
| return | |||
| ***The input and output of the MultiObsToEnv function in this example: | |||
| Input for multi_data_url: | |||
| [ | |||
| { | |||
| "dataset_url": "s3://test-opendata/attachment/e/a/eae3a316-42d6-4a43-a484-1fa573eab388e | |||
| ae3a316-42d6-4a43-a484-1fa573eab388/", #obs path of the dataset | |||
| "dataset_name": "MNIST_Data" #the name of the dataset | |||
| }, | |||
| { | |||
| "dataset_url": "s3://test-opendata/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf82c | |||
| 59be66-64ec-41ca-b311-f51a486eabf8/", | |||
| "dataset_name": "checkpoint_lenet-1_1875" | |||
| } | |||
| ] | |||
| Purpose of multi_data_url: | |||
| The purpose of the MultiObsToEnv function is to copy multiple datasets from obs to the training image | |||
| and build the dataset path in the training image. | |||
| For example, the path of the MNIST_Data dataset in this example is /home/work/user-job-dir/MNISTData, | |||
| The path to the checkpoint_lenet-1_1875 dataset is /home/work/user-job-dir/checkpoint_lenet-1_1875 | |||
| (3)Copy the output model to obs. | |||
| def EnvToObs(obs_train_url, train_dir): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir, | |||
| obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir, | |||
| obs_train_url) + str(e)) | |||
| return | |||
| 3. Four parameters need to be defined. | |||
| --data_url is the first dataset you selected on the Qizhi platform. | |||
| --multi_data_url is the list of datasets you selected on the Qizhi platform. | |||
| --data_url, --multi_data_url, --train_url and --device_target must be defined first in a multi-dataset task, | |||
| otherwise an error will be reported. | |||
| There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| because they are predefined in the background, you only need to define them in your code | |||
| 4. How the datasets are used | |||
| A multi-dataset task takes multi_data_url as its input; inside the training image each dataset is accessed through | |||
| workroot + "/" + the dataset name + "/" + the file or folder name within that dataset. | |||
| For example, the path of the train folder of the MNIST_Data dataset in this example is | |||
| workroot + "/MNIST_Data" + "/train" | |||
| For details, please refer to the following sample code. | |||
| """ | |||
| import os | |||
| import argparse | |||
| import moxing as mox | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from lenet import LeNet5 | |||
| import json | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.common import set_seed | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| ### Determine whether the task runs in the training environment or the debugging environment ### | |||
| def WorkEnvironment(environment): | |||
| if environment == 'train': | |||
| workroot = '/home/work/user-job-dir' | |||
| elif environment == 'debug': | |||
| workroot = '/home/ma-user/work' | |||
| print('current work mode:' + environment + ', workroot:' + workroot) | |||
| return workroot | |||
| ### Copy multiple datasets from obs to training image ### | |||
| def MultiObsToEnv(multi_data_url, workroot): | |||
| multi_data_json = json.loads(multi_data_url) | |||
| for i in range(len(multi_data_json)): | |||
| path = workroot + "/" + multi_data_json[i]["dataset_name"] | |||
| if not os.path.exists(path): | |||
| os.makedirs(path) | |||
| try: | |||
| mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||
| print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], | |||
| path)) | |||
| except Exception as e: | |||
| print('moxing download {} to {} failed: '.format( | |||
| multi_data_json[i]["dataset_url"], path) + str(e)) | |||
| return | |||
| ### Copy the output model to obs ### | |||
| def EnvToObs(obs_train_url, train_dir): | |||
| try: | |||
| mox.file.copy_parallel(train_dir, obs_train_url) | |||
| print("Successfully Upload {} to {}".format(train_dir, | |||
| obs_train_url)) | |||
| except Exception as e: | |||
| print('moxing upload {} to {} failed: '.format(train_dir, | |||
| obs_train_url) + str(e)) | |||
| return | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| ### --data_url, --multi_data_url, --train_url and --device_target must be defined first in a multi-dataset task, | |||
| ### otherwise an error will be reported. | |||
| ### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||
| ### because they are predefined in the background, you only need to define them in your code. | |||
| parser.add_argument('--data_url', | |||
| help='path to training/inference dataset folder', | |||
| default= WorkEnvironment('train') + '/data/') | |||
| parser.add_argument('--multi_data_url', | |||
| help='path to multi dataset', | |||
| default= WorkEnvironment('train')) | |||
| parser.add_argument('--train_url', | |||
| help='model folder to save/load', | |||
| default= WorkEnvironment('train') + '/model/') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| if __name__ == "__main__": | |||
| args, unknown = parser.parse_known_args() | |||
| # After defining the training environment, first call WorkEnvironment and then MultiObsToEnv to | |||
| # copy the multiple datasets from obs to the training image | |||
| environment = 'train' | |||
| workroot = WorkEnvironment(environment) | |||
| MultiObsToEnv(args.multi_data_url, workroot) | |||
| ### Define the output path in the training image | |||
| train_dir = workroot + '/model' | |||
| if not os.path.exists(train_dir): | |||
| os.makedirs(train_dir) | |||
| context.set_context(mode=context.GRAPH_MODE, | |||
| device_target=args.device_target) | |||
| #The dataset path used here: workroot + "/MNISTData" + "/train" | |||
| ds_train = create_dataset(os.path.join(workroot + "/MNISTData", "train"), | |||
| cfg.batch_size) | |||
| if ds_train.get_dataset_size() == 0: | |||
| raise ValueError( | |||
| "Please check dataset size > 0 and batch_size <= dataset size") | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| ### Load the trained model:workroot + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" | |||
| load_param_into_net(network, load_checkpoint(os.path.join(workroot + "/checkpoint_lenet-1_1875", | |||
| "checkpoint_lenet-1_1875.ckpt"))) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||
| else: | |||
| model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=train_dir, | |||
| config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| epoch_size = cfg['epoch_size'] | |||
| if (args.epoch_size): | |||
| epoch_size = args.epoch_size | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, | |||
| ds_train, | |||
| callbacks=[time_cb, ckpoint_cb, | |||
| LossMonitor()]) | |||
| ###Copy the trained model data from the local running environment back to obs, | |||
| ###and download it in the training task corresponding to the Qizhi platform | |||
| EnvToObs(train_dir, args.train_url) | |||
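| A self-contained sketch of how multi_data_url is interpreted, handy for checking the JSON locally before submitting a | |||
| job; the entries below are placeholders, not real OBS paths: | |||
| import json | |||
| sample_multi_data_url = json.dumps([ | |||
|     {"dataset_url": "s3://bucket/path/MNIST_Data/", "dataset_name": "MNIST_Data"}, | |||
|     {"dataset_url": "s3://bucket/path/checkpoint_lenet-1_1875/", "dataset_name": "checkpoint_lenet-1_1875"}, | |||
| ]) | |||
| for entry in json.loads(sample_multi_data_url): | |||
|     # MultiObsToEnv copies each dataset_url into workroot/<dataset_name> inside the training image. | |||
|     print(entry["dataset_name"], "->", "/home/work/user-job-dir/" + entry["dataset_name"]) | |||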