| @@ -0,0 +1,61 @@ | |||
| # Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) | |||
| enable_modelarts: False | |||
| data_url: "" | |||
| train_url: "" | |||
| checkpoint_url: "" | |||
| data_path: "/cache/data" | |||
| output_path: "/cache/train" | |||
| load_path: "/cache/checkpoint_path" | |||
| checkpoint_path: './checkpoint/' | |||
| checkpoint_file: './checkpoint/checkpoint_alexnet-30_1562.ckpt' | |||
| device_target: Ascend | |||
| enable_profiling: False | |||
| data_path_local: '/data/hcm/data/ImageNet_Original/' | |||
| ckpt_path_local: '/data/hcm/data/ckpt_alexnet/checkpoint_alexnet-30_1562.ckpt' | |||
| # ============================================================================== | |||
| # Training options | |||
| num_classes: 1000 | |||
| learning_rate: 0.13 | |||
| momentum: 0.9 | |||
| epoch_size: 150 | |||
| batch_size: 256 | |||
| buffer_size: None | |||
| image_height: 224 | |||
| image_width: 224 | |||
| save_checkpoint_steps: 625 | |||
| keep_checkpoint_max: 10 | |||
| air_name: 'alexnet.air' | |||
| weight_decay: 0.0001 | |||
| loss_scale: 1024 | |||
| is_dynamic_loss_scale: 0 | |||
| # Model Description | |||
| model_name: alexnet | |||
| file_name: 'alexnet' | |||
| file_format: 'AIR' | |||
| dataset_name: 'imagenet' | |||
| sink_size: -1 | |||
| dataset_sink_mode: True | |||
| device_id: 0 | |||
| save_checkpoint: True | |||
| save_checkpoint_epochs: 2 | |||
| lr: 0.01 | |||
| --- | |||
| # Config description for each option | |||
| enable_modelarts: 'Whether training on modelarts, default: False' | |||
| data_url: 'Dataset url for obs' | |||
| train_url: 'Training output url for obs' | |||
| data_path: 'Dataset path for local' | |||
| output_path: 'Training output path for local' | |||
| device_target: 'Target device type' | |||
| enable_profiling: 'Whether enable profiling while training, default: False' | |||
| --- | |||
| device_target: ['Ascend', 'GPU', 'CPU'] | |||
| @@ -0,0 +1,56 @@ | |||
| # Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) | |||
| enable_modelarts: False | |||
| data_url: "" | |||
| train_url: "" | |||
| checkpoint_url: "" | |||
| data_path: "/cache/data" | |||
| output_path: "/cache/train" | |||
| load_path: "/cache/checkpoint_path" | |||
| checkpoint_path: './checkpoint/' | |||
| checkpoint_file: './checkpoint/checkpoint_alexnet-30_1562.ckpt' | |||
| device_target: Ascend | |||
| enable_profiling: False | |||
| data_path_local: '/data/hcm/data/cifar-10-batches-bin/' | |||
| ckpt_path_local: '/data/hcm/data/ckpt_alexnet/checkpoint_alexnet-30_1562.ckpt' | |||
| # ============================================================================== | |||
| # Training options | |||
| epoch_size: 30 | |||
| keep_checkpoint_max: 10 | |||
| num_classes: 10 | |||
| learning_rate: 0.002 | |||
| momentum: 0.9 | |||
| batch_size: 32 | |||
| buffer_size: 1000 | |||
| image_height: 227 | |||
| image_width: 227 | |||
| save_checkpoint_steps: 1562 | |||
| air_name: 'alexnet.air' | |||
| dataset_name: 'cifar10' | |||
| sink_size: -1 | |||
| dataset_sink_mode: True | |||
| device_id: 0 | |||
| save_checkpoint: True | |||
| save_checkpoint_epochs: 2 | |||
| lr: 0.01 | |||
| # Model Description | |||
| model_name: alexnet | |||
| file_name: 'alexnet' | |||
| file_format: 'AIR' | |||
| --- | |||
| # Config description for each option | |||
| enable_modelarts: 'Whether training on modelarts, default: False' | |||
| data_url: 'Dataset url for obs' | |||
| train_url: 'Training output url for obs' | |||
| data_path: 'Dataset path for local' | |||
| output_path: 'Training output path for local' | |||
| device_target: 'Target device type' | |||
| enable_profiling: 'Whether enable profiling while training, default: False' | |||
| --- | |||
| device_target: ['Ascend', 'GPU', 'CPU'] | |||
| @@ -18,9 +18,13 @@ eval alexnet according to model file: | |||
| python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt | |||
| """ | |||
| import ast | |||
| import argparse | |||
| from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg | |||
| import os | |||
| # import sys | |||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||
| from utils.config import config | |||
| from utils.moxing_adapter import moxing_wrapper | |||
| from utils.device_adapter import get_device_id, get_device_num | |||
| from src.dataset import create_dataset_cifar10, create_dataset_imagenet | |||
| from src.alexnet import AlexNet | |||
| import mindspore.nn as nn | |||
| @@ -28,51 +32,52 @@ from mindspore import context | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.communication.management import init | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') | |||
| parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], | |||
| help='dataset name.') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ | |||
| path where the trained ckpt file') | |||
| parser.add_argument('--dataset_sink_mode', type=ast.literal_eval, | |||
| default=True, help='dataset_sink_mode is False or True') | |||
| parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend. (Default: 0)') | |||
| args = parser.parse_args() | |||
| if os.path.exists(config.data_path_local): | |||
| config.data_path = config.data_path_local | |||
| load_path = config.ckpt_path_local | |||
| else: | |||
| load_path = os.path.join(config.data_path, 'checkpoint_alexnet-30_1562.ckpt') | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| def modelarts_process(): | |||
| pass | |||
| @moxing_wrapper(pre_process=modelarts_process) | |||
| def eval_alexnet(): | |||
| print("============== Starting Testing ==============") | |||
| if args.dataset_name == 'cifar10': | |||
| cfg = alexnet_cifar10_cfg | |||
| network = AlexNet(cfg.num_classes, phase='test') | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) | |||
| ds_eval = create_dataset_cifar10(args.data_path, cfg.batch_size, status="test", target=args.device_target) | |||
| device_num = get_device_num() | |||
| if device_num > 1: | |||
| # context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target='Davinci', save_graphs=False) | |||
| if config.device_target == "Ascend": | |||
| context.set_context(device_id=get_device_id()) | |||
| init() | |||
| elif config.device_target == "GPU": | |||
| init() | |||
| param_dict = load_checkpoint(args.ckpt_path) | |||
| print("load checkpoint from [{}].".format(args.ckpt_path)) | |||
| if config.dataset_name == 'cifar10': | |||
| network = AlexNet(config.num_classes, phase='test') | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| opt = nn.Momentum(network.trainable_params(), config.learning_rate, config.momentum) | |||
| ds_eval = create_dataset_cifar10(config.data_path, config.batch_size, status="test", \ | |||
| target=config.device_target) | |||
| param_dict = load_checkpoint(load_path) | |||
| print("load checkpoint from [{}].".format(load_path)) | |||
| load_param_into_net(network, param_dict) | |||
| network.set_train(False) | |||
| model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) | |||
| elif args.dataset_name == 'imagenet': | |||
| cfg = alexnet_imagenet_cfg | |||
| network = AlexNet(cfg.num_classes, phase='test') | |||
| elif config.dataset_name == 'imagenet': | |||
| network = AlexNet(config.num_classes, phase='test') | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| ds_eval = create_dataset_imagenet(args.data_path, cfg.batch_size, training=False) | |||
| param_dict = load_checkpoint(args.ckpt_path) | |||
| print("load checkpoint from [{}].".format(args.ckpt_path)) | |||
| ds_eval = create_dataset_imagenet(config.data_path, config.batch_size, training=False) | |||
| param_dict = load_checkpoint(load_path) | |||
| print("load checkpoint from [{}].".format(load_path)) | |||
| load_param_into_net(network, param_dict) | |||
| network.set_train(False) | |||
| model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) | |||
| else: | |||
| @@ -81,5 +86,9 @@ if __name__ == "__main__": | |||
| if ds_eval.get_dataset_size() == 0: | |||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||
| result = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) | |||
| result = model.eval(ds_eval, dataset_sink_mode=config.dataset_sink_mode) | |||
| print("result : {}".format(result)) | |||
| if __name__ == "__main__": | |||
| eval_alexnet() | |||
| @@ -16,44 +16,32 @@ | |||
| ##############export checkpoint file into air, onnx, mindir models################# | |||
| python export.py | |||
| """ | |||
| import argparse | |||
| import numpy as np | |||
| import os | |||
| # import sys | |||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||
| from utils.config import config | |||
| import numpy as np | |||
| import mindspore as ms | |||
| from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export | |||
| from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg | |||
| from src.alexnet import AlexNet | |||
| parser = argparse.ArgumentParser(description='Classification') | |||
| parser.add_argument("--device_id", type=int, default=0, help="Device id") | |||
| parser.add_argument("--batch_size", type=int, default=1, help="batch size") | |||
| parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], | |||
| help='please choose dataset: imagenet or cifar10.') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", | |||
| choices=['Ascend', 'GPU', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.") | |||
| parser.add_argument("--file_name", type=str, default="alexnet", help="output file name.") | |||
| parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format") | |||
| args_opt = parser.parse_args() | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) | |||
| if args_opt.device_target == "Ascend": | |||
| context.set_context(device_id=args_opt.device_id) | |||
| if __name__ == '__main__': | |||
| if args_opt.dataset_name == 'cifar10': | |||
| cfg = alexnet_cifar10_cfg | |||
| elif args_opt.dataset_name == 'imagenet': | |||
| cfg = alexnet_imagenet_cfg | |||
| else: | |||
| raise ValueError("dataset is not support.") | |||
| if os.path.exists(config.data_path_local): | |||
| ckpt_path = config.ckpt_path_local | |||
| else: | |||
| ckpt_path = os.path.join(config.data_path, 'checkpoint_alexnet-30_1562.ckpt') | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||
| if config.device_target == "Ascend": | |||
| context.set_context(device_id=config.device_id) | |||
| net = AlexNet(num_classes=cfg.num_classes) | |||
| if __name__ == '__main__': | |||
| net = AlexNet(num_classes=config.num_classes) | |||
| param_dict = load_checkpoint(args_opt.ckpt_file) | |||
| param_dict = load_checkpoint(ckpt_path) | |||
| load_param_into_net(net, param_dict) | |||
| input_arr = Tensor(np.zeros([args_opt.batch_size, 3, cfg.image_height, cfg.image_width]), ms.float32) | |||
| export(net, input_arr, file_name=args_opt.file_name, file_format=args_opt.file_format) | |||
| input_arr = Tensor(np.zeros([config.batch_size, 3, config.image_height, config.image_width]), ms.float32) | |||
| export(net, input_arr, file_name=config.file_name, file_format=config.file_format) | |||
| @@ -0,0 +1,35 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| # an simple tutorial as follows, more parameters can be setting | |||
| # echo "Usage: sh run_standalone_eval_ascend.sh [cifar10|imagenet] [DATA_PATH] [CKPT_PATH] [DEVICE_ID]" | |||
| BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) | |||
| if [ $# -ge 1 ]; then | |||
| if [ $1 == 'imagenet' ]; then | |||
| CONFIG_FILE="${BASE_PATH}/../config_imagenet.yaml" | |||
| elif [ $1 == 'cifar10' ]; then | |||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||
| else | |||
| echo "Unrecognized parameter" | |||
| exit 1 | |||
| fi | |||
| else | |||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||
| fi | |||
| # python eval.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH --ckpt_path=$CKPT_PATH --device_id=$DEVICE_ID --device_target="Ascend" > eval_log 2>&1 & | |||
| python ../eval.py --config_path=$CONFIG_FILE > eval_log 2>&1 & | |||
| @@ -0,0 +1,35 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| # an simple tutorial as follows, more parameters can be setting | |||
| # echo "Usage: sh run_standalone_train_ascend.sh [cifar10|imagenet] [DATA_PATH] [DEVICE_ID]" | |||
| BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) | |||
| if [ $# -ge 1 ]; then | |||
| if [ $1 == 'imagenet' ]; then | |||
| CONFIG_FILE="${BASE_PATH}/../config_imagenet.yaml" | |||
| elif [ $1 == 'cifar10' ]; then | |||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||
| else | |||
| echo "Unrecognized parameter" | |||
| exit 1 | |||
| fi | |||
| else | |||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||
| fi | |||
| # python train.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH --device_id=$DEVICE_ID --device_target="Ascend" > log 2>&1 & | |||
| python ../train.py --config_path=$CONFIG_FILE > log 2>&1 & | |||
| @@ -17,7 +17,6 @@ Produce the dataset | |||
| """ | |||
| import os | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| import mindspore.dataset.vision.c_transforms as CV | |||
| @@ -18,10 +18,14 @@ train alexnet and get network model files(.ckpt) : | |||
| python train.py --data_path /YourDataPath | |||
| """ | |||
| import ast | |||
| import argparse | |||
| import os | |||
| from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg | |||
| # import sys | |||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||
| from utils.config import config | |||
| from utils.moxing_adapter import moxing_wrapper | |||
| from utils.device_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||
| # from src.config import alexnet_cifar10_config, alexnet_imagenet_config | |||
| from src.dataset import create_dataset_cifar10, create_dataset_imagenet | |||
| from src.generator_lr import get_lr_cifar10, get_lr_imagenet | |||
| from src.alexnet import AlexNet | |||
| @@ -40,88 +44,84 @@ from mindspore.common import set_seed | |||
| set_seed(1) | |||
| de.config.set_seed(1) | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') | |||
| parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], | |||
| help='dataset name.') | |||
| parser.add_argument('--sink_size', type=int, default=-1, help='control the amount of data in each sink') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ | |||
| path where the trained ckpt file') | |||
| parser.add_argument('--dataset_sink_mode', type=ast.literal_eval, | |||
| default=True, help='dataset_sink_mode is False or True') | |||
| parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend. (Default: 0)') | |||
| args = parser.parse_args() | |||
| device_num = int(os.environ.get("DEVICE_NUM", 1)) | |||
| if args.dataset_name == "cifar10": | |||
| cfg = alexnet_cifar10_cfg | |||
| if os.path.exists(config.data_path_local): | |||
| config.data_path = config.data_path_local | |||
| config.checkpoint_path = os.path.join(config.checkpoint_path, str(get_rank_id())) | |||
| else: | |||
| config.checkpoint_path = os.path.join(config.output_path, config.checkpoint_path, str(get_rank_id())) | |||
| def modelarts_pre_process(): | |||
| pass | |||
| @moxing_wrapper(pre_process=modelarts_pre_process) | |||
| def train_alexnet(): | |||
| print(config) | |||
| print('device id:', get_device_id()) | |||
| print('device num:', get_device_num()) | |||
| print('rank id:', get_rank_id()) | |||
| print('job id:', get_job_id()) | |||
| device_target = config.device_target | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||
| context.set_context(save_graphs=False) | |||
| device_num = get_device_num() | |||
| if config.dataset_name == "cifar10": | |||
| if device_num > 1: | |||
| cfg.learning_rate = cfg.learning_rate * device_num | |||
| cfg.epoch_size = cfg.epoch_size * 2 | |||
| elif args.dataset_name == "imagenet": | |||
| cfg = alexnet_imagenet_cfg | |||
| config.learning_rate = config.learning_rate * device_num | |||
| config.epoch_size = config.epoch_size * 2 | |||
| elif config.dataset_name == "imagenet": | |||
| pass | |||
| else: | |||
| raise ValueError("Unsupported dataset.") | |||
| device_target = args.device_target | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| context.set_context(save_graphs=False) | |||
| if device_target == "Ascend": | |||
| context.set_context(device_id=args.device_id) | |||
| if device_num > 1: | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||
| gradients_mean=True) | |||
| if device_num > 1: | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num=device_num, \ | |||
| parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) | |||
| if device_target == "Ascend": | |||
| context.set_context(device_id=get_device_id()) | |||
| init() | |||
| elif device_target == "GPU": | |||
| if device_num > 1: | |||
| elif device_target == "GPU": | |||
| init() | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||
| gradients_mean=True) | |||
| else: | |||
| raise ValueError("Unsupported platform.") | |||
| context.set_context(device_id=get_device_id()) | |||
| if args.dataset_name == "cifar10": | |||
| ds_train = create_dataset_cifar10(args.data_path, cfg.batch_size, target=args.device_target) | |||
| elif args.dataset_name == "imagenet": | |||
| ds_train = create_dataset_imagenet(args.data_path, cfg.batch_size) | |||
| if config.dataset_name == "cifar10": | |||
| ds_train = create_dataset_cifar10(config.data_path, config.batch_size, target=config.device_target) | |||
| elif config.dataset_name == "imagenet": | |||
| ds_train = create_dataset_imagenet(config.data_path, config.batch_size) | |||
| else: | |||
| raise ValueError("Unsupported dataset.") | |||
| if ds_train.get_dataset_size() == 0: | |||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||
| network = AlexNet(cfg.num_classes, phase='train') | |||
| network = AlexNet(config.num_classes, phase='train') | |||
| loss_scale_manager = None | |||
| metrics = None | |||
| step_per_epoch = ds_train.get_dataset_size() if args.sink_size == -1 else args.sink_size | |||
| if args.dataset_name == 'cifar10': | |||
| step_per_epoch = ds_train.get_dataset_size() if config.sink_size == -1 else config.sink_size | |||
| if config.dataset_name == 'cifar10': | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| lr = Tensor(get_lr_cifar10(0, cfg.learning_rate, cfg.epoch_size, step_per_epoch)) | |||
| opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum) | |||
| lr = Tensor(get_lr_cifar10(0, config.learning_rate, config.epoch_size, step_per_epoch)) | |||
| opt = nn.Momentum(network.trainable_params(), lr, config.momentum) | |||
| metrics = {"Accuracy": Accuracy()} | |||
| elif args.dataset_name == 'imagenet': | |||
| elif config.dataset_name == 'imagenet': | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| lr = Tensor(get_lr_imagenet(cfg.learning_rate, cfg.epoch_size, step_per_epoch)) | |||
| lr = Tensor(get_lr_imagenet(config.learning_rate, config.epoch_size, step_per_epoch)) | |||
| opt = nn.Momentum(params=get_param_groups(network), | |||
| learning_rate=lr, | |||
| momentum=cfg.momentum, | |||
| weight_decay=cfg.weight_decay, | |||
| loss_scale=cfg.loss_scale) | |||
| momentum=config.momentum, | |||
| weight_decay=config.weight_decay, | |||
| loss_scale=config.loss_scale) | |||
| from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager | |||
| if cfg.is_dynamic_loss_scale == 1: | |||
| if config.is_dynamic_loss_scale == 1: | |||
| loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000) | |||
| else: | |||
| loss_scale_manager = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False) | |||
| loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) | |||
| else: | |||
| raise ValueError("Unsupported dataset.") | |||
| @@ -135,15 +135,18 @@ if __name__ == "__main__": | |||
| raise ValueError("Unsupported platform.") | |||
| if device_num > 1: | |||
| ckpt_save_dir = os.path.join(args.ckpt_path + "_" + str(get_rank())) | |||
| ckpt_save_dir = os.path.join(config.checkpoint_path + "_" + str(get_rank())) | |||
| else: | |||
| ckpt_save_dir = args.ckpt_path | |||
| ckpt_save_dir = config.checkpoint_path | |||
| time_cb = TimeMonitor(data_size=step_per_epoch) | |||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps, | |||
| keep_checkpoint_max=config.keep_checkpoint_max) | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", directory=ckpt_save_dir, config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| model.train(cfg.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], | |||
| dataset_sink_mode=args.dataset_sink_mode, sink_size=args.sink_size) | |||
| model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], | |||
| dataset_sink_mode=config.dataset_sink_mode, sink_size=config.sink_size) | |||
| if __name__ == "__main__": | |||
| train_alexnet() | |||
| @@ -0,0 +1,127 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Parse arguments""" | |||
| import os | |||
| import ast | |||
| import argparse | |||
| from pprint import pprint, pformat | |||
| import yaml | |||
| class Config: | |||
| """ | |||
| Configuration namespace. Convert dictionary to members. | |||
| """ | |||
| def __init__(self, cfg_dict): | |||
| for k, v in cfg_dict.items(): | |||
| if isinstance(v, (list, tuple)): | |||
| setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) | |||
| else: | |||
| setattr(self, k, Config(v) if isinstance(v, dict) else v) | |||
| def __str__(self): | |||
| return pformat(self.__dict__) | |||
| def __repr__(self): | |||
| return self.__str__() | |||
| def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"): | |||
| """ | |||
| Parse command line arguments to the configuration according to the default yaml. | |||
| Args: | |||
| parser: Parent parser. | |||
| cfg: Base configuration. | |||
| helper: Helper description. | |||
| cfg_path: Path to the default yaml config. | |||
| """ | |||
| parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]", | |||
| parents=[parser]) | |||
| helper = {} if helper is None else helper | |||
| choices = {} if choices is None else choices | |||
| for item in cfg: | |||
| if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): | |||
| help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path) | |||
| choice = choices[item] if item in choices else None | |||
| if isinstance(cfg[item], bool): | |||
| parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice, | |||
| help=help_description) | |||
| else: | |||
| parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, | |||
| help=help_description) | |||
| args = parser.parse_args() | |||
| return args | |||
| def parse_yaml(yaml_path): | |||
| """ | |||
| Parse the yaml config file. | |||
| Args: | |||
| yaml_path: Path to the yaml config. | |||
| """ | |||
| with open(yaml_path, 'r') as fin: | |||
| try: | |||
| cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) | |||
| cfgs = [x for x in cfgs] | |||
| if len(cfgs) == 1: | |||
| cfg_helper = {} | |||
| cfg = cfgs[0] | |||
| cfg_choices = {} | |||
| elif len(cfgs) == 2: | |||
| cfg, cfg_helper = cfgs | |||
| cfg_choices = {} | |||
| elif len(cfgs) == 3: | |||
| cfg, cfg_helper, cfg_choices = cfgs | |||
| else: | |||
| raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml") | |||
| print(cfg_helper) | |||
| except: | |||
| raise ValueError("Failed to parse yaml") | |||
| return cfg, cfg_helper, cfg_choices | |||
| def merge(args, cfg): | |||
| """ | |||
| Merge the base config from yaml file and command line arguments. | |||
| Args: | |||
| args: Command line arguments. | |||
| cfg: Base configuration. | |||
| """ | |||
| args_var = vars(args) | |||
| for item in args_var: | |||
| cfg[item] = args_var[item] | |||
| return cfg | |||
| def get_config(): | |||
| """ | |||
| Get Config according to the yaml file and cli arguments. | |||
| """ | |||
| parser = argparse.ArgumentParser(description="default name", add_help=False) | |||
| current_dir = os.path.dirname(os.path.abspath(__file__)) | |||
| parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"), | |||
| help="Config file path") | |||
| path_args, _ = parser.parse_known_args() | |||
| default, helper, choices = parse_yaml(path_args.config_path) | |||
| pprint(default) | |||
| args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) | |||
| final_config = merge(args, default) | |||
| return Config(final_config) | |||
| config = get_config() | |||
| @@ -0,0 +1,27 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Device adapter for ModelArts""" | |||
| from .config import config | |||
| if config.enable_modelarts: | |||
| from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||
| else: | |||
| from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||
| __all__ = [ | |||
| "get_device_id", "get_device_num", "get_rank_id", "get_job_id" | |||
| ] | |||
| @@ -0,0 +1,36 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Local adapter""" | |||
| import os | |||
| def get_device_id(): | |||
| device_id = os.getenv('DEVICE_ID', '0') | |||
| return int(device_id) | |||
| def get_device_num(): | |||
| device_num = os.getenv('RANK_SIZE', '1') | |||
| return int(device_num) | |||
| def get_rank_id(): | |||
| global_rank_id = os.getenv('RANK_ID', '0') | |||
| return int(global_rank_id) | |||
def get_job_id():
    """Return a fixed job-id string for non-ModelArts (local) runs."""
    return "Local Job"
| @@ -0,0 +1,122 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Moxing adapter for ModelArts""" | |||
| import os | |||
| import functools | |||
| from mindspore import context | |||
| from mindspore.profiler import Profiler | |||
| from .config import config | |||
| _global_sync_count = 0 | |||
def get_device_id():
    """Return the device index from the DEVICE_ID env var (default 0)."""
    return int(os.getenv('DEVICE_ID', '0'))
def get_device_num():
    """Return the total device count from the RANK_SIZE env var (default 1)."""
    return int(os.getenv('RANK_SIZE', '1'))
def get_rank_id():
    """Return the global rank from the RANK_ID env var (default 0)."""
    return int(os.getenv('RANK_ID', '0'))
def get_job_id():
    """Return the ModelArts job id from the JOB_ID env var, or "default".

    Bug fix: `os.getenv('JOB_ID')` returns None when the variable is unset,
    and the original `job_id != ""` check let that None leak through to the
    caller. Treat both unset and empty as "default".
    """
    job_id = os.getenv('JOB_ID')
    return job_id if job_id else "default"
def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.
    """
    # moxing is only available inside ModelArts jobs, hence the local import.
    import moxing as mox
    import time
    global _global_sync_count
    # One lock-file name per sync call; every process on the server computes
    # the same name because the counter advances identically in each.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices as most.
    # Exactly one device per server performs the copy; the rest fall through
    # to the wait loop below until the lock file appears.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            # The flag may already exist (e.g. created by a racing process);
            # best-effort creation is sufficient here.
            pass
        print("===save flag===")

    # Poll until the copying process has written the lock file.
    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)
    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional callable run after data download, before run_func.
        post_process: Optional callable run after run_func, before upload
            (ModelArts only).

    Returns:
        A decorator that preserves run_func's metadata and return value.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))
                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)
                if pre_process:
                    pre_process()

            # Bind the profiler once so the later check cannot hit an unbound
            # local if config.enable_profiling is mutated during run_func.
            profiler = Profiler() if config.enable_profiling else None
            # Bug fix: the original discarded run_func's return value.
            result = run_func(*args, **kwargs)
            if profiler is not None:
                profiler.analyse()

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()
                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper
| @@ -0,0 +1,56 @@ | |||
# Builtin configurations (DO NOT CHANGE these configurations unless you know exactly what you are doing)
| enable_modelarts: False | |||
| data_url: "" | |||
| train_url: "" | |||
| checkpoint_url: "" | |||
| data_path: "/cache/data" | |||
| output_path: "/cache/train" | |||
| load_path: "/cache/checkpoint_path" | |||
| checkpoint_path: './checkpoint/' | |||
| checkpoint_file: './checkpoint/checkpoint_lenet-10_1875.ckpt' | |||
| device_target: Ascend | |||
| enable_profiling: False | |||
| data_path_local: '/data/hcm/data/MNIST_Data/' | |||
| ckpt_path_local: '/data/hcm/data/ckpt_lenet/checkpoint_lenet-10_1875.ckpt' | |||
| # ============================================================================== | |||
| # Training options | |||
| num_classes: 10 | |||
| lr: 0.01 | |||
| momentum: 0.9 | |||
| epoch_size: 10 | |||
batch_size: 15 # NOTE(review): reduced from 32 — confirm this smaller batch size is intentional
| buffer_size: 1000 | |||
| image_height: 32 | |||
| image_width: 32 | |||
| save_checkpoint_steps: 1875 | |||
| keep_checkpoint_max: 10 | |||
| air_name: "lenet" | |||
| device_id: 0 | |||
| file_name: "lenet" | |||
| file_format: "AIR" | |||
| model_name: lenet | |||
| learning_rate: 0.002 | |||
| dataset_name: 'mnist' | |||
| sink_size: -1 | |||
| dataset_sink_mode: True | |||
| save_checkpoint: True | |||
| save_checkpoint_epochs: 2 | |||
| --- | |||
| # Config description for each option | |||
| enable_modelarts: 'Whether training on modelarts, default: False' | |||
| data_url: 'Dataset url for obs' | |||
| train_url: 'Training output url for obs' | |||
| data_path: 'Dataset path for local' | |||
| output_path: 'Training output path for local' | |||
| device_target: 'Target device type' | |||
| enable_profiling: 'Whether enable profiling while training, default: False' | |||
| file_name: 'output file name.' | |||
| file_format: 'file format' | |||
| --- | |||
| device_target: ['Ascend', 'GPU', 'CPU'] | |||
| file_format: ['AIR', 'ONNX', 'MINDIR'] | |||
| @@ -19,43 +19,51 @@ python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt | |||
| """ | |||
| import os | |||
| import argparse | |||
| # import sys | |||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||
| from utils.config import config | |||
| from utils.moxing_adapter import moxing_wrapper | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from src.dataset import create_dataset | |||
| from src.config import mnist_cfg as cfg | |||
| from src.lenet import LeNet5 | |||
# The hunk still carried the superseded argparse block next to the new
# config-driven code; this is the merged post-refactor version only.
if os.path.exists(config.data_path_local):
    # Local run: dataset and checkpoint are pre-staged on disk.
    config.data_path = config.data_path_local
    ckpt_path = config.ckpt_path_local
else:
    # ModelArts run: checkpoint lives in the synced /cache data directory.
    ckpt_path = os.path.join(config.data_path, 'checkpoint_lenet-10_1875.ckpt')

def modelarts_process():
    """Placeholder pre-process hook for moxing_wrapper; nothing to do yet."""
    pass

@moxing_wrapper(pre_process=modelarts_process)
def eval_lenet():
    """Evaluate LeNet5 on the MNIST test split and print the accuracy dict."""
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    print("============== Starting Testing ==============")
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(network, param_dict)
    ds_eval = create_dataset(os.path.join(config.data_path, "test"),
                             config.batch_size,
                             1)
    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")
    acc = model.eval(ds_eval)
    print("============== {} ==============".format(acc))

if __name__ == "__main__":
    eval_lenet()
| @@ -14,37 +14,35 @@ | |||
| # ============================================================================ | |||
| """export checkpoint file into air, onnx, mindir models""" | |||
| import argparse | |||
| import numpy as np | |||
| import os | |||
| # import sys | |||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||
| from utils.config import config | |||
| from utils.device_adapter import get_device_id | |||
| import numpy as np | |||
| import mindspore | |||
| from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export | |||
| from src.config import mnist_cfg as cfg | |||
| from src.lenet import LeNet5 | |||
# The hunk still carried the superseded argparse block next to the new
# config-driven code; this is the merged post-refactor version only.
if os.path.exists(config.data_path_local):
    # Local run: use the pre-trained checkpoint path from the config.
    ckpt_file = config.ckpt_path_local
else:
    # ModelArts run: checkpoint lives in the synced /cache data directory.
    ckpt_file = os.path.join(config.data_path, 'checkpoint_lenet-10_1875.ckpt')

context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=get_device_id())

if __name__ == "__main__":
    # define fusion network
    network = LeNet5(config.num_classes)
    # load network checkpoint
    param_dict = load_checkpoint(ckpt_file)
    load_param_into_net(network, param_dict)
    # export network with a dummy all-ones input of the configured shape
    inputs = Tensor(np.ones([config.batch_size, 1, config.image_height, config.image_width]), mindspore.float32)
    export(network, inputs, file_name=config.file_name, file_format=config.file_format)
| @@ -17,6 +17,7 @@ | |||
# A simple tutorial follows; more parameters can be set.
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
# All options (data path, device target, checkpoint) now come from the yaml
# config consumed by eval.py, so no positional arguments are required; the
# stale $DATA_PATH/$CKPT_PATH invocation and commented leftovers are removed.
python -s ${self_path}/../eval.py > log_eval.txt 2>&1 &
| @@ -17,6 +17,7 @@ | |||
# A simple tutorial follows; more parameters can be set.
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
# All options (data path, device target, checkpoint) now come from the yaml
# config consumed by train.py, so no positional arguments are required; the
# stale $DATA_PATH/$CKPT_PATH invocation and commented leftovers are removed.
python -s ${self_path}/../train.py > log.txt 2>&1 &
| @@ -19,8 +19,12 @@ python train.py --data_path /YourDataPath | |||
| """ | |||
| import os | |||
| import argparse | |||
| from src.config import mnist_cfg as cfg | |||
| # import sys | |||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||
| from utils.config import config | |||
| from utils.moxing_adapter import moxing_wrapper | |||
| from utils.device_adapter import get_rank_id | |||
| from src.dataset import create_dataset | |||
| from src.lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| @@ -30,36 +34,40 @@ from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.common import set_seed | |||
# The hunk still carried the superseded argparse block next to the new
# config-driven code; this is the merged post-refactor version only.
set_seed(1)

if os.path.exists(config.data_path_local):
    # Local run: dataset pre-staged; checkpoints under ./checkpoint/<rank>.
    config.data_path = config.data_path_local
    config.checkpoint_path = os.path.join(config.checkpoint_path, str(get_rank_id()))
else:
    # ModelArts run: checkpoints are written into the synced output directory.
    config.checkpoint_path = os.path.join(config.output_path, config.checkpoint_path, str(get_rank_id()))

def modelarts_pre_process():
    """Placeholder pre-process hook for moxing_wrapper; nothing to do yet."""
    pass

@moxing_wrapper(pre_process=modelarts_pre_process)
def train_lenet():
    """Train LeNet5 on MNIST with checkpointing and loss/time monitoring."""
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    ds_train = create_dataset(os.path.join(config.data_path, "train"), config.batch_size)
    if ds_train.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")
    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=config.checkpoint_path, config=config_ck)
    # Ascend uses mixed precision (O2); other targets train in full precision.
    if config.device_target != "Ascend":
        model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    else:
        model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2")
    print("============== Starting Training ==============")
    model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()])

if __name__ == "__main__":
    train_lenet()
| @@ -0,0 +1,127 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Parse arguments""" | |||
| import os | |||
| import ast | |||
| import argparse | |||
| from pprint import pprint, pformat | |||
| import yaml | |||
class Config:
    """
    Configuration namespace. Convert dictionary to members.

    Nested dicts become nested Config instances; lists and tuples are
    normalized to lists with dict elements wrapped recursively.
    """
    def __init__(self, cfg_dict):
        for key, value in cfg_dict.items():
            setattr(self, key, self._wrap(value))

    @staticmethod
    def _wrap(value):
        # Recursively convert dict values to Config namespaces.
        if isinstance(value, (list, tuple)):
            return [Config(item) if isinstance(item, dict) else item for item in value]
        return Config(value) if isinstance(value, dict) else value

    def __str__(self):
        return pformat(self.__dict__)

    __repr__ = __str__
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description per option.
        choices: Allowed values per option.
        cfg_path: Path to the default yaml config.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for name, default in cfg.items():
        # Only scalar options become CLI flags; lists/dicts stay yaml-only.
        if isinstance(default, (list, dict)):
            continue
        description = helper.get(name, "Please reference to {}".format(cfg_path))
        # ast.literal_eval lets "True"/"False" strings parse as real booleans.
        arg_type = ast.literal_eval if isinstance(default, bool) else type(default)
        parser.add_argument("--" + name, type=arg_type, default=default,
                            choices=choices.get(name), help=description)
    return parser.parse_args()
def parse_yaml(yaml_path):
    """
    Parse the yaml config file (1 to 3 YAML documents).

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple (cfg, cfg_helper, cfg_choices); missing helper/choices docs
        default to empty dicts.

    Raises:
        ValueError: if the file is not valid YAML, or it holds an
            unsupported number of documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Bug fix: the original bare `except:` also caught the function's
            # own doc-count ValueError and replaced its message; now only
            # YAML syntax errors are converted, and the cause is chained.
            raise ValueError("Failed to parse yaml") from err
    if not 1 <= len(cfgs) <= 3:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    cfg = cfgs[0]
    cfg_helper = cfgs[1] if len(cfgs) > 1 else {}
    cfg_choices = cfgs[2] if len(cfgs) > 2 else {}
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Args:
        args: Command line arguments (argparse.Namespace); these win.
        cfg: Base configuration dict, updated in place.

    Returns:
        The updated cfg dict.
    """
    cfg.update(vars(args))
    return cfg
def get_config():
    """
    Get Config according to the yaml file and cli arguments.

    Reads --config_path (default: ../default_config.yaml relative to this
    file), parses up to three yaml documents (defaults, help text, choices),
    applies command line overrides on top, and returns a Config namespace.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"),
                        help="Config file path")
    # parse_known_args: only --config_path is consumed here; the remaining
    # flags are registered and parsed by parse_cli_to_yaml below.
    path_args, _ = parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    pprint(default)
    args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)
| @@ -0,0 +1,27 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
"""Device adapter for ModelArts"""
from .config import config

# Select the adapter implementation at import time: when running on
# ModelArts the topology comes from the moxing adapter, otherwise it is
# read from local environment variables. Both modules expose the same
# four functions, so callers import from this module unconditionally.
if config.enable_modelarts:
    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
| @@ -0,0 +1,36 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Local adapter""" | |||
| import os | |||
def get_device_id():
    """Return the local device index from the DEVICE_ID env var (default 0)."""
    return int(os.getenv('DEVICE_ID', '0'))
def get_device_num():
    """Return the total device count from the RANK_SIZE env var (default 1)."""
    return int(os.getenv('RANK_SIZE', '1'))
def get_rank_id():
    """Return the global rank from the RANK_ID env var (default 0)."""
    return int(os.getenv('RANK_ID', '0'))
def get_job_id():
    """Return a fixed job-id string for non-ModelArts (local) runs."""
    return "Local Job"
| @@ -0,0 +1,122 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Moxing adapter for ModelArts""" | |||
| import os | |||
| import functools | |||
| from mindspore import context | |||
| from mindspore.profiler import Profiler | |||
| from .config import config | |||
| _global_sync_count = 0 | |||
def get_device_id():
    """Return the device index from the DEVICE_ID env var (default 0)."""
    return int(os.getenv('DEVICE_ID', '0'))
def get_device_num():
    """Return the total device count from the RANK_SIZE env var (default 1)."""
    return int(os.getenv('RANK_SIZE', '1'))
def get_rank_id():
    """Return the global rank from the RANK_ID env var (default 0)."""
    return int(os.getenv('RANK_ID', '0'))
def get_job_id():
    """Return the ModelArts job id from the JOB_ID env var, or "default".

    Bug fix: `os.getenv('JOB_ID')` returns None when the variable is unset,
    and the original `job_id != ""` check let that None leak through to the
    caller. Treat both unset and empty as "default".
    """
    job_id = os.getenv('JOB_ID')
    return job_id if job_id else "default"
def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.
    """
    # moxing is only available inside ModelArts jobs, hence the local import.
    import moxing as mox
    import time
    global _global_sync_count
    # One lock-file name per sync call; every process on the server computes
    # the same name because the counter advances identically in each.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices as most.
    # Exactly one device per server performs the copy; the rest fall through
    # to the wait loop below until the lock file appears.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            # The flag may already exist (e.g. created by a racing process);
            # best-effort creation is sufficient here.
            pass
        print("===save flag===")

    # Poll until the copying process has written the lock file.
    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)
    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional callable run after data download, before run_func.
        post_process: Optional callable run after run_func, before upload
            (ModelArts only).

    Returns:
        A decorator that preserves run_func's metadata and return value.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))
                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)
                if pre_process:
                    pre_process()

            # Bind the profiler once so the later check cannot hit an unbound
            # local if config.enable_profiling is mutated during run_func.
            profiler = Profiler() if config.enable_profiling else None
            # Bug fix: the original discarded run_func's return value.
            result = run_func(*args, **kwargs)
            if profiler is not None:
                profiler.analyse()

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()
                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper