| @@ -0,0 +1,61 @@ | |||||
| # Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) | |||||
| enable_modelarts: False | |||||
| data_url: "" | |||||
| train_url: "" | |||||
| checkpoint_url: "" | |||||
| data_path: "/cache/data" | |||||
| output_path: "/cache/train" | |||||
| load_path: "/cache/checkpoint_path" | |||||
| checkpoint_path: './checkpoint/' | |||||
| checkpoint_file: './checkpoint/checkpoint_alexnet-30_1562.ckpt' | |||||
| device_target: Ascend | |||||
| enable_profiling: False | |||||
| data_path_local: '/data/hcm/data/ImageNet_Original/' | |||||
| ckpt_path_local: '/data/hcm/data/ckpt_alexnet/checkpoint_alexnet-30_1562.ckpt' | |||||
| # ============================================================================== | |||||
| # Training options | |||||
| num_classes: 1000 | |||||
| learning_rate: 0.13 | |||||
| momentum: 0.9 | |||||
| epoch_size: 150 | |||||
| batch_size: 256 | |||||
| buffer_size: None | |||||
| image_height: 224 | |||||
| image_width: 224 | |||||
| save_checkpoint_steps: 625 | |||||
| keep_checkpoint_max: 10 | |||||
| air_name: 'alexnet.air' | |||||
| weight_decay: 0.0001 | |||||
| loss_scale: 1024 | |||||
| is_dynamic_loss_scale: 0 | |||||
| # Model Description | |||||
| model_name: alexnet | |||||
| file_name: 'alexnet' | |||||
| file_format: 'AIR' | |||||
| dataset_name: 'imagenet' | |||||
| sink_size: -1 | |||||
| dataset_sink_mode: True | |||||
| device_id: 0 | |||||
| save_checkpoint: True | |||||
| save_checkpoint_epochs: 2 | |||||
| lr: 0.01 | |||||
| --- | |||||
| # Config description for each option | |||||
| enable_modelarts: 'Whether training on modelarts, default: False' | |||||
| data_url: 'Dataset url for obs' | |||||
| train_url: 'Training output url for obs' | |||||
| data_path: 'Dataset path for local' | |||||
| output_path: 'Training output path for local' | |||||
| device_target: 'Target device type' | |||||
| enable_profiling: 'Whether enable profiling while training, default: False' | |||||
| --- | |||||
| device_target: ['Ascend', 'GPU', 'CPU'] | |||||
| @@ -0,0 +1,56 @@ | |||||
| # Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) | |||||
| enable_modelarts: False | |||||
| data_url: "" | |||||
| train_url: "" | |||||
| checkpoint_url: "" | |||||
| data_path: "/cache/data" | |||||
| output_path: "/cache/train" | |||||
| load_path: "/cache/checkpoint_path" | |||||
| checkpoint_path: './checkpoint/' | |||||
| checkpoint_file: './checkpoint/checkpoint_alexnet-30_1562.ckpt' | |||||
| device_target: Ascend | |||||
| enable_profiling: False | |||||
| data_path_local: '/data/hcm/data/cifar-10-batches-bin/' | |||||
| ckpt_path_local: '/data/hcm/data/ckpt_alexnet/checkpoint_alexnet-30_1562.ckpt' | |||||
| # ============================================================================== | |||||
| # Training options | |||||
| epoch_size: 30 | |||||
| keep_checkpoint_max: 10 | |||||
| num_classes: 10 | |||||
| learning_rate: 0.002 | |||||
| momentum: 0.9 | |||||
| batch_size: 32 | |||||
| buffer_size: 1000 | |||||
| image_height: 227 | |||||
| image_width: 227 | |||||
| save_checkpoint_steps: 1562 | |||||
| air_name: 'alexnet.air' | |||||
| dataset_name: 'cifar10' | |||||
| sink_size: -1 | |||||
| dataset_sink_mode: True | |||||
| device_id: 0 | |||||
| save_checkpoint: True | |||||
| save_checkpoint_epochs: 2 | |||||
| lr: 0.01 | |||||
| # Model Description | |||||
| model_name: alexnet | |||||
| file_name: 'alexnet' | |||||
| file_format: 'AIR' | |||||
| --- | |||||
| # Config description for each option | |||||
| enable_modelarts: 'Whether training on modelarts, default: False' | |||||
| data_url: 'Dataset url for obs' | |||||
| train_url: 'Training output url for obs' | |||||
| data_path: 'Dataset path for local' | |||||
| output_path: 'Training output path for local' | |||||
| device_target: 'Target device type' | |||||
| enable_profiling: 'Whether enable profiling while training, default: False' | |||||
| --- | |||||
| device_target: ['Ascend', 'GPU', 'CPU'] | |||||
| @@ -18,9 +18,13 @@ eval alexnet according to model file: | |||||
| python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt | python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt | ||||
| """ | """ | ||||
| import ast | |||||
| import argparse | |||||
| from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg | |||||
| import os | |||||
| # import sys | |||||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||||
| from utils.config import config | |||||
| from utils.moxing_adapter import moxing_wrapper | |||||
| from utils.device_adapter import get_device_id, get_device_num | |||||
| from src.dataset import create_dataset_cifar10, create_dataset_imagenet | from src.dataset import create_dataset_cifar10, create_dataset_imagenet | ||||
| from src.alexnet import AlexNet | from src.alexnet import AlexNet | ||||
| import mindspore.nn as nn | import mindspore.nn as nn | ||||
| @@ -28,51 +32,52 @@ from mindspore import context | |||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | from mindspore.train.serialization import load_checkpoint, load_param_into_net | ||||
| from mindspore.train import Model | from mindspore.train import Model | ||||
| from mindspore.nn.metrics import Accuracy | from mindspore.nn.metrics import Accuracy | ||||
| from mindspore.communication.management import init | |||||
| if __name__ == "__main__": | |||||
| parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') | |||||
| parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], | |||||
| help='dataset name.') | |||||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], | |||||
| help='device where the code will be implemented (default: Ascend)') | |||||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||||
| parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ | |||||
| path where the trained ckpt file') | |||||
| parser.add_argument('--dataset_sink_mode', type=ast.literal_eval, | |||||
| default=True, help='dataset_sink_mode is False or True') | |||||
| parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend. (Default: 0)') | |||||
| args = parser.parse_args() | |||||
| if os.path.exists(config.data_path_local): | |||||
| config.data_path = config.data_path_local | |||||
| load_path = config.ckpt_path_local | |||||
| else: | |||||
| load_path = os.path.join(config.data_path, 'checkpoint_alexnet-30_1562.ckpt') | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||||
| def modelarts_process(): | |||||
| pass | |||||
| @moxing_wrapper(pre_process=modelarts_process) | |||||
| def eval_alexnet(): | |||||
| print("============== Starting Testing ==============") | print("============== Starting Testing ==============") | ||||
| if args.dataset_name == 'cifar10': | |||||
| cfg = alexnet_cifar10_cfg | |||||
| network = AlexNet(cfg.num_classes, phase='test') | |||||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
| opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) | |||||
| ds_eval = create_dataset_cifar10(args.data_path, cfg.batch_size, status="test", target=args.device_target) | |||||
| device_num = get_device_num() | |||||
| if device_num > 1: | |||||
| # context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target='Davinci', save_graphs=False) | |||||
| if config.device_target == "Ascend": | |||||
| context.set_context(device_id=get_device_id()) | |||||
| init() | |||||
| elif config.device_target == "GPU": | |||||
| init() | |||||
| param_dict = load_checkpoint(args.ckpt_path) | |||||
| print("load checkpoint from [{}].".format(args.ckpt_path)) | |||||
| if config.dataset_name == 'cifar10': | |||||
| network = AlexNet(config.num_classes, phase='test') | |||||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
| opt = nn.Momentum(network.trainable_params(), config.learning_rate, config.momentum) | |||||
| ds_eval = create_dataset_cifar10(config.data_path, config.batch_size, status="test", \ | |||||
| target=config.device_target) | |||||
| param_dict = load_checkpoint(load_path) | |||||
| print("load checkpoint from [{}].".format(load_path)) | |||||
| load_param_into_net(network, param_dict) | load_param_into_net(network, param_dict) | ||||
| network.set_train(False) | network.set_train(False) | ||||
| model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) | model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) | ||||
| elif args.dataset_name == 'imagenet': | |||||
| cfg = alexnet_imagenet_cfg | |||||
| network = AlexNet(cfg.num_classes, phase='test') | |||||
| elif config.dataset_name == 'imagenet': | |||||
| network = AlexNet(config.num_classes, phase='test') | |||||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | ||||
| ds_eval = create_dataset_imagenet(args.data_path, cfg.batch_size, training=False) | |||||
| param_dict = load_checkpoint(args.ckpt_path) | |||||
| print("load checkpoint from [{}].".format(args.ckpt_path)) | |||||
| ds_eval = create_dataset_imagenet(config.data_path, config.batch_size, training=False) | |||||
| param_dict = load_checkpoint(load_path) | |||||
| print("load checkpoint from [{}].".format(load_path)) | |||||
| load_param_into_net(network, param_dict) | load_param_into_net(network, param_dict) | ||||
| network.set_train(False) | network.set_train(False) | ||||
| model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) | model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) | ||||
| else: | else: | ||||
| @@ -81,5 +86,9 @@ if __name__ == "__main__": | |||||
| if ds_eval.get_dataset_size() == 0: | if ds_eval.get_dataset_size() == 0: | ||||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | ||||
| result = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) | |||||
| result = model.eval(ds_eval, dataset_sink_mode=config.dataset_sink_mode) | |||||
| print("result : {}".format(result)) | print("result : {}".format(result)) | ||||
| if __name__ == "__main__": | |||||
| eval_alexnet() | |||||
| @@ -16,44 +16,32 @@ | |||||
| ##############export checkpoint file into air, onnx, mindir models################# | ##############export checkpoint file into air, onnx, mindir models################# | ||||
| python export.py | python export.py | ||||
| """ | """ | ||||
| import argparse | |||||
| import numpy as np | |||||
| import os | |||||
| # import sys | |||||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||||
| from utils.config import config | |||||
| import numpy as np | |||||
| import mindspore as ms | import mindspore as ms | ||||
| from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export | from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export | ||||
| from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg | |||||
| from src.alexnet import AlexNet | from src.alexnet import AlexNet | ||||
| parser = argparse.ArgumentParser(description='Classification') | |||||
| parser.add_argument("--device_id", type=int, default=0, help="Device id") | |||||
| parser.add_argument("--batch_size", type=int, default=1, help="batch size") | |||||
| parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], | |||||
| help='please choose dataset: imagenet or cifar10.') | |||||
| parser.add_argument('--device_target', type=str, default="Ascend", | |||||
| choices=['Ascend', 'GPU', 'CPU'], | |||||
| help='device where the code will be implemented (default: Ascend)') | |||||
| parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.") | |||||
| parser.add_argument("--file_name", type=str, default="alexnet", help="output file name.") | |||||
| parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format") | |||||
| args_opt = parser.parse_args() | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) | |||||
| if args_opt.device_target == "Ascend": | |||||
| context.set_context(device_id=args_opt.device_id) | |||||
| if __name__ == '__main__': | |||||
| if args_opt.dataset_name == 'cifar10': | |||||
| cfg = alexnet_cifar10_cfg | |||||
| elif args_opt.dataset_name == 'imagenet': | |||||
| cfg = alexnet_imagenet_cfg | |||||
| else: | |||||
| raise ValueError("dataset is not support.") | |||||
| if os.path.exists(config.data_path_local): | |||||
| ckpt_path = config.ckpt_path_local | |||||
| else: | |||||
| ckpt_path = os.path.join(config.data_path, 'checkpoint_alexnet-30_1562.ckpt') | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||||
| if config.device_target == "Ascend": | |||||
| context.set_context(device_id=config.device_id) | |||||
| net = AlexNet(num_classes=cfg.num_classes) | |||||
| if __name__ == '__main__': | |||||
| net = AlexNet(num_classes=config.num_classes) | |||||
| param_dict = load_checkpoint(args_opt.ckpt_file) | |||||
| param_dict = load_checkpoint(ckpt_path) | |||||
| load_param_into_net(net, param_dict) | load_param_into_net(net, param_dict) | ||||
| input_arr = Tensor(np.zeros([args_opt.batch_size, 3, cfg.image_height, cfg.image_width]), ms.float32) | |||||
| export(net, input_arr, file_name=args_opt.file_name, file_format=args_opt.file_format) | |||||
| input_arr = Tensor(np.zeros([config.batch_size, 3, config.image_height, config.image_width]), ms.float32) | |||||
| export(net, input_arr, file_name=config.file_name, file_format=config.file_format) | |||||
| @@ -0,0 +1,35 @@ | |||||
| #!/bin/bash | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| # A simple tutorial follows; more parameters can be set. | |||||
| # echo "Usage: sh run_standalone_eval_ascend.sh [cifar10|imagenet] [DATA_PATH] [CKPT_PATH] [DEVICE_ID]" | |||||
| BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) | |||||
| if [ $# -ge 1 ]; then | |||||
| if [ $1 == 'imagenet' ]; then | |||||
| CONFIG_FILE="${BASE_PATH}/../config_imagenet.yaml" | |||||
| elif [ $1 == 'cifar10' ]; then | |||||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||||
| else | |||||
| echo "Unrecognized parameter" | |||||
| exit 1 | |||||
| fi | |||||
| else | |||||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||||
| fi | |||||
| # python eval.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH --ckpt_path=$CKPT_PATH --device_id=$DEVICE_ID --device_target="Ascend" > eval_log 2>&1 & | |||||
| python ../eval.py --config_path=$CONFIG_FILE > eval_log 2>&1 & | |||||
| @@ -0,0 +1,35 @@ | |||||
| #!/bin/bash | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| # A simple tutorial follows; more parameters can be set. | |||||
| # echo "Usage: sh run_standalone_train_ascend.sh [cifar10|imagenet] [DATA_PATH] [DEVICE_ID]" | |||||
| BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) | |||||
| if [ $# -ge 1 ]; then | |||||
| if [ $1 == 'imagenet' ]; then | |||||
| CONFIG_FILE="${BASE_PATH}/../config_imagenet.yaml" | |||||
| elif [ $1 == 'cifar10' ]; then | |||||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||||
| else | |||||
| echo "Unrecognized parameter" | |||||
| exit 1 | |||||
| fi | |||||
| else | |||||
| CONFIG_FILE="${BASE_PATH}/../default_config.yaml" | |||||
| fi | |||||
| # python train.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH --device_id=$DEVICE_ID --device_target="Ascend" > log 2>&1 & | |||||
| python ../train.py --config_path=$CONFIG_FILE > log 2>&1 & | |||||
| @@ -17,7 +17,6 @@ Produce the dataset | |||||
| """ | """ | ||||
| import os | import os | ||||
| import mindspore.dataset as ds | import mindspore.dataset as ds | ||||
| import mindspore.dataset.transforms.c_transforms as C | import mindspore.dataset.transforms.c_transforms as C | ||||
| import mindspore.dataset.vision.c_transforms as CV | import mindspore.dataset.vision.c_transforms as CV | ||||
| @@ -18,10 +18,14 @@ train alexnet and get network model files(.ckpt) : | |||||
| python train.py --data_path /YourDataPath | python train.py --data_path /YourDataPath | ||||
| """ | """ | ||||
| import ast | |||||
| import argparse | |||||
| import os | import os | ||||
| from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg | |||||
| # import sys | |||||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||||
| from utils.config import config | |||||
| from utils.moxing_adapter import moxing_wrapper | |||||
| from utils.device_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||||
| # from src.config import alexnet_cifar10_config, alexnet_imagenet_config | |||||
| from src.dataset import create_dataset_cifar10, create_dataset_imagenet | from src.dataset import create_dataset_cifar10, create_dataset_imagenet | ||||
| from src.generator_lr import get_lr_cifar10, get_lr_imagenet | from src.generator_lr import get_lr_cifar10, get_lr_imagenet | ||||
| from src.alexnet import AlexNet | from src.alexnet import AlexNet | ||||
| @@ -40,88 +44,84 @@ from mindspore.common import set_seed | |||||
| set_seed(1) | set_seed(1) | ||||
| de.config.set_seed(1) | de.config.set_seed(1) | ||||
| if __name__ == "__main__": | |||||
| parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') | |||||
| parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], | |||||
| help='dataset name.') | |||||
| parser.add_argument('--sink_size', type=int, default=-1, help='control the amount of data in each sink') | |||||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], | |||||
| help='device where the code will be implemented (default: Ascend)') | |||||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||||
| parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ | |||||
| path where the trained ckpt file') | |||||
| parser.add_argument('--dataset_sink_mode', type=ast.literal_eval, | |||||
| default=True, help='dataset_sink_mode is False or True') | |||||
| parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend. (Default: 0)') | |||||
| args = parser.parse_args() | |||||
| device_num = int(os.environ.get("DEVICE_NUM", 1)) | |||||
| if args.dataset_name == "cifar10": | |||||
| cfg = alexnet_cifar10_cfg | |||||
| if os.path.exists(config.data_path_local): | |||||
| config.data_path = config.data_path_local | |||||
| config.checkpoint_path = os.path.join(config.checkpoint_path, str(get_rank_id())) | |||||
| else: | |||||
| config.checkpoint_path = os.path.join(config.output_path, config.checkpoint_path, str(get_rank_id())) | |||||
| def modelarts_pre_process(): | |||||
| pass | |||||
| @moxing_wrapper(pre_process=modelarts_pre_process) | |||||
| def train_alexnet(): | |||||
| print(config) | |||||
| print('device id:', get_device_id()) | |||||
| print('device num:', get_device_num()) | |||||
| print('rank id:', get_rank_id()) | |||||
| print('job id:', get_job_id()) | |||||
| device_target = config.device_target | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||||
| context.set_context(save_graphs=False) | |||||
| device_num = get_device_num() | |||||
| if config.dataset_name == "cifar10": | |||||
| if device_num > 1: | if device_num > 1: | ||||
| cfg.learning_rate = cfg.learning_rate * device_num | |||||
| cfg.epoch_size = cfg.epoch_size * 2 | |||||
| elif args.dataset_name == "imagenet": | |||||
| cfg = alexnet_imagenet_cfg | |||||
| config.learning_rate = config.learning_rate * device_num | |||||
| config.epoch_size = config.epoch_size * 2 | |||||
| elif config.dataset_name == "imagenet": | |||||
| pass | |||||
| else: | else: | ||||
| raise ValueError("Unsupported dataset.") | raise ValueError("Unsupported dataset.") | ||||
| device_target = args.device_target | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||||
| context.set_context(save_graphs=False) | |||||
| if device_target == "Ascend": | |||||
| context.set_context(device_id=args.device_id) | |||||
| if device_num > 1: | |||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||||
| gradients_mean=True) | |||||
| if device_num > 1: | |||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(device_num=device_num, \ | |||||
| parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) | |||||
| if device_target == "Ascend": | |||||
| context.set_context(device_id=get_device_id()) | |||||
| init() | init() | ||||
| elif device_target == "GPU": | |||||
| if device_num > 1: | |||||
| elif device_target == "GPU": | |||||
| init() | init() | ||||
| context.reset_auto_parallel_context() | |||||
| context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||||
| gradients_mean=True) | |||||
| else: | else: | ||||
| raise ValueError("Unsupported platform.") | |||||
| context.set_context(device_id=get_device_id()) | |||||
| if args.dataset_name == "cifar10": | |||||
| ds_train = create_dataset_cifar10(args.data_path, cfg.batch_size, target=args.device_target) | |||||
| elif args.dataset_name == "imagenet": | |||||
| ds_train = create_dataset_imagenet(args.data_path, cfg.batch_size) | |||||
| if config.dataset_name == "cifar10": | |||||
| ds_train = create_dataset_cifar10(config.data_path, config.batch_size, target=config.device_target) | |||||
| elif config.dataset_name == "imagenet": | |||||
| ds_train = create_dataset_imagenet(config.data_path, config.batch_size) | |||||
| else: | else: | ||||
| raise ValueError("Unsupported dataset.") | raise ValueError("Unsupported dataset.") | ||||
| if ds_train.get_dataset_size() == 0: | if ds_train.get_dataset_size() == 0: | ||||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | ||||
| network = AlexNet(cfg.num_classes, phase='train') | |||||
| network = AlexNet(config.num_classes, phase='train') | |||||
| loss_scale_manager = None | loss_scale_manager = None | ||||
| metrics = None | metrics = None | ||||
| step_per_epoch = ds_train.get_dataset_size() if args.sink_size == -1 else args.sink_size | |||||
| if args.dataset_name == 'cifar10': | |||||
| step_per_epoch = ds_train.get_dataset_size() if config.sink_size == -1 else config.sink_size | |||||
| if config.dataset_name == 'cifar10': | |||||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | ||||
| lr = Tensor(get_lr_cifar10(0, cfg.learning_rate, cfg.epoch_size, step_per_epoch)) | |||||
| opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum) | |||||
| lr = Tensor(get_lr_cifar10(0, config.learning_rate, config.epoch_size, step_per_epoch)) | |||||
| opt = nn.Momentum(network.trainable_params(), lr, config.momentum) | |||||
| metrics = {"Accuracy": Accuracy()} | metrics = {"Accuracy": Accuracy()} | ||||
| elif args.dataset_name == 'imagenet': | |||||
| elif config.dataset_name == 'imagenet': | |||||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | ||||
| lr = Tensor(get_lr_imagenet(cfg.learning_rate, cfg.epoch_size, step_per_epoch)) | |||||
| lr = Tensor(get_lr_imagenet(config.learning_rate, config.epoch_size, step_per_epoch)) | |||||
| opt = nn.Momentum(params=get_param_groups(network), | opt = nn.Momentum(params=get_param_groups(network), | ||||
| learning_rate=lr, | learning_rate=lr, | ||||
| momentum=cfg.momentum, | |||||
| weight_decay=cfg.weight_decay, | |||||
| loss_scale=cfg.loss_scale) | |||||
| momentum=config.momentum, | |||||
| weight_decay=config.weight_decay, | |||||
| loss_scale=config.loss_scale) | |||||
| from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager | from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager | ||||
| if cfg.is_dynamic_loss_scale == 1: | |||||
| if config.is_dynamic_loss_scale == 1: | |||||
| loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000) | loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000) | ||||
| else: | else: | ||||
| loss_scale_manager = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False) | |||||
| loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) | |||||
| else: | else: | ||||
| raise ValueError("Unsupported dataset.") | raise ValueError("Unsupported dataset.") | ||||
| @@ -135,15 +135,18 @@ if __name__ == "__main__": | |||||
| raise ValueError("Unsupported platform.") | raise ValueError("Unsupported platform.") | ||||
| if device_num > 1: | if device_num > 1: | ||||
| ckpt_save_dir = os.path.join(args.ckpt_path + "_" + str(get_rank())) | |||||
| ckpt_save_dir = os.path.join(config.checkpoint_path + "_" + str(get_rank())) | |||||
| else: | else: | ||||
| ckpt_save_dir = args.ckpt_path | |||||
| ckpt_save_dir = config.checkpoint_path | |||||
| time_cb = TimeMonitor(data_size=step_per_epoch) | time_cb = TimeMonitor(data_size=step_per_epoch) | ||||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
| config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps, | |||||
| keep_checkpoint_max=config.keep_checkpoint_max) | |||||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", directory=ckpt_save_dir, config=config_ck) | ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", directory=ckpt_save_dir, config=config_ck) | ||||
| print("============== Starting Training ==============") | print("============== Starting Training ==============") | ||||
| model.train(cfg.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], | |||||
| dataset_sink_mode=args.dataset_sink_mode, sink_size=args.sink_size) | |||||
| model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], | |||||
| dataset_sink_mode=config.dataset_sink_mode, sink_size=config.sink_size) | |||||
| if __name__ == "__main__": | |||||
| train_alexnet() | |||||
| @@ -0,0 +1,127 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Parse arguments""" | |||||
| import os | |||||
| import ast | |||||
| import argparse | |||||
| from pprint import pprint, pformat | |||||
| import yaml | |||||
class Config:
    """
    Configuration namespace that exposes dictionary entries as attributes.

    Nested dictionaries become nested ``Config`` objects; dictionaries
    found inside lists/tuples are converted element-wise, all other
    values are stored unchanged.
    """

    def __init__(self, cfg_dict):
        for key, value in cfg_dict.items():
            if isinstance(value, (list, tuple)):
                converted = [Config(item) if isinstance(item, dict) else item
                             for item in value]
                setattr(self, key, converted)
            else:
                setattr(self, key, Config(value) if isinstance(value, dict) else value)

    def __str__(self):
        # pretty-print the attribute dict for readable config dumps
        return pformat(self.__dict__)

    def __repr__(self):
        return str(self)
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Every scalar entry of *cfg* becomes a ``--<name>`` flag whose default
    is the yaml value; list/dict entries are not exposed on the command line.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description per option (optional).
        choices: Allowed values per option (optional).
        cfg_path: Path to the default yaml config.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    if helper is None:
        helper = {}
    if choices is None:
        choices = {}
    for name in cfg:
        value = cfg[name]
        if isinstance(value, (list, dict)):
            # nested structures cannot be edited from the command line
            continue
        help_description = helper.get(name, "Please reference to {}".format(cfg_path))
        choice = choices.get(name)
        # booleans must go through literal_eval so "--flag False" works;
        # everything else reuses the yaml value's own type as the converter
        arg_type = ast.literal_eval if isinstance(value, bool) else type(value)
        parser.add_argument("--" + name, type=arg_type, default=value,
                            choices=choice, help=help_description)
    return parser.parse_args()
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may contain up to three ``---``-separated documents: the
    base configuration, per-option help text, and per-option choices.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper, cfg_choices)``; missing documents
        default to empty dicts.

    Raises:
        ValueError: If the yaml cannot be parsed, or if it contains more
            than three documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Narrow except with explicit chaining: the original bare
            # `except:` swallowed the parser diagnostics (and would even
            # trap KeyboardInterrupt).
            raise ValueError("Failed to parse yaml") from err
    cfg_helper, cfg_choices = {}, {}
    # The document-count check lives outside the try so its own
    # ValueError is not masked by the generic "Failed to parse yaml".
    if len(cfgs) == 1:
        cfg = cfgs[0]
    elif len(cfgs) == 2:
        cfg, cfg_helper = cfgs
    elif len(cfgs) == 3:
        cfg, cfg_helper, cfg_choices = cfgs
    else:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    CLI values take precedence over the yaml defaults.

    Args:
        args: Command line arguments (argparse.Namespace).
        cfg: Base configuration dict; updated in place and returned.
    """
    for key, value in vars(args).items():
        cfg[key] = value
    return cfg
def get_config():
    """
    Get Config according to the yaml file and cli arguments.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    default_yaml = os.path.join(current_dir, "../default_config.yaml")
    parser.add_argument("--config_path", type=str, default=default_yaml,
                        help="Config file path")
    # Only --config_path is known at this point; the rest of the CLI is
    # generated from the yaml content below.
    path_args, _ = parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    pprint(default)
    cli_args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper,
                                 choices=choices, cfg_path=path_args.config_path)
    return Config(merge(cli_args, default))
config = get_config()
| @@ -0,0 +1,27 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Device adapter for ModelArts""" | |||||
| from .config import config | |||||
| if config.enable_modelarts: | |||||
| from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||||
| else: | |||||
| from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||||
| __all__ = [ | |||||
| "get_device_id", "get_device_num", "get_rank_id", "get_job_id" | |||||
| ] | |||||
| @@ -0,0 +1,36 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Local adapter""" | |||||
| import os | |||||
def get_device_id():
    """Return the local device id from the DEVICE_ID env var (default 0)."""
    return int(os.environ.get('DEVICE_ID', '0'))
def get_device_num():
    """Return the number of devices from the RANK_SIZE env var (default 1)."""
    return int(os.environ.get('RANK_SIZE', '1'))
def get_rank_id():
    """Return the global rank id from the RANK_ID env var (default 0)."""
    return int(os.environ.get('RANK_ID', '0'))
def get_job_id():
    """Job id placeholder for local (non-ModelArts) runs."""
    return "Local Job"
| @@ -0,0 +1,122 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Moxing adapter for ModelArts""" | |||||
| import os | |||||
| import functools | |||||
| from mindspore import context | |||||
| from mindspore.profiler import Profiler | |||||
| from .config import config | |||||
| _global_sync_count = 0 | |||||
def get_device_id():
    """Return the device id from the DEVICE_ID env var (default 0)."""
    return int(os.environ.get('DEVICE_ID', '0'))
def get_device_num():
    """Return the number of devices from the RANK_SIZE env var (default 1)."""
    return int(os.environ.get('RANK_SIZE', '1'))
def get_rank_id():
    """Return the global rank id from the RANK_ID env var (default 0)."""
    return int(os.environ.get('RANK_ID', '0'))
def get_job_id():
    """Return the ModelArts job id from JOB_ID, or "default" when unset/empty.

    Bug fix: ``os.getenv('JOB_ID')`` returns ``None`` when the variable is
    unset; the previous ``job_id != ""`` check let that ``None`` through, so
    the function returned ``None`` instead of "default".
    """
    job_id = os.getenv('JOB_ID')
    return job_id if job_id else "default"
def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.

    NOTE(review): only one process per server (device_id % min(device_num, 8)
    == 0) performs the copy; every process then blocks until the lock file
    appears. The lock file in /tmp is never removed, so a rerun in the same
    container skips the copy -- presumably intentional; confirm.
    """
    import moxing as mox  # deferred import: moxing only exists on ModelArts
    import time
    global _global_sync_count
    # Unique lock file per sync call so successive syncs do not alias.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1
    # Each server contains 8 devices as most.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            # Create the flag file; ignore failure if another process won.
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")
    # All processes (the copier included) wait for the lock-file flag.
    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)
    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional callable run after inputs are downloaded,
            before the wrapped function executes.
        post_process: Optional callable run after the wrapped function,
            before outputs are uploaded.

    Returns:
        A decorator. Bug fix: the wrapped function now propagates the
        return value of ``run_func`` (previously it was discarded and the
        wrapper always returned ``None``).
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))
                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)
                if pre_process:
                    pre_process()
            if config.enable_profiling:
                profiler = Profiler()
            ret = run_func(*args, **kwargs)  # keep the result (bug fix)
            if config.enable_profiling:
                profiler.analyse()
            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()
                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return ret
        return wrapped_func
    return wrapper
| @@ -0,0 +1,56 @@ | |||||
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
| enable_modelarts: False | |||||
| data_url: "" | |||||
| train_url: "" | |||||
| checkpoint_url: "" | |||||
| data_path: "/cache/data" | |||||
| output_path: "/cache/train" | |||||
| load_path: "/cache/checkpoint_path" | |||||
| checkpoint_path: './checkpoint/' | |||||
| checkpoint_file: './checkpoint/checkpoint_lenet-10_1875.ckpt' | |||||
| device_target: Ascend | |||||
| enable_profiling: False | |||||
| data_path_local: '/data/hcm/data/MNIST_Data/' | |||||
| ckpt_path_local: '/data/hcm/data/ckpt_lenet/checkpoint_lenet-10_1875.ckpt' | |||||
| # ============================================================================== | |||||
| # Training options | |||||
| num_classes: 10 | |||||
| lr: 0.01 | |||||
| momentum: 0.9 | |||||
| epoch_size: 10 | |||||
| batch_size: 15 # 32 | |||||
| buffer_size: 1000 | |||||
| image_height: 32 | |||||
| image_width: 32 | |||||
| save_checkpoint_steps: 1875 | |||||
| keep_checkpoint_max: 10 | |||||
| air_name: "lenet" | |||||
| device_id: 0 | |||||
| file_name: "lenet" | |||||
| file_format: "AIR" | |||||
| model_name: lenet | |||||
| learning_rate: 0.002 | |||||
| dataset_name: 'mnist' | |||||
| sink_size: -1 | |||||
| dataset_sink_mode: True | |||||
| save_checkpoint: True | |||||
| save_checkpoint_epochs: 2 | |||||
| --- | |||||
| # Config description for each option | |||||
| enable_modelarts: 'Whether training on modelarts, default: False' | |||||
| data_url: 'Dataset url for obs' | |||||
| train_url: 'Training output url for obs' | |||||
| data_path: 'Dataset path for local' | |||||
| output_path: 'Training output path for local' | |||||
| device_target: 'Target device type' | |||||
| enable_profiling: 'Whether enable profiling while training, default: False' | |||||
| file_name: 'output file name.' | |||||
| file_format: 'file format' | |||||
| --- | |||||
| device_target: ['Ascend', 'GPU', 'CPU'] | |||||
| file_format: ['AIR', 'ONNX', 'MINDIR'] | |||||
| @@ -19,43 +19,51 @@ python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt | |||||
| """ | """ | ||||
| import os | import os | ||||
| import argparse | |||||
| # import sys | |||||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||||
| from utils.config import config | |||||
| from utils.moxing_adapter import moxing_wrapper | |||||
| import mindspore.nn as nn | import mindspore.nn as nn | ||||
| from mindspore import context | from mindspore import context | ||||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | from mindspore.train.serialization import load_checkpoint, load_param_into_net | ||||
| from mindspore.train import Model | from mindspore.train import Model | ||||
| from mindspore.nn.metrics import Accuracy | from mindspore.nn.metrics import Accuracy | ||||
| from src.dataset import create_dataset | from src.dataset import create_dataset | ||||
| from src.config import mnist_cfg as cfg | |||||
| from src.lenet import LeNet5 | from src.lenet import LeNet5 | ||||
| if __name__ == "__main__": | |||||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], | |||||
| help='device where the code will be implemented (default: Ascend)') | |||||
| parser.add_argument('--data_path', type=str, default="./Data", | |||||
| help='path where the dataset is saved') | |||||
| parser.add_argument('--ckpt_path', type=str, default="", help='if mode is test, must provide\ | |||||
| path where the trained ckpt file') | |||||
| if os.path.exists(config.data_path_local): | |||||
| config.data_path = config.data_path_local | |||||
| ckpt_path = config.ckpt_path_local | |||||
| else: | |||||
| ckpt_path = os.path.join(config.data_path, 'checkpoint_lenet-10_1875.ckpt') | |||||
| def modelarts_process(): | |||||
| pass | |||||
| args = parser.parse_args() | |||||
| @moxing_wrapper(pre_process=modelarts_process) | |||||
| def eval_lenet(): | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||||
| network = LeNet5(cfg.num_classes) | |||||
| network = LeNet5(config.num_classes) | |||||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | ||||
| repeat_size = cfg.epoch_size | |||||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
| # repeat_size = config.epoch_size | |||||
| net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum) | |||||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | ||||
| print("============== Starting Testing ==============") | print("============== Starting Testing ==============") | ||||
| param_dict = load_checkpoint(args.ckpt_path) | |||||
| param_dict = load_checkpoint(ckpt_path) | |||||
| load_param_into_net(network, param_dict) | load_param_into_net(network, param_dict) | ||||
| ds_eval = create_dataset(os.path.join(args.data_path, "test"), | |||||
| cfg.batch_size, | |||||
| ds_eval = create_dataset(os.path.join(config.data_path, "test"), | |||||
| config.batch_size, | |||||
| 1) | 1) | ||||
| if ds_eval.get_dataset_size() == 0: | if ds_eval.get_dataset_size() == 0: | ||||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | ||||
| acc = model.eval(ds_eval) | acc = model.eval(ds_eval) | ||||
| print("============== {} ==============".format(acc)) | print("============== {} ==============".format(acc)) | ||||
| if __name__ == "__main__": | |||||
| eval_lenet() | |||||
| @@ -14,37 +14,35 @@ | |||||
| # ============================================================================ | # ============================================================================ | ||||
| """export checkpoint file into air, onnx, mindir models""" | """export checkpoint file into air, onnx, mindir models""" | ||||
| import argparse | |||||
| import numpy as np | |||||
| import os | |||||
| # import sys | |||||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||||
| from utils.config import config | |||||
| from utils.device_adapter import get_device_id | |||||
| import numpy as np | |||||
| import mindspore | import mindspore | ||||
| from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export | from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export | ||||
| from src.config import mnist_cfg as cfg | |||||
| from src.lenet import LeNet5 | from src.lenet import LeNet5 | ||||
| parser = argparse.ArgumentParser(description='MindSpore MNIST Example') | |||||
| parser.add_argument("--device_id", type=int, default=0, help="Device id") | |||||
| parser.add_argument("--batch_size", type=int, default=1, help="batch size") | |||||
| parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.") | |||||
| parser.add_argument("--file_name", type=str, default="lenet", help="output file name.") | |||||
| parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format") | |||||
| parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU", "CPU"], default="Ascend", | |||||
| help="device target") | |||||
| args = parser.parse_args() | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||||
| if args.device_target == "Ascend": | |||||
| context.set_context(device_id=args.device_id) | |||||
| if os.path.exists(config.data_path_local): | |||||
| ckpt_file = config.ckpt_path_local | |||||
| else: | |||||
| ckpt_file = os.path.join(config.data_path, 'checkpoint_lenet-10_1875.ckpt') | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||||
| if config.device_target == "Ascend": | |||||
| context.set_context(device_id=get_device_id()) | |||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| # define fusion network | # define fusion network | ||||
| network = LeNet5(cfg.num_classes) | |||||
| network = LeNet5(config.num_classes) | |||||
| # load network checkpoint | # load network checkpoint | ||||
| param_dict = load_checkpoint(args.ckpt_file) | |||||
| param_dict = load_checkpoint(ckpt_file) | |||||
| load_param_into_net(network, param_dict) | load_param_into_net(network, param_dict) | ||||
| # export network | # export network | ||||
| inputs = Tensor(np.ones([args.batch_size, 1, cfg.image_height, cfg.image_width]), mindspore.float32) | |||||
| export(network, inputs, file_name=args.file_name, file_format=args.file_format) | |||||
| inputs = Tensor(np.ones([config.batch_size, 1, config.image_height, config.image_width]), mindspore.float32) | |||||
| export(network, inputs, file_name=config.file_name, file_format=config.file_format) | |||||
| @@ -17,6 +17,7 @@ | |||||
| # an simple tutorial as follows, more parameters can be setting | # an simple tutorial as follows, more parameters can be setting | ||||
| script_self=$(readlink -f "$0") | script_self=$(readlink -f "$0") | ||||
| self_path=$(dirname "${script_self}") | self_path=$(dirname "${script_self}") | ||||
| DATA_PATH=$1 | |||||
| CKPT_PATH=$2 | |||||
| python -s ${self_path}/../eval.py --data_path=$DATA_PATH --device_target="Ascend" --ckpt_path=$CKPT_PATH > log.txt 2>&1 & | |||||
| # DATA_PATH=$1 | |||||
| # CKPT_PATH=$2 | |||||
| # --data_path=$DATA_PATH --device_target="Ascend" --ckpt_path=$CKPT_PATH | |||||
| python -s ${self_path}/../eval.py > log_eval.txt 2>&1 & | |||||
| @@ -17,6 +17,7 @@ | |||||
| # an simple tutorial as follows, more parameters can be setting | # an simple tutorial as follows, more parameters can be setting | ||||
| script_self=$(readlink -f "$0") | script_self=$(readlink -f "$0") | ||||
| self_path=$(dirname "${script_self}") | self_path=$(dirname "${script_self}") | ||||
| DATA_PATH=$1 | |||||
| CKPT_PATH=$2 | |||||
| python -s ${self_path}/../train.py --data_path=$DATA_PATH --device_target="Ascend" --ckpt_path=$CKPT_PATH > log.txt 2>&1 & | |||||
| # DATA_PATH=$1 | |||||
| # CKPT_PATH=$2 | |||||
| # --data_path=$DATA_PATH --device_target="Ascend" --ckpt_path=$CKPT_PATH | |||||
| python -s ${self_path}/../train.py > log.txt 2>&1 & | |||||
| @@ -19,8 +19,12 @@ python train.py --data_path /YourDataPath | |||||
| """ | """ | ||||
| import os | import os | ||||
| import argparse | |||||
| from src.config import mnist_cfg as cfg | |||||
| # import sys | |||||
| # sys.path.append(os.path.join(os.getcwd(), 'utils')) | |||||
| from utils.config import config | |||||
| from utils.moxing_adapter import moxing_wrapper | |||||
| from utils.device_adapter import get_rank_id | |||||
| from src.dataset import create_dataset | from src.dataset import create_dataset | ||||
| from src.lenet import LeNet5 | from src.lenet import LeNet5 | ||||
| import mindspore.nn as nn | import mindspore.nn as nn | ||||
| @@ -30,36 +34,40 @@ from mindspore.train import Model | |||||
| from mindspore.nn.metrics import Accuracy | from mindspore.nn.metrics import Accuracy | ||||
| from mindspore.common import set_seed | from mindspore.common import set_seed | ||||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], | |||||
| help='device where the code will be implemented (default: Ascend)') | |||||
| parser.add_argument('--data_path', type=str, default="./Data", | |||||
| help='path where the dataset is saved') | |||||
| parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ | |||||
| path where the trained ckpt file') | |||||
| args = parser.parse_args() | |||||
| set_seed(1) | set_seed(1) | ||||
| if os.path.exists(config.data_path_local): | |||||
| config.data_path = config.data_path_local | |||||
| config.checkpoint_path = os.path.join(config.checkpoint_path, str(get_rank_id())) | |||||
| else: | |||||
| config.checkpoint_path = os.path.join(config.output_path, config.checkpoint_path, str(get_rank_id())) | |||||
| if __name__ == "__main__": | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||||
| ds_train = create_dataset(os.path.join(args.data_path, "train"), cfg.batch_size) | |||||
| def modelarts_pre_process(): | |||||
| pass | |||||
| @moxing_wrapper(pre_process=modelarts_pre_process) | |||||
| def train_lenet(): | |||||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||||
| ds_train = create_dataset(os.path.join(config.data_path, "train"), config.batch_size) | |||||
| if ds_train.get_dataset_size() == 0: | if ds_train.get_dataset_size() == 0: | ||||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | ||||
| network = LeNet5(cfg.num_classes) | |||||
| network = LeNet5(config.num_classes) | |||||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | ||||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
| net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum) | |||||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | ||||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=args.ckpt_path, config=config_ck) | |||||
| config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps, | |||||
| keep_checkpoint_max=config.keep_checkpoint_max) | |||||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=config.checkpoint_path, config=config_ck) | |||||
| if args.device_target != "Ascend": | |||||
| if config.device_target != "Ascend": | |||||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | ||||
| else: | else: | ||||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2") | model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2") | ||||
| print("============== Starting Training ==============") | print("============== Starting Training ==============") | ||||
| model.train(cfg['epoch_size'], ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()]) | |||||
| model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()]) | |||||
| if __name__ == "__main__": | |||||
| train_lenet() | |||||
| @@ -0,0 +1,127 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Parse arguments""" | |||||
| import os | |||||
| import ast | |||||
| import argparse | |||||
| from pprint import pprint, pformat | |||||
| import yaml | |||||
class Config:
    """
    Configuration namespace. Convert dictionary to members.
    """
    def __init__(self, cfg_dict):
        for key, value in cfg_dict.items():
            if isinstance(value, (list, tuple)):
                # Sequences become lists, with nested dicts wrapped too.
                converted = [Config(item) if isinstance(item, dict) else item
                             for item in value]
            elif isinstance(value, dict):
                converted = Config(value)
            else:
                converted = value
            setattr(self, key, converted)

    def __str__(self):
        """Pretty-printed view of all config attributes."""
        return pformat(self.__dict__)

    def __repr__(self):
        """Reuse the pretty-printed ``__str__`` form."""
        return self.__str__()
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description per option.
        choices: Allowed values per option.
        cfg_path: Path to the default yaml config.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = helper if helper is not None else {}
    choices = choices if choices is not None else {}
    for key, value in cfg.items():
        # Lists and dicts cannot be expressed as simple CLI flags; skip them.
        if isinstance(value, (list, dict)):
            continue
        description = helper.get(key, "Please reference to {}".format(cfg_path))
        # Booleans go through ast.literal_eval so "--flag False" works.
        flag_type = ast.literal_eval if isinstance(value, bool) else type(value)
        parser.add_argument("--" + key, type=flag_type, default=value,
                            choices=choices.get(key), help=description)
    return parser.parse_args()
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may contain 1-3 YAML documents: config, help descriptions,
    and value choices; missing documents default to {}.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper, cfg_choices)``.

    Raises:
        ValueError: If the yaml cannot be parsed, or holds more than 3 docs.
    """
    with open(yaml_path, 'r') as fin:
        try:
            # Narrowed from a bare `except:` so the "too many docs" error
            # below is no longer masked; chain the cause so the original
            # parser error is preserved.
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            raise ValueError("Failed to parse yaml") from err
    if len(cfgs) == 1:
        cfg, cfg_helper, cfg_choices = cfgs[0], {}, {}
    elif len(cfgs) == 2:
        cfg, cfg_helper = cfgs
        cfg_choices = {}
    elif len(cfgs) == 3:
        cfg, cfg_helper, cfg_choices = cfgs
    else:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    CLI values take precedence over the yaml defaults.

    Args:
        args: Command line arguments (argparse.Namespace).
        cfg: Base configuration dict; updated in place and returned.
    """
    for key, value in vars(args).items():
        cfg[key] = value
    return cfg
def get_config():
    """
    Get Config according to the yaml file and cli arguments.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    default_yaml = os.path.join(current_dir, "../default_config.yaml")
    parser.add_argument("--config_path", type=str, default=default_yaml,
                        help="Config file path")
    # Only --config_path is known at this point; the remaining CLI flags
    # are generated from the yaml content below.
    path_args, _ = parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    pprint(default)
    cli_args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper,
                                 choices=choices, cfg_path=path_args.config_path)
    return Config(merge(cli_args, default))
config = get_config()
| @@ -0,0 +1,27 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Device adapter for ModelArts""" | |||||
| from .config import config | |||||
| if config.enable_modelarts: | |||||
| from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||||
| else: | |||||
| from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||||
| __all__ = [ | |||||
| "get_device_id", "get_device_num", "get_rank_id", "get_job_id" | |||||
| ] | |||||
| @@ -0,0 +1,36 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Local adapter""" | |||||
| import os | |||||
def get_device_id():
    """Return the local device id from the DEVICE_ID env var (default 0)."""
    return int(os.environ.get('DEVICE_ID', '0'))
def get_device_num():
    """Return the number of devices from the RANK_SIZE env var (default 1)."""
    return int(os.environ.get('RANK_SIZE', '1'))
def get_rank_id():
    """Return the global rank id from the RANK_ID env var (default 0)."""
    return int(os.environ.get('RANK_ID', '0'))
def get_job_id():
    """Job id placeholder for local (non-ModelArts) runs."""
    return "Local Job"
| @@ -0,0 +1,122 @@ | |||||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Moxing adapter for ModelArts""" | |||||
| import os | |||||
| import functools | |||||
| from mindspore import context | |||||
| from mindspore.profiler import Profiler | |||||
| from .config import config | |||||
| _global_sync_count = 0 | |||||
def get_device_id():
    """Return this process's device index, read from DEVICE_ID (default 0)."""
    raw = os.getenv('DEVICE_ID', '0')
    return int(raw)
def get_device_num():
    """Return the world size (number of devices), read from RANK_SIZE (default 1)."""
    return int(os.getenv('RANK_SIZE', '1'))
def get_rank_id():
    """Return this process's global rank, read from RANK_ID (default 0)."""
    rank = os.getenv('RANK_ID', '0')
    return int(rank)
def get_job_id():
    """Return the ModelArts job id from the JOB_ID env var, or "default".

    Bug fix: the original compared the result of ``os.getenv('JOB_ID')``
    against ``""`` only. When JOB_ID was not set at all, ``os.getenv``
    returned ``None``, and ``None != ""`` is True, so the function returned
    ``None`` instead of the intended ``"default"``. A plain truthiness check
    handles both the unset and the empty-string cases.
    """
    job_id = os.getenv('JOB_ID')
    return job_id if job_id else "default"
def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.
    """
    import time
    import moxing as mox
    global _global_sync_count
    # One lock file per sync call; the counter keeps successive calls distinct.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices as most. Only the first device on each
    # server performs the copy, then drops the lock file so peers can proceed.
    if not os.path.exists(sync_lock) and get_device_id() % min(get_device_num(), 8) == 0:
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            # Lock may already exist (e.g. created by a racing process).
            pass
        print("===save flag===")

    # All processes (including the copier) block until the lock file appears.
    while not os.path.exists(sync_lock):
        time.sleep(1)
    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    When ``config.enable_modelarts`` is set, the wrapped function runs with
    dataset/checkpoint/workspace synced down from OBS beforehand and the
    output directory synced back up afterwards.

    Args:
        pre_process:  Optional callable invoked after downloads, before run_func.
        post_process: Optional callable invoked after run_func, before upload.

    Returns:
        A decorator for the training entry point.

    Fixes over the original:
        - run_func's return value was discarded (the wrapper always returned
          None); it is now propagated to the caller.
        - ``profiler`` is bound unconditionally so ``profiler.analyse()`` can
          never hit an unbound local if config.enable_profiling is mutated
          while run_func executes.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))
                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)
                if pre_process:
                    pre_process()

            profiler = Profiler() if config.enable_profiling else None
            result = run_func(*args, **kwargs)
            if profiler is not None:
                profiler.analyse()

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()
                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            # Propagate the wrapped function's result (was dropped before).
            return result
        return wrapped_func
    return wrapper