From 5efb0ae94037612b6c8fda936fbbafd5ee422d26 Mon Sep 17 00:00:00 2001 From: wwx691809 Date: Tue, 27 Apr 2021 19:55:14 +0800 Subject: [PATCH] cloud --- .../official/cv/alexnet/config_imagenet.yaml | 61 ++++++++ .../official/cv/alexnet/default_config.yaml | 56 ++++++++ model_zoo/official/cv/alexnet/eval.py | 79 ++++++----- model_zoo/official/cv/alexnet/export.py | 50 +++---- .../scripts/run_eval_standalone_ascend.sh | 35 +++++ .../scripts/run_train_standalone_ascend.sh | 35 +++++ model_zoo/official/cv/alexnet/src/dataset.py | 1 - model_zoo/official/cv/alexnet/train.py | 131 +++++++++--------- .../official/cv/alexnet/utils/__init__.py | 0 model_zoo/official/cv/alexnet/utils/config.py | 127 +++++++++++++++++ .../cv/alexnet/utils/device_adapter.py | 27 ++++ .../cv/alexnet/utils/local_adapter.py | 36 +++++ .../cv/alexnet/utils/moxing_adapter.py | 122 ++++++++++++++++ .../official/cv/lenet/default_config.yaml | 56 ++++++++ model_zoo/official/cv/lenet/eval.py | 44 +++--- model_zoo/official/cv/lenet/export.py | 38 +++-- .../scripts/run_standalone_eval_ascend.sh | 7 +- .../scripts/run_standalone_train_ascend.sh | 7 +- model_zoo/official/cv/lenet/train.py | 50 ++++--- model_zoo/official/cv/lenet/utils/__init__.py | 0 model_zoo/official/cv/lenet/utils/config.py | 127 +++++++++++++++++ .../official/cv/lenet/utils/device_adapter.py | 27 ++++ .../official/cv/lenet/utils/local_adapter.py | 36 +++++ .../official/cv/lenet/utils/moxing_adapter.py | 122 ++++++++++++++++ 24 files changed, 1078 insertions(+), 196 deletions(-) create mode 100644 model_zoo/official/cv/alexnet/config_imagenet.yaml create mode 100644 model_zoo/official/cv/alexnet/default_config.yaml create mode 100644 model_zoo/official/cv/alexnet/scripts/run_eval_standalone_ascend.sh create mode 100644 model_zoo/official/cv/alexnet/scripts/run_train_standalone_ascend.sh create mode 100644 model_zoo/official/cv/alexnet/utils/__init__.py create mode 100644 model_zoo/official/cv/alexnet/utils/config.py create mode 100644 
model_zoo/official/cv/alexnet/utils/device_adapter.py create mode 100644 model_zoo/official/cv/alexnet/utils/local_adapter.py create mode 100644 model_zoo/official/cv/alexnet/utils/moxing_adapter.py create mode 100644 model_zoo/official/cv/lenet/default_config.yaml create mode 100644 model_zoo/official/cv/lenet/utils/__init__.py create mode 100644 model_zoo/official/cv/lenet/utils/config.py create mode 100644 model_zoo/official/cv/lenet/utils/device_adapter.py create mode 100644 model_zoo/official/cv/lenet/utils/local_adapter.py create mode 100644 model_zoo/official/cv/lenet/utils/moxing_adapter.py diff --git a/model_zoo/official/cv/alexnet/config_imagenet.yaml b/model_zoo/official/cv/alexnet/config_imagenet.yaml new file mode 100644 index 0000000000..f8bd8940c7 --- /dev/null +++ b/model_zoo/official/cv/alexnet/config_imagenet.yaml @@ -0,0 +1,61 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +data_url: "" +train_url: "" +checkpoint_url: "" +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +checkpoint_path: './checkpoint/' +checkpoint_file: './checkpoint/checkpoint_alexnet-30_1562.ckpt' +device_target: Ascend +enable_profiling: False + +data_path_local: '/data/hcm/data/ImageNet_Original/' +ckpt_path_local: '/data/hcm/data/ckpt_alexnet/checkpoint_alexnet-30_1562.ckpt' + +# ============================================================================== +# Training options +num_classes: 1000 +learning_rate: 0.13 +momentum: 0.9 +epoch_size: 150 +batch_size: 256 +buffer_size: None +image_height: 224 +image_width: 224 +save_checkpoint_steps: 625 +keep_checkpoint_max: 10 +air_name: 'alexnet.air' + +weight_decay: 0.0001 +loss_scale: 1024 +is_dynamic_loss_scale: 0 + +# Model Description +model_name: alexnet +file_name: 'alexnet' +file_format: 'AIR' + +dataset_name: 'imagenet' +sink_size: -1 +dataset_sink_mode: True +device_id: 0 +save_checkpoint: 
True +save_checkpoint_epochs: 2 +lr: 0.01 + + +--- +# Config description for each option +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +train_url: 'Training output url for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' + +device_target: 'Target device type' +enable_profiling: 'Whether enable profiling while training, default: False' + +--- +device_target: ['Ascend', 'GPU', 'CPU'] \ No newline at end of file diff --git a/model_zoo/official/cv/alexnet/default_config.yaml b/model_zoo/official/cv/alexnet/default_config.yaml new file mode 100644 index 0000000000..bdac871cc6 --- /dev/null +++ b/model_zoo/official/cv/alexnet/default_config.yaml @@ -0,0 +1,56 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +data_url: "" +train_url: "" +checkpoint_url: "" +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +checkpoint_path: './checkpoint/' +checkpoint_file: './checkpoint/checkpoint_alexnet-30_1562.ckpt' +device_target: Ascend +enable_profiling: False + +data_path_local: '/data/hcm/data/cifar-10-batches-bin/' +ckpt_path_local: '/data/hcm/data/ckpt_alexnet/checkpoint_alexnet-30_1562.ckpt' +# ============================================================================== +# Training options +epoch_size: 30 +keep_checkpoint_max: 10 +num_classes: 10 +learning_rate: 0.002 +momentum: 0.9 +batch_size: 32 +buffer_size: 1000 +image_height: 227 +image_width: 227 +save_checkpoint_steps: 1562 +air_name: 'alexnet.air' + +dataset_name: 'cifar10' +sink_size: -1 +dataset_sink_mode: True +device_id: 0 +save_checkpoint: True +save_checkpoint_epochs: 2 +lr: 0.01 + +# Model Description +model_name: alexnet +file_name: 'alexnet' +file_format: 'AIR' + + +--- +# Config description for each option +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset 
url for obs' +train_url: 'Training output url for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' + +device_target: 'Target device type' +enable_profiling: 'Whether enable profiling while training, default: False' + +--- +device_target: ['Ascend', 'GPU', 'CPU'] \ No newline at end of file diff --git a/model_zoo/official/cv/alexnet/eval.py b/model_zoo/official/cv/alexnet/eval.py index 2cb7dfcbf4..6e2236e5fb 100644 --- a/model_zoo/official/cv/alexnet/eval.py +++ b/model_zoo/official/cv/alexnet/eval.py @@ -18,9 +18,13 @@ eval alexnet according to model file: python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt """ -import ast -import argparse -from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg +import os +# import sys +# sys.path.append(os.path.join(os.getcwd(), 'utils')) +from utils.config import config +from utils.moxing_adapter import moxing_wrapper +from utils.device_adapter import get_device_id, get_device_num + from src.dataset import create_dataset_cifar10, create_dataset_imagenet from src.alexnet import AlexNet import mindspore.nn as nn @@ -28,51 +32,52 @@ from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train import Model from mindspore.nn.metrics import Accuracy +from mindspore.communication.management import init -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') - parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], - help='dataset name.') - parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], - help='device where the code will be implemented (default: Ascend)') - parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') - parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ - path where the trained 
ckpt file') - parser.add_argument('--dataset_sink_mode', type=ast.literal_eval, - default=True, help='dataset_sink_mode is False or True') - parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend. (Default: 0)') - args = parser.parse_args() +if os.path.exists(config.data_path_local): + config.data_path = config.data_path_local + load_path = config.ckpt_path_local +else: + load_path = os.path.join(config.data_path, 'checkpoint_alexnet-30_1562.ckpt') - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) +def modelarts_process(): + pass +@moxing_wrapper(pre_process=modelarts_process) +def eval_alexnet(): print("============== Starting Testing ==============") - if args.dataset_name == 'cifar10': - cfg = alexnet_cifar10_cfg - network = AlexNet(cfg.num_classes, phase='test') - loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) - ds_eval = create_dataset_cifar10(args.data_path, cfg.batch_size, status="test", target=args.device_target) + device_num = get_device_num() + if device_num > 1: + # context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + context.set_context(mode=context.GRAPH_MODE, device_target='Davinci', save_graphs=False) + if config.device_target == "Ascend": + context.set_context(device_id=get_device_id()) + init() + elif config.device_target == "GPU": + init() - param_dict = load_checkpoint(args.ckpt_path) - print("load checkpoint from [{}].".format(args.ckpt_path)) + if config.dataset_name == 'cifar10': + network = AlexNet(config.num_classes, phase='test') + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + opt = nn.Momentum(network.trainable_params(), config.learning_rate, config.momentum) + ds_eval = create_dataset_cifar10(config.data_path, config.batch_size, status="test", \ + target=config.device_target) + param_dict = 
load_checkpoint(load_path) + print("load checkpoint from [{}].".format(load_path)) load_param_into_net(network, param_dict) network.set_train(False) - model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()}) - elif args.dataset_name == 'imagenet': - cfg = alexnet_imagenet_cfg - network = AlexNet(cfg.num_classes, phase='test') + elif config.dataset_name == 'imagenet': + network = AlexNet(config.num_classes, phase='test') loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - ds_eval = create_dataset_imagenet(args.data_path, cfg.batch_size, training=False) - - param_dict = load_checkpoint(args.ckpt_path) - print("load checkpoint from [{}].".format(args.ckpt_path)) + ds_eval = create_dataset_imagenet(config.data_path, config.batch_size, training=False) + param_dict = load_checkpoint(load_path) + print("load checkpoint from [{}].".format(load_path)) load_param_into_net(network, param_dict) network.set_train(False) - model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) else: @@ -81,5 +86,9 @@ if __name__ == "__main__": if ds_eval.get_dataset_size() == 0: raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") - result = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) + result = model.eval(ds_eval, dataset_sink_mode=config.dataset_sink_mode) print("result : {}".format(result)) + + +if __name__ == "__main__": + eval_alexnet() diff --git a/model_zoo/official/cv/alexnet/export.py b/model_zoo/official/cv/alexnet/export.py index 3ad95a944d..3da3d56685 100644 --- a/model_zoo/official/cv/alexnet/export.py +++ b/model_zoo/official/cv/alexnet/export.py @@ -16,44 +16,32 @@ ##############export checkpoint file into air, onnx, mindir models################# python export.py """ -import argparse -import numpy as np +import os +# import sys +# sys.path.append(os.path.join(os.getcwd(), 'utils')) +from utils.config import config + +import numpy as np import mindspore as ms from mindspore 
import context, Tensor, load_checkpoint, load_param_into_net, export - -from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg from src.alexnet import AlexNet -parser = argparse.ArgumentParser(description='Classification') -parser.add_argument("--device_id", type=int, default=0, help="Device id") -parser.add_argument("--batch_size", type=int, default=1, help="batch size") -parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], - help='please choose dataset: imagenet or cifar10.') -parser.add_argument('--device_target', type=str, default="Ascend", - choices=['Ascend', 'GPU', 'CPU'], - help='device where the code will be implemented (default: Ascend)') -parser.add_argument("--ckpt_file", type=str, required=True, help="Checkpoint file path.") -parser.add_argument("--file_name", type=str, default="alexnet", help="output file name.") -parser.add_argument("--file_format", type=str, choices=["AIR", "ONNX", "MINDIR"], default="AIR", help="file format") -args_opt = parser.parse_args() - -context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) -if args_opt.device_target == "Ascend": - context.set_context(device_id=args_opt.device_id) -if __name__ == '__main__': - if args_opt.dataset_name == 'cifar10': - cfg = alexnet_cifar10_cfg - elif args_opt.dataset_name == 'imagenet': - cfg = alexnet_imagenet_cfg - else: - raise ValueError("dataset is not support.") +if os.path.exists(config.data_path_local): + ckpt_path = config.ckpt_path_local +else: + ckpt_path = os.path.join(config.data_path, 'checkpoint_alexnet-30_1562.ckpt') + +context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) +if config.device_target == "Ascend": + context.set_context(device_id=config.device_id) - net = AlexNet(num_classes=cfg.num_classes) +if __name__ == '__main__': + net = AlexNet(num_classes=config.num_classes) - param_dict = load_checkpoint(args_opt.ckpt_file) + param_dict = 
load_checkpoint(ckpt_path) load_param_into_net(net, param_dict) - input_arr = Tensor(np.zeros([args_opt.batch_size, 3, cfg.image_height, cfg.image_width]), ms.float32) - export(net, input_arr, file_name=args_opt.file_name, file_format=args_opt.file_format) + input_arr = Tensor(np.zeros([config.batch_size, 3, config.image_height, config.image_width]), ms.float32) + export(net, input_arr, file_name=config.file_name, file_format=config.file_format) diff --git a/model_zoo/official/cv/alexnet/scripts/run_eval_standalone_ascend.sh b/model_zoo/official/cv/alexnet/scripts/run_eval_standalone_ascend.sh new file mode 100644 index 0000000000..09918e7149 --- /dev/null +++ b/model_zoo/official/cv/alexnet/scripts/run_eval_standalone_ascend.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# A simple tutorial follows; more parameters can be set in the yaml config +# echo "Usage: sh run_eval_standalone_ascend.sh [cifar10|imagenet]" + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) + +if [ $# -ge 1 ]; then + if [ "$1" = 'imagenet' ]; then + CONFIG_FILE="${BASE_PATH}/../config_imagenet.yaml" + elif [ "$1" = 'cifar10' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + else + echo "Unrecognized parameter" + exit 1 + fi +else + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" +fi + +# python eval.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH --ckpt_path=$CKPT_PATH --device_id=$DEVICE_ID --device_target="Ascend" > eval_log 2>&1 & +python "${BASE_PATH}/../eval.py" --config_path="$CONFIG_FILE" > eval_log 2>&1 & \ No newline at end of file diff --git a/model_zoo/official/cv/alexnet/scripts/run_train_standalone_ascend.sh b/model_zoo/official/cv/alexnet/scripts/run_train_standalone_ascend.sh new file mode 100644 index 0000000000..7d5e750cff --- /dev/null +++ b/model_zoo/official/cv/alexnet/scripts/run_train_standalone_ascend.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================ +# A simple tutorial follows; more parameters can be set in the yaml config +# echo "Usage: sh run_train_standalone_ascend.sh [cifar10|imagenet]" + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) + +if [ $# -ge 1 ]; then + if [ "$1" = 'imagenet' ]; then + CONFIG_FILE="${BASE_PATH}/../config_imagenet.yaml" + elif [ "$1" = 'cifar10' ]; then + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + else + echo "Unrecognized parameter" + exit 1 + fi +else + CONFIG_FILE="${BASE_PATH}/../default_config.yaml" +fi + +# python train.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH --device_id=$DEVICE_ID --device_target="Ascend" > log 2>&1 & +python "${BASE_PATH}/../train.py" --config_path="$CONFIG_FILE" > log 2>&1 & \ No newline at end of file diff --git a/model_zoo/official/cv/alexnet/src/dataset.py b/model_zoo/official/cv/alexnet/src/dataset.py index d0a1135acb..857f6c9141 100644 --- a/model_zoo/official/cv/alexnet/src/dataset.py +++ b/model_zoo/official/cv/alexnet/src/dataset.py @@ -17,7 +17,6 @@ Produce the dataset """ import os - import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.vision.c_transforms as CV diff --git a/model_zoo/official/cv/alexnet/train.py b/model_zoo/official/cv/alexnet/train.py index d546ff2cfa..1500bc5841 100644 --- a/model_zoo/official/cv/alexnet/train.py +++ b/model_zoo/official/cv/alexnet/train.py @@ -18,10 +18,14 @@ train alexnet and get network model files(.ckpt) : python train.py --data_path /YourDataPath """ -import ast -import argparse import os -from src.config import alexnet_cifar10_cfg, alexnet_imagenet_cfg +# import sys +# sys.path.append(os.path.join(os.getcwd(), 'utils')) +from utils.config import config +from utils.moxing_adapter import moxing_wrapper +from utils.device_adapter import get_device_id, get_device_num, get_rank_id, get_job_id + +# from src.config import alexnet_cifar10_config,
alexnet_imagenet_config from src.dataset import create_dataset_cifar10, create_dataset_imagenet from src.generator_lr import get_lr_cifar10, get_lr_imagenet from src.alexnet import AlexNet @@ -40,88 +44,84 @@ from mindspore.common import set_seed set_seed(1) de.config.set_seed(1) -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') - parser.add_argument('--dataset_name', type=str, default='cifar10', choices=['imagenet', 'cifar10'], - help='dataset name.') - parser.add_argument('--sink_size', type=int, default=-1, help='control the amount of data in each sink') - parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], - help='device where the code will be implemented (default: Ascend)') - parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') - parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ - path where the trained ckpt file') - parser.add_argument('--dataset_sink_mode', type=ast.literal_eval, - default=True, help='dataset_sink_mode is False or True') - parser.add_argument('--device_id', type=int, default=0, help='device id of GPU or Ascend. 
(Default: 0)') - args = parser.parse_args() - - device_num = int(os.environ.get("DEVICE_NUM", 1)) - if args.dataset_name == "cifar10": - cfg = alexnet_cifar10_cfg +if os.path.exists(config.data_path_local): + config.data_path = config.data_path_local + config.checkpoint_path = os.path.join(config.checkpoint_path, str(get_rank_id())) +else: + config.checkpoint_path = os.path.join(config.output_path, config.checkpoint_path, str(get_rank_id())) + +def modelarts_pre_process(): + pass + +@moxing_wrapper(pre_process=modelarts_pre_process) +def train_alexnet(): + print(config) + print('device id:', get_device_id()) + print('device num:', get_device_num()) + print('rank id:', get_rank_id()) + print('job id:', get_job_id()) + + device_target = config.device_target + context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) + context.set_context(save_graphs=False) + + device_num = get_device_num() + if config.dataset_name == "cifar10": if device_num > 1: - cfg.learning_rate = cfg.learning_rate * device_num - cfg.epoch_size = cfg.epoch_size * 2 - elif args.dataset_name == "imagenet": - cfg = alexnet_imagenet_cfg + config.learning_rate = config.learning_rate * device_num + config.epoch_size = config.epoch_size * 2 + elif config.dataset_name == "imagenet": + pass else: raise ValueError("Unsupported dataset.") - device_target = args.device_target - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - context.set_context(save_graphs=False) - - if device_target == "Ascend": - context.set_context(device_id=args.device_id) - - if device_num > 1: - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) + if device_num > 1: + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num=device_num, \ + parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) + if device_target == "Ascend": + 
context.set_context(device_id=get_device_id()) init() - elif device_target == "GPU": - if device_num > 1: + elif device_target == "GPU": init() - context.reset_auto_parallel_context() - context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True) else: - raise ValueError("Unsupported platform.") + context.set_context(device_id=get_device_id()) - if args.dataset_name == "cifar10": - ds_train = create_dataset_cifar10(args.data_path, cfg.batch_size, target=args.device_target) - elif args.dataset_name == "imagenet": - ds_train = create_dataset_imagenet(args.data_path, cfg.batch_size) + if config.dataset_name == "cifar10": + ds_train = create_dataset_cifar10(config.data_path, config.batch_size, target=config.device_target) + elif config.dataset_name == "imagenet": + ds_train = create_dataset_imagenet(config.data_path, config.batch_size) else: raise ValueError("Unsupported dataset.") if ds_train.get_dataset_size() == 0: raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") - network = AlexNet(cfg.num_classes, phase='train') + network = AlexNet(config.num_classes, phase='train') loss_scale_manager = None metrics = None - step_per_epoch = ds_train.get_dataset_size() if args.sink_size == -1 else args.sink_size - if args.dataset_name == 'cifar10': + step_per_epoch = ds_train.get_dataset_size() if config.sink_size == -1 else config.sink_size + if config.dataset_name == 'cifar10': loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - lr = Tensor(get_lr_cifar10(0, cfg.learning_rate, cfg.epoch_size, step_per_epoch)) - opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum) + lr = Tensor(get_lr_cifar10(0, config.learning_rate, config.epoch_size, step_per_epoch)) + opt = nn.Momentum(network.trainable_params(), lr, config.momentum) metrics = {"Accuracy": Accuracy()} - elif args.dataset_name == 'imagenet': + elif config.dataset_name == 'imagenet': loss = 
nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - lr = Tensor(get_lr_imagenet(cfg.learning_rate, cfg.epoch_size, step_per_epoch)) + lr = Tensor(get_lr_imagenet(config.learning_rate, config.epoch_size, step_per_epoch)) opt = nn.Momentum(params=get_param_groups(network), learning_rate=lr, - momentum=cfg.momentum, - weight_decay=cfg.weight_decay, - loss_scale=cfg.loss_scale) + momentum=config.momentum, + weight_decay=config.weight_decay, + loss_scale=config.loss_scale) from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager - if cfg.is_dynamic_loss_scale == 1: + if config.is_dynamic_loss_scale == 1: loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000) else: - loss_scale_manager = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False) + loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) else: raise ValueError("Unsupported dataset.") @@ -135,15 +135,18 @@ if __name__ == "__main__": raise ValueError("Unsupported platform.") if device_num > 1: - ckpt_save_dir = os.path.join(args.ckpt_path + "_" + str(get_rank())) + ckpt_save_dir = os.path.join(config.checkpoint_path + "_" + str(get_rank())) else: - ckpt_save_dir = args.ckpt_path + ckpt_save_dir = config.checkpoint_path time_cb = TimeMonitor(data_size=step_per_epoch) - config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) + config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps, + keep_checkpoint_max=config.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", directory=ckpt_save_dir, config=config_ck) print("============== Starting Training ==============") - model.train(cfg.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], - dataset_sink_mode=args.dataset_sink_mode, sink_size=args.sink_size) + model.train(config.epoch_size, 
ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], + dataset_sink_mode=config.dataset_sink_mode, sink_size=config.sink_size) + +if __name__ == "__main__": + train_alexnet() diff --git a/model_zoo/official/cv/alexnet/utils/__init__.py b/model_zoo/official/cv/alexnet/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model_zoo/official/cv/alexnet/utils/config.py b/model_zoo/official/cv/alexnet/utils/config.py new file mode 100644 index 0000000000..2c191e9f74 --- /dev/null +++ b/model_zoo/official/cv/alexnet/utils/config.py @@ -0,0 +1,127 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Parse arguments""" + +import os +import ast +import argparse +from pprint import pprint, pformat +import yaml + +class Config: + """ + Configuration namespace. Convert dictionary to members. + """ + def __init__(self, cfg_dict): + for k, v in cfg_dict.items(): + if isinstance(v, (list, tuple)): + setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) + else: + setattr(self, k, Config(v) if isinstance(v, dict) else v) + + def __str__(self): + return pformat(self.__dict__) + + def __repr__(self): + return self.__str__() + + +def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"): + """ + Parse command line arguments to the configuration according to the default yaml. 
+ + Args: + parser: Parent parser. + cfg: Base configuration. + helper: Helper description. + cfg_path: Path to the default yaml config. + """ + parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]", + parents=[parser]) + helper = {} if helper is None else helper + choices = {} if choices is None else choices + for item in cfg: + if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): + help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path) + choice = choices[item] if item in choices else None + if isinstance(cfg[item], bool): + parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice, + help=help_description) + else: + parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, + help=help_description) + args = parser.parse_args() + return args + + +def parse_yaml(yaml_path): + """ + Parse the yaml config file. + + Args: + yaml_path: Path to the yaml config. + """ + with open(yaml_path, 'r') as fin: + try: + cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) + cfgs = [x for x in cfgs] + if len(cfgs) == 1: + cfg_helper = {} + cfg = cfgs[0] + cfg_choices = {} + elif len(cfgs) == 2: + cfg, cfg_helper = cfgs + cfg_choices = {} + elif len(cfgs) == 3: + cfg, cfg_helper, cfg_choices = cfgs + else: + raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml") + print(cfg_helper) + except yaml.YAMLError as err: + raise ValueError("Failed to parse yaml") from err + return cfg, cfg_helper, cfg_choices + + +def merge(args, cfg): + """ + Merge the base config from yaml file and command line arguments. + + Args: + args: Command line arguments. + cfg: Base configuration. + """ + args_var = vars(args) + for item in args_var: + cfg[item] = args_var[item] + return cfg + + +def get_config(): + """ + Get Config according to the yaml file and cli arguments.
+ """ + parser = argparse.ArgumentParser(description="default name", add_help=False) + current_dir = os.path.dirname(os.path.abspath(__file__)) + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"), + help="Config file path") + path_args, _ = parser.parse_known_args() + default, helper, choices = parse_yaml(path_args.config_path) + pprint(default) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) + return Config(final_config) + +config = get_config() diff --git a/model_zoo/official/cv/alexnet/utils/device_adapter.py b/model_zoo/official/cv/alexnet/utils/device_adapter.py new file mode 100644 index 0000000000..7c5d7f837d --- /dev/null +++ b/model_zoo/official/cv/alexnet/utils/device_adapter.py @@ -0,0 +1,27 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Device adapter for ModelArts""" + +from .config import config + +if config.enable_modelarts: + from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id +else: + from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id + +__all__ = [ + "get_device_id", "get_device_num", "get_rank_id", "get_job_id" +] diff --git a/model_zoo/official/cv/alexnet/utils/local_adapter.py b/model_zoo/official/cv/alexnet/utils/local_adapter.py new file mode 100644 index 0000000000..769fa6dc78 --- /dev/null +++ b/model_zoo/official/cv/alexnet/utils/local_adapter.py @@ -0,0 +1,36 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def get_device_id():
    """Return the integer value of ``DEVICE_ID`` from the environment (0 when unset)."""
    return int(os.environ.get('DEVICE_ID', '0'))


def get_device_num():
    """Return the integer value of ``RANK_SIZE`` from the environment (1 when unset)."""
    return int(os.environ.get('RANK_SIZE', '1'))


def get_rank_id():
    """Return the integer value of ``RANK_ID`` from the environment (0 when unset)."""
    return int(os.environ.get('RANK_ID', '0'))


def get_job_id():
    """Return the fixed job identifier used for local runs."""
    return "Local Job"
# Number of sync_data() calls made so far in this process; gives every call its own lock file.
_global_sync_count = 0


def get_device_id():
    """Return the integer value of ``DEVICE_ID`` from the environment (0 when unset)."""
    return int(os.getenv('DEVICE_ID', '0'))


def get_device_num():
    """Return the integer value of ``RANK_SIZE`` from the environment (1 when unset)."""
    return int(os.getenv('RANK_SIZE', '1'))


def get_rank_id():
    """Return the integer value of ``RANK_ID`` from the environment (0 when unset)."""
    return int(os.getenv('RANK_ID', '0'))


def get_job_id():
    """Return ``JOB_ID`` from the environment, or ``"default"`` when it is unset or empty."""
    # os.getenv() yields None for an unset variable; comparing only against ""
    # would hand that None back to the caller.
    return os.getenv('JOB_ID') or "default"


def sync_data(from_path, to_path):
    """
    Copy ``from_path`` to ``to_path`` with moxing and block until the copy is flagged as done.

    Download data from remote obs to local directory if the first url is remote url and the
    second one is local path. Upload data from local directory to remote obs in contrast.

    Only the device for which ``device_id % min(device_num, 8) == 0`` performs the copy; it then
    creates a flag file under ``/tmp``. Every caller, including the copying one, returns once
    that flag file exists.

    Args:
        from_path: Source path or url passed to ``mox.file.copy_parallel``.
        to_path: Destination path or url passed to ``mox.file.copy_parallel``.

    Raises:
        OSError: If the flag file cannot be created after the copy.
    """
    # moxing is imported lazily so that this module can be imported where it is not installed.
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        # Append mode creates the flag if missing and tolerates one that already exists.
        # A genuine failure is raised: swallowing it would leave every device polling forever.
        with open(sync_lock, "a"):
            pass
        print("===save flag===")

    # Poll until the copying device has published the flag file.
    while not os.path.exists(sync_lock):
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))


def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional zero-argument callable. Called after the downloads and before the
            wrapped function, only when ``config.enable_modelarts`` is truthy.
        post_process: Optional zero-argument callable. Called after the wrapped function and
            before the upload, only when ``config.enable_modelarts`` is truthy.

    Returns:
        A decorator. The decorated function keeps its signature and returns whatever the
        wrapped function returns.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                # exist_ok avoids a race between several devices creating the same directory.
                os.makedirs(config.output_path, exist_ok=True)

                if pre_process:
                    pre_process()

            profiler = Profiler() if config.enable_profiling else None

            # Keep the result so that decorating a function does not silently drop its return value.
            result = run_func(*args, **kwargs)

            if profiler is not None:
                profiler.analyse()

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper
'/data/hcm/data/ckpt_lenet/checkpoint_lenet-10_1875.ckpt' +# ============================================================================== +# Training options +num_classes: 10 +lr: 0.01 +momentum: 0.9 +epoch_size: 10 +batch_size: 15 # 32 +buffer_size: 1000 +image_height: 32 +image_width: 32 +save_checkpoint_steps: 1875 +keep_checkpoint_max: 10 +air_name: "lenet" +device_id: 0 +file_name: "lenet" +file_format: "AIR" + +model_name: lenet +learning_rate: 0.002 +dataset_name: 'mnist' +sink_size: -1 +dataset_sink_mode: True +save_checkpoint: True +save_checkpoint_epochs: 2 + +--- +# Config description for each option +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +train_url: 'Training output url for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' + +device_target: 'Target device type' +enable_profiling: 'Whether enable profiling while training, default: False' +file_name: 'output file name.' +file_format: 'file format' + +--- +device_target: ['Ascend', 'GPU', 'CPU'] +file_format: ['AIR', 'ONNX', 'MINDIR'] diff --git a/model_zoo/official/cv/lenet/eval.py b/model_zoo/official/cv/lenet/eval.py index 7091e360a5..11b6dd876c 100644 --- a/model_zoo/official/cv/lenet/eval.py +++ b/model_zoo/official/cv/lenet/eval.py @@ -19,43 +19,51 @@ python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt """ import os -import argparse +# import sys +# sys.path.append(os.path.join(os.getcwd(), 'utils')) +from utils.config import config +from utils.moxing_adapter import moxing_wrapper + import mindspore.nn as nn from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train import Model from mindspore.nn.metrics import Accuracy from src.dataset import create_dataset -from src.config import mnist_cfg as cfg from src.lenet import LeNet5 -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='MindSpore Lenet 
# Choose the dataset directory and checkpoint file at import time: when the local
# data directory from the config exists, use the local paths; otherwise expect the
# checkpoint inside config.data_path under a fixed file name.
if os.path.exists(config.data_path_local):
    config.data_path = config.data_path_local
    ckpt_path = config.ckpt_path_local
else:
    ckpt_path = os.path.join(config.data_path, 'checkpoint_lenet-10_1875.ckpt')

def modelarts_process():
    """Hook passed to moxing_wrapper as pre_process; intentionally does nothing."""
    pass

@moxing_wrapper(pre_process=modelarts_process)
def eval_lenet():
    """
    Evaluate LeNet5 on the "test" sub-directory of config.data_path and print the result.

    Loads the weights from the module-level ``ckpt_path`` into a freshly built network,
    runs ``Model.eval`` with the Accuracy metric and prints what it returns.

    Raises:
        ValueError: If the evaluation dataset reports a size of 0.
    """

    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)

    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(network, param_dict)
    # NOTE(review): the third positional argument is presumably the repeat size - confirm
    # against src.dataset.create_dataset.
    ds_eval = create_dataset(os.path.join(config.data_path, "test"),
                             config.batch_size,
                             1)
    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    acc = model.eval(ds_eval)
    print("============== {} ==============".format(acc))


if __name__ == "__main__":
    eval_lenet()
# Pick the checkpoint to export: the local checkpoint from the config when the local
# data directory exists, otherwise a fixed file name inside config.data_path.
if os.path.exists(config.data_path_local):
    ckpt_file = config.ckpt_path_local
else:
    ckpt_file = os.path.join(config.data_path, 'checkpoint_lenet-10_1875.ckpt')

# The execution context is configured at import time, before the network is built.
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    # The device id is only set for the Ascend target.
    context.set_context(device_id=get_device_id())

if __name__ == "__main__":

    # define fusion network
    network = LeNet5(config.num_classes)
    # load network checkpoint
    param_dict = load_checkpoint(ckpt_file)
    load_param_into_net(network, param_dict)

    # export network
    # An all-ones tensor of shape [batch_size, 1, image_height, image_width] is passed to export().
    inputs = Tensor(np.ones([config.batch_size, 1, config.image_height, config.image_width]), mindspore.float32)
    export(network, inputs, file_name=config.file_name, file_format=config.file_format)
--device_target="Ascend" --ckpt_path=$CKPT_PATH > log.txt 2>&1 & \ No newline at end of file +# DATA_PATH=$1 +# CKPT_PATH=$2 +# --data_path=$DATA_PATH --device_target="Ascend" --ckpt_path=$CKPT_PATH +python -s ${self_path}/../train.py > log.txt 2>&1 & \ No newline at end of file diff --git a/model_zoo/official/cv/lenet/train.py b/model_zoo/official/cv/lenet/train.py index 980b5e26b9..2d5be9a447 100644 --- a/model_zoo/official/cv/lenet/train.py +++ b/model_zoo/official/cv/lenet/train.py @@ -19,8 +19,12 @@ python train.py --data_path /YourDataPath """ import os -import argparse -from src.config import mnist_cfg as cfg +# import sys +# sys.path.append(os.path.join(os.getcwd(), 'utils')) +from utils.config import config +from utils.moxing_adapter import moxing_wrapper +from utils.device_adapter import get_rank_id + from src.dataset import create_dataset from src.lenet import LeNet5 import mindspore.nn as nn @@ -30,36 +34,40 @@ from mindspore.train import Model from mindspore.nn.metrics import Accuracy from mindspore.common import set_seed - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') -parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], - help='device where the code will be implemented (default: Ascend)') -parser.add_argument('--data_path', type=str, default="./Data", - help='path where the dataset is saved') -parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ - path where the trained ckpt file') -args = parser.parse_args() set_seed(1) +if os.path.exists(config.data_path_local): + config.data_path = config.data_path_local + config.checkpoint_path = os.path.join(config.checkpoint_path, str(get_rank_id())) +else: + config.checkpoint_path = os.path.join(config.output_path, config.checkpoint_path, str(get_rank_id())) -if __name__ == "__main__": - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - ds_train = 
def modelarts_pre_process():
    """Hook passed to moxing_wrapper as pre_process; intentionally does nothing."""
    pass

@moxing_wrapper(pre_process=modelarts_pre_process)
def train_lenet():
    """
    Train LeNet5 on the "train" sub-directory of config.data_path.

    Checkpoints are written with the prefix "checkpoint_lenet" into config.checkpoint_path,
    and training runs for config.epoch_size epochs with time, checkpoint and loss callbacks.

    Raises:
        ValueError: If the training dataset reports a size of 0.
    """

    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    ds_train = create_dataset(os.path.join(config.data_path, "train"), config.batch_size)
    if ds_train.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=config.checkpoint_path, config=config_ck)

    # amp_level="O2" is only requested when the target is Ascend.
    if config.device_target != "Ascend":
        model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    else:
        model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2")

    print("============== Starting Training ==============")
    model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()])

if __name__ == "__main__":
    train_lenet()
class Config:
    """
    Configuration namespace. Convert dictionary to members.

    Nested dictionaries become nested ``Config`` objects; lists and tuples become lists whose
    dictionary elements are converted the same way.
    """
    def __init__(self, cfg_dict):
        for k, v in cfg_dict.items():
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Config(v) if isinstance(v, dict) else v)

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()


def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    One ``--<key>`` option is registered for every entry of ``cfg`` whose value is neither a
    list nor a dict; the yaml value is the default and its Python type is the option type.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        choices: Mapping from option name to its allowed values.
        cfg_path: Path to the default yaml config.

    Returns:
        The ``argparse.Namespace`` produced by parsing ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for item, value in cfg.items():
        if isinstance(value, (list, dict)):
            continue
        help_description = helper.get(item, "Please reference to {}".format(cfg_path))
        choice = choices.get(item)
        if isinstance(value, bool):
            # bool("False") is True, so booleans are parsed as Python literals instead.
            arg_type = ast.literal_eval
        elif value is None:
            # type(None) cannot convert a command line string; accept the raw text.
            arg_type = str
        else:
            arg_type = type(value)
        parser.add_argument("--" + item, type=arg_type, default=value, choices=choice,
                            help=help_description)
    args = parser.parse_args()
    return args


def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may hold up to three yaml documents: the configuration, the help text per option
    and the allowed choices per option. Missing documents default to empty dictionaries.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper, cfg_choices)``.

    Raises:
        ValueError: If the file is not valid yaml, is empty, or holds more than 3 documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            # NOTE(review): FullLoader is used as before; prefer SafeLoader if config files
            # can come from an untrusted source.
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Only real parse errors are translated; the cause stays attached for debugging.
            raise ValueError("Failed to parse yaml") from err

    if not cfgs:
        raise ValueError("Failed to parse yaml: no document found in {}".format(yaml_path))
    if len(cfgs) > 3:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")

    # Pad the missing optional documents with empty dictionaries.
    cfg, cfg_helper, cfg_choices = cfgs + [{}] * (3 - len(cfgs))
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices


def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    ``cfg`` is updated in place; command line values override yaml values.

    Args:
        args: Command line arguments.
        cfg: Base configuration.

    Returns:
        The updated ``cfg`` dictionary.
    """
    cfg.update(vars(args))
    return cfg


def get_config():
    """
    Get Config according to the yaml file and cli arguments.

    ``--config_path`` is read first (default: ``../default_config.yaml`` relative to this
    file), then every yaml entry is exposed as a command line option and merged back.

    Returns:
        A ``Config`` built from the merged dictionary.
    """
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../default_config.yaml"),
                        help="Config file path")
    # parse_known_args: the remaining options are not registered until the yaml is loaded.
    path_args, _ = parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    pprint(default)
    args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)
"""Device adapter for ModelArts"""

from .config import config

# Re-export the same four helpers from whichever backend matches the run mode:
# the moxing adapter when ``config.enable_modelarts`` is truthy, otherwise the
# local adapter. The choice is made once, when this module is first imported.
if config.enable_modelarts:
    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

# Public names of this module, identical for both backends.
__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]
def get_device_id():
    """Return the integer value of ``DEVICE_ID`` from the environment (0 when unset)."""
    return int(os.environ.get('DEVICE_ID', '0'))


def get_device_num():
    """Return the integer value of ``RANK_SIZE`` from the environment (1 when unset)."""
    return int(os.environ.get('RANK_SIZE', '1'))


def get_rank_id():
    """Return the integer value of ``RANK_ID`` from the environment (0 when unset)."""
    return int(os.environ.get('RANK_ID', '0'))


def get_job_id():
    """Return the fixed job identifier used for local runs."""
    return "Local Job"
# Number of sync_data() calls made so far in this process; gives every call its own lock file.
_global_sync_count = 0


def get_device_id():
    """Return the integer value of ``DEVICE_ID`` from the environment (0 when unset)."""
    return int(os.getenv('DEVICE_ID', '0'))


def get_device_num():
    """Return the integer value of ``RANK_SIZE`` from the environment (1 when unset)."""
    return int(os.getenv('RANK_SIZE', '1'))


def get_rank_id():
    """Return the integer value of ``RANK_ID`` from the environment (0 when unset)."""
    return int(os.getenv('RANK_ID', '0'))


def get_job_id():
    """Return ``JOB_ID`` from the environment, or ``"default"`` when it is unset or empty."""
    # os.getenv() yields None for an unset variable; comparing only against ""
    # would hand that None back to the caller.
    return os.getenv('JOB_ID') or "default"


def sync_data(from_path, to_path):
    """
    Copy ``from_path`` to ``to_path`` with moxing and block until the copy is flagged as done.

    Download data from remote obs to local directory if the first url is remote url and the
    second one is local path. Upload data from local directory to remote obs in contrast.

    Only the device for which ``device_id % min(device_num, 8) == 0`` performs the copy; it then
    creates a flag file under ``/tmp``. Every caller, including the copying one, returns once
    that flag file exists.

    Args:
        from_path: Source path or url passed to ``mox.file.copy_parallel``.
        to_path: Destination path or url passed to ``mox.file.copy_parallel``.

    Raises:
        OSError: If the flag file cannot be created after the copy.
    """
    # moxing is imported lazily so that this module can be imported where it is not installed.
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        # Append mode creates the flag if missing and tolerates one that already exists.
        # A genuine failure is raised: swallowing it would leave every device polling forever.
        with open(sync_lock, "a"):
            pass
        print("===save flag===")

    # Poll until the copying device has published the flag file.
    while not os.path.exists(sync_lock):
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))


def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional zero-argument callable. Called after the downloads and before the
            wrapped function, only when ``config.enable_modelarts`` is truthy.
        post_process: Optional zero-argument callable. Called after the wrapped function and
            before the upload, only when ``config.enable_modelarts`` is truthy.

    Returns:
        A decorator. The decorated function keeps its signature and returns whatever the
        wrapped function returns.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                # exist_ok avoids a race between several devices creating the same directory.
                os.makedirs(config.output_path, exist_ok=True)

                if pre_process:
                    pre_process()

            profiler = Profiler() if config.enable_profiling else None

            # Keep the result so that decorating a function does not silently drop its return value.
            result = run_func(*args, **kwargs)

            if profiler is not None:
                profiler.analyse()

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper