@@ -55,7 +55,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
 +-- Readme.md                              # descriptions about ShuffleNetV2
 +-- scripts
 ¦   +--run_distribute_train_for_gpu.sh     # shell script for distributed training
-¦   +--run_eval_for_multi_gpu.sh           # shell script for evaluation
+¦   +--run_eval_for_gpu.sh                 # shell script for evaluation
 ¦   +--run_standalone_train_for_gpu.sh     # shell script for standalone training
 +-- src
 ¦   +--config.py                           # parameter configuration
@@ -75,23 +75,23 @@ Dataset used: [imagenet](http://www.image-net.org/)
 You can start training using python or shell scripts. The usage of shell scripts as follows:
-- Ditributed training on GPU: sh run_distribute_train_for_gpu.sh [DATA_DIR]
-- Standalone training on GPU: sh run_standalone_train_for_gpu.sh [DEVICE_ID] [DATA_DIR]
+- Distributed training on GPU: sh run_distribute_train_for_gpu.sh [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
+- Standalone training on GPU: sh run_standalone_train_for_gpu.sh [DATASET_PATH]

 ### Launch

 ```
 # training example
 python:
-GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed --platform 'GPU' --dataset_path '~/imagenet/train/' > train.log 2>&1 &
+GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 &
 shell:
-GPU: sh run_distribute_train_for_gpu.sh ~/imagenet/train/
+GPU: cd scripts && sh run_distribute_train_for_gpu.sh 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
 ```

 ### Result

-Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log`.
+Training result will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`.

 ## [Eval process](#contents)
@@ -99,21 +99,21 @@ Training result will be stored in the example path. Checkpoints will be stored a
 You can start evaluation using python or shell scripts. The usage of shell scripts as follows:
-- GPU: sh run_eval_for_multi_gpu.sh [DEVICE_ID] [EPOCH]
+- GPU: sh run_eval_for_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]

 ### Launch

 ```
 # infer example
 python:
-GPU: CUDA_VISIBLE_DEVICES=0 python eval.py --platform 'GPU' --dataset_path '~/imagenet/val/' --epoch 250 > eval.log 2>&1 &
+GPU: CUDA_VISIBLE_DEVICES=0 python eval.py --platform='GPU' --dataset_path='~/imagenet/val/' --checkpoint='checkpoint_file' > eval.log 2>&1 &
 shell:
-GPU: sh run_eval_for_multi_gpu.sh 0 250
+GPU: cd scripts && sh run_eval_for_gpu.sh '~/imagenet/val/' 'checkpoint_file'
 ```

 > checkpoint can be produced in training process.

 ### Result

-Inference result will be stored in the example path, you can find result in `val.log`.
+Inference result will be stored in the example path; you can find the result in `eval.log`.
@@ -31,7 +31,6 @@ if __name__ == '__main__':
     parser.add_argument('--checkpoint', type=str, default='', help='checkpoint of ShuffleNetV2 (Default: None)')
     parser.add_argument('--dataset_path', type=str, default='', help='Dataset path')
     parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform')
-    parser.add_argument('--epoch', type=str, default='')
     args_opt = parser.parse_args()

     if args_opt.platform == 'Ascend':
@@ -43,7 +42,7 @@ if __name__ == '__main__':
     ckpt = load_checkpoint(args_opt.checkpoint)
     load_param_into_net(net, ckpt)
     net.set_train(False)
-    dataset = create_dataset(args_opt.dataset_path, cfg, False)
+    dataset = create_dataset(args_opt.dataset_path, False, 0, 1)
     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False,
                                             smooth_factor=0.1, num_classes=cfg.num_classes)
     eval_metrics = {'Loss': nn.Loss(),
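For orientation (this note is not part of the diff): the loss and metrics configured above are normally handed to a MindSpore `Model`, which then runs the validation pass. A minimal sketch of that step follows, assuming the usual model-zoo pattern; the `Model` import path, the completion of `eval_metrics` with Top-1/Top-5 accuracy, and the shape of the printed result are assumptions rather than lines taken from this PR.

```
# Hedged sketch of the evaluation step that typically follows the snippet above.
# `net`, `loss`, `dataset` and the completed `eval_metrics` dict come from the
# surrounding eval.py; the Model import path is the MindSpore 1.x location.
from mindspore.train.model import Model

model = Model(net, loss_fn=loss, metrics=eval_metrics)
result = model.eval(dataset)   # single pass over the validation set
print("metric:", result)       # e.g. {'Loss': ..., 'Top_1_Acc': ..., 'Top_5_Acc': ...}
```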
@@ -13,5 +13,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-DATA_DIR=$1
-mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+if [ $# -lt 3 ]
+then
+    echo "Usage: \
+sh run_distribute_train_for_gpu.sh [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] \
+"
+    exit 1
+fi
+
+if [ $1 -lt 1 ] || [ $1 -gt 8 ]
+then
+    echo "error: DEVICE_NUM=$1 is not in (1-8)"
+    exit 1
+fi
+
+# check dataset file
+if [ ! -d $3 ]
+then
+    echo "error: DATASET_PATH=$3 is not a directory"
+    exit 1
+fi
+
+export DEVICE_NUM=$1
+export RANK_SIZE=$1
+
+BASEPATH=$(cd "`dirname $0`" || exit; pwd)
+export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+
+if [ -d "../train" ];
+then
+    rm -rf ../train
+fi
+mkdir ../train
+cd ../train || exit
+
+export CUDA_VISIBLE_DEVICES="$2"
+
+if [ $1 -gt 1 ]
+then
+    mpirun -n $1 --allow-run-as-root \
+           python ${BASEPATH}/../train.py --platform='GPU' --is_distributed=True --dataset_path=$3 > train.log 2>&1 &
+else
+    python ${BASEPATH}/../train.py --platform='GPU' --dataset_path=$3 > train.log 2>&1 &
+fi
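As a side note on what `--is_distributed=True` usually triggers inside train.py (not shown in this diff): a typical MindSpore data-parallel setup on GPU looks roughly like the sketch below. The exact context arguments used by this repository may differ by MindSpore version, so treat this as an assumption-laden illustration rather than the PR's code.

```
# Hedged sketch of a typical MindSpore GPU data-parallel setup; argument names
# (e.g. gradients_mean) follow the MindSpore 1.x API and may differ per version.
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_group_size

context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
init()  # sets up NCCL communication for the processes launched by mpirun
context.set_auto_parallel_context(device_num=get_group_size(),
                                  parallel_mode=ParallelMode.DATA_PARALLEL,
                                  gradients_mean=True)
```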
@@ -13,6 +13,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-DEVICE_ID=$1
-EPOCH=$2
-CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./eval.py --platform 'GPU' --dataset_path '/home/data/ImageNet_Original/val/' --epoch $EPOCH > eval.log 2>&1 &
+if [ $# != 2 ]
+then
+    echo "Usage: sh run_eval_for_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]"
+    exit 1
+fi
+
+# check dataset file
+if [ ! -d $1 ]
+then
+    echo "error: DATASET_PATH=$1 is not a directory"
+    exit 1
+fi
+
+# check checkpoint file
+if [ ! -f $2 ]
+then
+    echo "error: CHECKPOINT_PATH=$2 is not a file"
+    exit 1
+fi
+
+BASEPATH=$(cd "`dirname $0`" || exit; pwd)
+export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+export DEVICE_ID=0
+
+if [ -d "../eval" ];
+then
+    rm -rf ../eval
+fi
+mkdir ../eval
+cd ../eval || exit
+
+python ${BASEPATH}/../eval.py --dataset_path=$1 --checkpoint=$2 > ./eval.log 2>&1 &
@@ -13,6 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-DEVICE_ID=$1
-DATA_DIR=$2
-CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./train.py --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+if [ $# -lt 1 ]
+then
+    echo "Usage: \
+sh run_standalone_train_for_gpu.sh [DATASET_PATH] \
+"
+    exit 1
+fi
+
+# check dataset file
+if [ ! -d $1 ]
+then
+    echo "error: DATASET_PATH=$1 is not a directory"
+    exit 1
+fi
+
+BASEPATH=$(cd "`dirname $0`" || exit; pwd)
+export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+
+if [ -d "../train" ];
+then
+    rm -rf ../train
+fi
+mkdir ../train
+cd ../train || exit
+
+python ${BASEPATH}/../train.py --platform='GPU' --dataset_path=$1 > train.log 2>&1 &
@@ -75,7 +75,5 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
     ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=cfg.work_nums)
     # apply batch operations
     ds = ds.batch(cfg.batch_size, drop_remainder=True)
-    # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
     return ds
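Editor's note: with the repeat call removed, `create_dataset` now yields exactly one epoch of data, presumably so that the epoch count passed to the training loop, rather than `repeat_num`, controls how many passes are made. A minimal call-site sketch for the signature shown in the hunk header above; the module import path and the example paths are assumptions for illustration, while the evaluation call mirrors the `create_dataset(args_opt.dataset_path, False, 0, 1)` line in eval.py.

```
# Hypothetical call sites for create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1).
from src.dataset import create_dataset            # assumed module location
from mindspore.communication.management import init, get_rank, get_group_size

# Standalone evaluation: rank 0 in a group of size 1, evaluation transforms.
eval_ds = create_dataset('/path/to/imagenet/val/', do_train=False, rank=0, group_size=1)

# Distributed training: after init(), each process shards the data by its own rank,
# as the get_rank()/get_group_size() imports in train.py suggest.
# init()
# train_ds = create_dataset('/path/to/imagenet/train/', do_train=True,
#                           rank=get_rank(), group_size=get_group_size())
```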
@@ -14,6 +14,7 @@
 # ============================================================================
 """train_imagenet."""
 import argparse
+import ast
 import os
 import random
 import numpy as np
@@ -23,7 +24,7 @@ from network import ShuffleNetV2
 import mindspore.nn as nn
 from mindspore import context
 from mindspore import dataset as de
-from mindspore import ParallelMode
+from mindspore.context import ParallelMode
 from mindspore import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
@@ -42,10 +43,9 @@ de.config.set_seed(cfg.random_seed)

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='image classification training')
-    parser.add_argument('--dataset_path', type=str, default='/home/data/imagenet_jpeg/train/', help='Dataset path')
+    parser.add_argument('--dataset_path', type=str, default='', help='Dataset path')
     parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint')
-    parser.add_argument('--is_distributed', action='store_true', default=False,
-                        help='distributed training')
+    parser.add_argument('--is_distributed', type=ast.literal_eval, default=False, help='distributed training')
     parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform')
     parser.add_argument('--model_size', type=str, default='1.0x', help='ShuffleNetV2 model size parameter')
     args_opt = parser.parse_args()
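Editor's note on the `--is_distributed` change: `action='store_true'` only supports the bare flag form (`--is_distributed`), whereas the scripts above now pass `--is_distributed=True`. Parsing the value with `ast.literal_eval` accepts an explicit `True`/`False`; a plain `type=bool` would not, since `bool('False')` is truthy. A small self-contained sketch (standard library only) illustrating the behavior:

```
# Minimal demonstration of ast.literal_eval as an argparse boolean parser.
import argparse
import ast

parser = argparse.ArgumentParser()
# type=bool would turn any non-empty string (even "False") into True;
# ast.literal_eval parses the literal "True"/"False" into a real bool.
parser.add_argument('--is_distributed', type=ast.literal_eval, default=False)

print(parser.parse_args(['--is_distributed=True']).is_distributed)    # True
print(parser.parse_args(['--is_distributed=False']).is_distributed)   # False
print(parser.parse_args([]).is_distributed)                           # False (default)
```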