Merge pull request !6237 from yepei6/modify_mobilenetv2tags/v1.0.0
| @@ -4,17 +4,16 @@ | |||||
| - [Model Architecture](#model-architecture) | - [Model Architecture](#model-architecture) | ||||
| - [Dataset](#dataset) | - [Dataset](#dataset) | ||||
| - [Features](#features) | - [Features](#features) | ||||
| - [Mixed Precision](#mixed-precision) | |||||
| - [Mixed Precision](#mixed-precision(ascend)) | |||||
| - [Environment Requirements](#environment-requirements) | - [Environment Requirements](#environment-requirements) | ||||
| - [Script Description](#script-description) | - [Script Description](#script-description) | ||||
| - [Script and Sample Code](#script-and-sample-code) | - [Script and Sample Code](#script-and-sample-code) | ||||
| - [Training Process](#training-process) | - [Training Process](#training-process) | ||||
| - [Evaluation Process](#evaluation-process) | |||||
| - [Evaluation](#evaluation) | |||||
| - [Evaluation Process](#evaluation-process) | |||||
| - [Model Description](#model-description) | - [Model Description](#model-description) | ||||
| - [Performance](#performance) | - [Performance](#performance) | ||||
| - [Training Performance](#evaluation-performance) | |||||
| - [Inference Performance](#evaluation-performance) | |||||
| - [Training Performance](#training-performance) | |||||
| - [Evaluation Performance](#evaluation-performance) | |||||
| - [Description of Random Situation](#description-of-random-situation) | - [Description of Random Situation](#description-of-random-situation) | ||||
| - [ModelZoo Homepage](#modelzoo-homepage) | - [ModelZoo Homepage](#modelzoo-homepage) | ||||
| @@ -38,7 +37,7 @@ Dataset used: [imagenet](http://www.image-net.org/) | |||||
| - Train: 120G, 1.2W images | - Train: 120G, 1.2W images | ||||
| - Test: 5G, 50000 images | - Test: 5G, 50000 images | ||||
| - Data format: RGB images. | - Data format: RGB images. | ||||
| - Note: Data will be processed in src/dataset.py | |||||
| - Note: Data will be processed in src/dataset.py | |||||
| # [Features](#contents) | # [Features](#contents) | ||||
| @@ -92,84 +91,84 @@ You can start training using python or shell scripts. The usage of shell scripts | |||||
| ### Launch | ### Launch | ||||
| ``` | |||||
| ```shell | |||||
| # training example | # training example | ||||
| python: | python: | ||||
| Ascend: python train.py --dataset_path ~/imagenet/train/ --platform Ascend --train_method train | |||||
| GPU: python train.py --dataset_path ~/imagenet/train/ --platform GPU --train_method train | |||||
| CPU: python train.py --dataset_path ~/imagenet/train/ --platform CPU --train_method train | |||||
| Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method train | |||||
| GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method train | |||||
| CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method train | |||||
| shell: | shell: | ||||
| Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ train | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ train | |||||
| CPU: sh run_train.sh CPU ~/imagenet/train/ train | |||||
| Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] train | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] train | |||||
| CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] train | |||||
| # fine tune example | # fine tune example | ||||
| python: | python: | ||||
| Ascend: python train.py --dataset_path ~/imagenet/train/ --platform Ascend --train_method fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| GPU: python train.py --dataset_path ~/imagenet/train/ --platform GPU --train_method fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| CPU: python train.py --dataset_path ~/imagenet/train/ --platform CPU --train_method fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt | |||||
| GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt | |||||
| CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt | |||||
| shell: | shell: | ||||
| Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| CPU: sh run_train.sh CPU ~/imagenet/train/ fine_tune ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt | |||||
| CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] fine_tune ./pretrain_checkpoint/mobilenetv2.ckpt | |||||
| # incremental learn example | # incremental learn example | ||||
| python: | python: | ||||
| Ascend: python train.py --dataset_path ~/imagenet/train/ --platform Ascend --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| GPU: python train.py --dataset_path ~/imagenet/train/ --platform GPU --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| CPU: python train.py --dataset_path ~/imagenet/train/ --platform CPU --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| Ascend: python train.py --platform Ascend --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| GPU: python train.py --platform GPU --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| CPU: python train.py --platform CPU --dataset_path [TRAIN_DATASET_PATH] --train_method incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| shell: | shell: | ||||
| Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| CPU: sh run_train.sh CPU ~/imagenet/train/ incremental_learn ./pretrain_checkpoint/mobilenetv2_199.ckpt | |||||
| Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| CPU: sh run_train.sh CPU [TRAIN_DATASET_PATH] incremental_learn ./pretrain_checkpoint/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| ``` | ``` | ||||
| ### Result | ### Result | ||||
| Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. | |||||
| Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default. On the CPU and GPU platforms, the training log will be redirected to `./train.log`; on the Ascend platform, it will be written to `./train/rank*/log*.log`. The log looks like the following. | |||||
| ``` | |||||
| ```shell | |||||
| epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] | epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] | ||||
| epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 | epoch time: 140522.500, per step time: 224.836, avg loss: 5.258 | ||||
| epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] | epoch: [ 1/200], step:[ 624/ 625], loss:[3.917/3.917], time:[138221.250], lr:[0.200] | ||||
| epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 | epoch time: 138331.250, per step time: 221.330, avg loss: 3.917 | ||||
| ``` | ``` | ||||
| ## [Eval process](#contents) | |||||
| ## [Evaluation process](#contents) | |||||
| ### Usage | ### Usage | ||||
| You can start training using python or shell scripts. The usage of shell scripts as follows: | |||||
| You can start evaluation using python or shell scripts. If the train method is train or fine tune, do not input the `[CHECKPOINT_PATH]`. The usage of shell scripts is as follows: | |||||
| - Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] | |||||
| - GPU: sh run_infer.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] | |||||
| - CPU: sh run_infer.sh CPU [DATASET_PATH] [BACKBONE_CKPT_PATH] [HEAD_CKPT_PATH] | |||||
| - Ascend: sh run_eval.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] | |||||
| - GPU: sh run_eval.sh GPU [DATASET_PATH] [CHECKPOINT_PATH] [HEAD_CKPT_PATH] | |||||
| - CPU: sh run_eval.sh CPU [DATASET_PATH] [BACKBONE_CKPT_PATH] [HEAD_CKPT_PATH] | |||||
| ### Launch | ### Launch | ||||
| ``` | |||||
| # infer example | |||||
| ```shell | |||||
| # eval example | |||||
| python: | python: | ||||
| Ascend: python eval.py --dataset_path ~/imagenet/val/ --pretrain_ckpt ~/train/mobilenet-200_625.ckpt --platform Ascend --head_ckpt ./checkpoint/mobilenetv2_199.ckpt | |||||
| GPU: python eval.py --dataset_path ~/imagenet/val/ --pretrain_ckpt ~/train/mobilenet-200_625.ckpt --platform GPU --head_ckpt ./checkpoint/mobilenetv2_199.ckpt | |||||
| CPU: python eval.py --dataset_path ~/imagenet/val/ --pretrain_ckpt ~/train/mobilenet-200_625.ckpt --platform CPU --head_ckpt ./checkpoint/mobilenetv2_199.ckpt | |||||
| Ascend: python eval.py --platform Ascend --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| GPU: python eval.py --platform GPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| CPU: python eval.py --platform CPU --dataset_path [VAL_DATASET_PATH] --pretrain_ckpt ./pretrain_ckpt/mobilenetv2.ckpt --head_ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| shell: | shell: | ||||
| Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt ./checkpoint/mobilenetv2_199.ckpt | |||||
| GPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt ./checkpoint/mobilenetv2_199.ckpt | |||||
| CPU: sh run_infer.sh GPU ~/imagenet/val/ ~/train/mobilenet-200_625.ckpt ./checkpoint/mobilenetv2_199.ckpt | |||||
| Ascend: sh run_eval.sh Ascend [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| GPU: sh run_eval.sh GPU [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| CPU: sh run_eval.sh CPU [VAL_DATASET_PATH] ./pretrain_ckpt/mobilenetv2.ckpt ./checkpoint/mobilenetv2_head_15.ckpt | |||||
| ``` | ``` | ||||
| > checkpoint can be produced in training process. | > checkpoint can be produced in training process. | ||||
| ### Result | ### Result | ||||
| Inference result will be stored in the example path, you can find result like the followings in `val.log`. | |||||
| Inference results will be stored in the example path; you can find results like the following in `eval.log`. | |||||
| ``` | |||||
| ```shell | |||||
| result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt | result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.ckpt | ||||
| ``` | ``` | ||||
| @@ -199,7 +198,8 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625. | |||||
| # [Description of Random Situation](#contents) | # [Description of Random Situation](#contents) | ||||
| In dataset.py, we set the seed inside “create_dataset" function. We also use random seed in train.py. | |||||
| <!-- In dataset.py, we set the seed inside “create_dataset" function. We also use random seed in train.py. --> | |||||
| In train.py, we set the seed which is used by numpy.random, mindspore.common.Initializer, mindspore.ops.composite.random_ops and mindspore.nn.probability.distribution. | |||||
| # [ModelZoo Homepage](#contents) | # [ModelZoo Homepage](#contents) | ||||
| @@ -18,10 +18,10 @@ | |||||
| run_ascend() | run_ascend() | ||||
| { | { | ||||
| # check checkpoint file | |||||
| # check pretrain_ckpt file | |||||
| if [ ! -f $3 ] | if [ ! -f $3 ] | ||||
| then | then | ||||
| echo "error: CHECKPOINT_PATH=$3 is not a file" | |||||
| echo "error: PRETRAIN_CKPT=$3 is not a file" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| @@ -44,15 +44,15 @@ run_ascend() | |||||
| --dataset_path=$2 \ | --dataset_path=$2 \ | ||||
| --pretrain_ckpt=$3 \ | --pretrain_ckpt=$3 \ | ||||
| --head_ckpt=$4 \ | --head_ckpt=$4 \ | ||||
| &> ../infer.log & # dataset val folder path | |||||
| &> ../eval.log & # dataset val folder path | |||||
| } | } | ||||
| run_gpu() | run_gpu() | ||||
| { | { | ||||
| # check checkpoint file | |||||
| # check pretrain_ckpt file | |||||
| if [ ! -f $3 ] | if [ ! -f $3 ] | ||||
| then | then | ||||
| echo "error: CHECKPOINT_PATH=$3 is not a file" | |||||
| echo "error: PRETRAIN_CKPT=$3 is not a file" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| @@ -70,26 +70,18 @@ run_gpu() | |||||
| --dataset_path=$2 \ | --dataset_path=$2 \ | ||||
| --pretrain_ckpt=$3 \ | --pretrain_ckpt=$3 \ | ||||
| --head_ckpt=$4 \ | --head_ckpt=$4 \ | ||||
| &> ../infer.log & # dataset train folder | |||||
| &> ../eval.log & # dataset train folder | |||||
| } | } | ||||
| run_cpu() | run_cpu() | ||||
| { | { | ||||
| # check checkpoint file | |||||
| # check pretrain_ckpt file | |||||
| if [ ! -f $3 ] | if [ ! -f $3 ] | ||||
| then | then | ||||
| echo "error: BACKBONE_CKPT=$3 is not a file" | |||||
| echo "error: PRETRAIN_CKPT=$3 is not a file" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| # check checkpoint file | |||||
| if [ ! -f $4 ] | |||||
| then | |||||
| echo "error: HEAD_CKPT=$4 is not a file" | |||||
| exit 1 | |||||
| fi | |||||
| BASEPATH=$(cd "`dirname $0`" || exit; pwd) | BASEPATH=$(cd "`dirname $0`" || exit; pwd) | ||||
| export PYTHONPATH=${BASEPATH}:$PYTHONPATH | export PYTHONPATH=${BASEPATH}:$PYTHONPATH | ||||
| if [ -d "../eval" ]; | if [ -d "../eval" ]; | ||||
| @@ -104,13 +96,14 @@ run_cpu() | |||||
| --dataset_path=$2 \ | --dataset_path=$2 \ | ||||
| --pretrain_ckpt=$3 \ | --pretrain_ckpt=$3 \ | ||||
| --head_ckpt=$4 \ | --head_ckpt=$4 \ | ||||
| &> ../infer.log & # dataset train folder | |||||
| &> ../eval.log & # dataset train folder | |||||
| } | } | ||||
| if [ $# -gt 4 ] || [ $# -lt 3 ] | if [ $# -gt 4 ] || [ $# -lt 3 ] | ||||
| then | then | ||||
| echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] \ | |||||
| echo "Usage: | |||||
| Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] | |||||
| GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] | GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [PRETRAIN_CKPT] | ||||
| CPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [BACKBONE_CKPT] [HEAD_CKPT]" | CPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [BACKBONE_CKPT] [HEAD_CKPT]" | ||||
| exit 1 | exit 1 | ||||
| @@ -109,11 +109,10 @@ run_cpu() | |||||
| if [ $# -gt 7 ] || [ $# -lt 4 ] | if [ $# -gt 7 ] || [ $# -lt 4 ] | ||||
| then | then | ||||
| echo "Usage:\n \ | |||||
| Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] \n \ | |||||
| GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH]\n \ | |||||
| CPU: sh run_train.sh CPU [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH]\n \ | |||||
| " | |||||
| echo "Usage: | |||||
| Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] | |||||
| GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH] | |||||
| CPU: sh run_train.sh CPU [DATASET_PATH] [TRAIN_METHOD] [CKPT_PATH]" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| @@ -38,31 +38,28 @@ def launch_parse_args(): | |||||
| def train_parse_args(): | def train_parse_args(): | ||||
| train_parser = argparse.ArgumentParser(description='Image classification trian') | train_parser = argparse.ArgumentParser(description='Image classification trian') | ||||
| train_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') | |||||
| train_parser.add_argument('--platform', type=str, default="Ascend", choices=("CPU", "GPU", "Ascend"), \ | train_parser.add_argument('--platform', type=str, default="Ascend", choices=("CPU", "GPU", "Ascend"), \ | ||||
| help='run platform, only support CPU, GPU and Ascend') | help='run platform, only support CPU, GPU and Ascend') | ||||
| train_parser.add_argument('--pretrain_ckpt', type=str, default=None, help='Pretrained checkpoint path \ | |||||
| for fine tune or incremental learning') | |||||
| train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') | |||||
| train_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') | |||||
| train_parser.add_argument('--train_method', type=str, choices=("train", "fine_tune", "incremental_learn"), \ | train_parser.add_argument('--train_method', type=str, choices=("train", "fine_tune", "incremental_learn"), \ | ||||
| help="\"fine_tune\"or \"incremental_learn\" if to fine tune the net after loading the ckpt, \"train\" to \ | help="\"fine_tune\"or \"incremental_learn\" if to fine tune the net after loading the ckpt, \"train\" to \ | ||||
| train from initialization model") | train from initialization model") | ||||
| train_parser.add_argument('--pretrain_ckpt', type=str, default=None, help='Pretrained checkpoint path \ | |||||
| for fine tune or incremental learning') | |||||
| train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') | |||||
| train_args = train_parser.parse_args() | train_args = train_parser.parse_args() | ||||
| return train_args | return train_args | ||||
| def eval_parse_args(): | def eval_parse_args(): | ||||
| eval_parser = argparse.ArgumentParser(description='Image classification eval') | eval_parser = argparse.ArgumentParser(description='Image classification eval') | ||||
| eval_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') | |||||
| eval_parser.add_argument('--platform', type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"), \ | eval_parser.add_argument('--platform', type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"), \ | ||||
| help='run platform, only support GPU, CPU and Ascend') | help='run platform, only support GPU, CPU and Ascend') | ||||
| eval_parser.add_argument('--pretrain_ckpt', type=str, default=None, help='Pretrained checkpoint path \ | |||||
| eval_parser.add_argument('--dataset_path', type=str, required=True, help='Dataset path') | |||||
| eval_parser.add_argument('--pretrain_ckpt', type=str, required=True, help='Pretrained checkpoint path \ | |||||
| for fine tune or incremental learning') | for fine tune or incremental learning') | ||||
| eval_parser.add_argument('--head_ckpt', type=str, default=None, help='Pretrained checkpoint path \ | eval_parser.add_argument('--head_ckpt', type=str, default=None, help='Pretrained checkpoint path \ | ||||
| for fine tune or incremental learning') | |||||
| eval_parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') | |||||
| for incremental learning') | |||||
| eval_parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='If run distribute in GPU.') | |||||
| eval_args = eval_parser.parse_args() | eval_args = eval_parser.parse_args() | ||||
| return eval_args | return eval_args | ||||
| @@ -16,7 +16,6 @@ | |||||
| create train or eval dataset. | create train or eval dataset. | ||||
| """ | """ | ||||
| import os | import os | ||||
| from tqdm import tqdm | |||||
| import numpy as np | import numpy as np | ||||
| from mindspore import Tensor | from mindspore import Tensor | ||||
| @@ -97,7 +96,11 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1): | |||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| ds = ds.repeat(repeat_num) | ds = ds.repeat(repeat_num) | ||||
| return ds | |||||
| step_size = ds.get_dataset_size() | |||||
| if step_size == 0: | |||||
| raise ValueError("The step_size of dataset is zero. Check if the images of train dataset is more than batch_\ | |||||
| size in config.py") | |||||
| return ds, step_size | |||||
| def extract_features(net, dataset_path, config): | def extract_features(net, dataset_path, config): | ||||
| @@ -109,19 +112,16 @@ def extract_features(net, dataset_path, config): | |||||
| config=config, | config=config, | ||||
| repeat_num=1) | repeat_num=1) | ||||
| step_size = dataset.get_dataset_size() | step_size = dataset.get_dataset_size() | ||||
| pbar = tqdm(list(dataset.create_dict_iterator(output_numpy=True))) | |||||
| model = Model(net) | model = Model(net) | ||||
| i = 0 | |||||
| for data in pbar: | |||||
| for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True)): | |||||
| features_path = os.path.join(features_folder, f"feature_{i}.npy") | features_path = os.path.join(features_folder, f"feature_{i}.npy") | ||||
| label_path = os.path.join(features_folder, f"label_{i}.npy") | label_path = os.path.join(features_folder, f"label_{i}.npy") | ||||
| if not (os.path.exists(features_path) and os.path.exists(label_path)): | |||||
| if not os.path.exists(features_path or not os.path.exists(label_path)): | |||||
| image = data["image"] | image = data["image"] | ||||
| label = data["label"] | label = data["label"] | ||||
| features = model.predict(Tensor(image)) | features = model.predict(Tensor(image)) | ||||
| np.save(features_path, features.asnumpy()) | np.save(features_path, features.asnumpy()) | ||||
| np.save(label_path, label) | np.save(label_path, label) | ||||
| pbar.set_description("Process dataset batch: %d" % (i + 1)) | |||||
| i += 1 | |||||
| print(f"Complete the batch {i}/{step_size}") | |||||
| return step_size | return step_size | ||||
| @@ -38,17 +38,17 @@ def main(): | |||||
| for rank_id in range(0, args.nproc_per_node): | for rank_id in range(0, args.nproc_per_node): | ||||
| os.chdir(cur_path) | os.chdir(cur_path) | ||||
| device_id = visible_devices[rank_id] | device_id = visible_devices[rank_id] | ||||
| device_dir = os.path.join(cur_path, 'device{}'.format(rank_id)) | |||||
| rank_dir = os.path.join(cur_path, 'rank{}'.format(rank_id)) | |||||
| env['RANK_ID'] = str(rank_id) | env['RANK_ID'] = str(rank_id) | ||||
| env['DEVICE_ID'] = str(device_id) | env['DEVICE_ID'] = str(device_id) | ||||
| if os.path.exists(device_dir): | |||||
| shutil.rmtree(device_dir) | |||||
| os.mkdir(device_dir) | |||||
| os.chdir(device_dir) | |||||
| if os.path.exists(rank_dir): | |||||
| shutil.rmtree(rank_dir) | |||||
| os.mkdir(rank_dir) | |||||
| os.chdir(rank_dir) | |||||
| cmd = [sys.executable, '-u'] | cmd = [sys.executable, '-u'] | ||||
| cmd.append(args.training_script) | cmd.append(args.training_script) | ||||
| cmd.extend(args.training_script_args) | cmd.extend(args.training_script_args) | ||||
| log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w') | |||||
| log_file = open(f'{rank_dir}/log{rank_id}.log', 'w') | |||||
| process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env) | process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env) | ||||
| processes.append(process) | processes.append(process) | ||||
| cmds.append(cmd) | cmds.append(cmd) | ||||
| @@ -60,8 +60,7 @@ if __name__ == '__main__': | |||||
| elif args_opt.train_method in ("train", "fine_tune"): | elif args_opt.train_method in ("train", "fine_tune"): | ||||
| if args_opt.platform == "CPU": | if args_opt.platform == "CPU": | ||||
| raise ValueError("Currently, CPU only support \"incremental_learn\", not \"fine_tune\" or \"train\".") | raise ValueError("Currently, CPU only support \"incremental_learn\", not \"fine_tune\" or \"train\".") | ||||
| dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) | |||||
| step_size = dataset.get_dataset_size() | |||||
| dataset, step_size = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) | |||||
| # Currently, only Ascend support switch precision. | # Currently, only Ascend support switch precision. | ||||
| switch_precision(net, mstype.float16, config) | switch_precision(net, mstype.float16, config) | ||||
| @@ -108,9 +107,8 @@ if __name__ == '__main__': | |||||
| losses.append(network(feature, label).asnumpy()) | losses.append(network(feature, label).asnumpy()) | ||||
| epoch_mseconds = (time.time()-epoch_start) * 1000 | epoch_mseconds = (time.time()-epoch_start) * 1000 | ||||
| per_step_mseconds = epoch_mseconds / step_size | per_step_mseconds = epoch_mseconds / step_size | ||||
| print("\r epoch[{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ | |||||
| .format(epoch + 1, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses))), \ | |||||
| end="") | |||||
| print("epoch[{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\ | |||||
| .format(epoch + 1, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses)))) | |||||
| if (epoch + 1) % config.save_checkpoint_epochs == 0: | if (epoch + 1) % config.save_checkpoint_epochs == 0: | ||||
| save_checkpoint(network, os.path.join(config.save_checkpoint_path, \ | save_checkpoint(network, os.path.join(config.save_checkpoint_path, \ | ||||
| f"mobilenetv2_head_{epoch+1}.ckpt")) | f"mobilenetv2_head_{epoch+1}.ckpt")) | ||||