From 6ada7318be4c2edc308cf43ea8f532c97d947aa2 Mon Sep 17 00:00:00 2001 From: shibeiji Date: Wed, 30 Dec 2020 11:26:10 +0800 Subject: [PATCH] modify centernet scripts for cpu adaption and usr friendliness --- model_zoo/research/cv/centernet/README.md | 62 ++++++++++---- model_zoo/research/cv/centernet/eval.py | 17 ++-- model_zoo/research/cv/centernet/export.py | 2 +- .../get_distribute_train_cmd.py | 7 +- .../hyper_parameter_config.ini | 1 - .../scripts/run_distributed_train_ascend.sh | 21 +++-- .../scripts/run_standalone_eval_ascend.sh | 18 +++-- .../scripts/run_standalone_eval_cpu.sh | 53 ++++++++++++ .../scripts/run_standalone_train_ascend.sh | 14 ++-- .../scripts/run_standalone_train_cpu.sh | 44 ++++++++++ .../research/cv/centernet/src/__init__.py | 7 +- .../cv/centernet/src/centernet_pose.py | 47 ++++++++++- model_zoo/research/cv/centernet/src/config.py | 3 +- .../research/cv/centernet/src/dataset.py | 21 ++--- model_zoo/research/cv/centernet/src/decode.py | 80 ++++++++++--------- model_zoo/research/cv/centernet/src/utils.py | 68 +++++++++++----- model_zoo/research/cv/centernet/train.py | 51 +++++++----- 17 files changed, 367 insertions(+), 149 deletions(-) create mode 100644 model_zoo/research/cv/centernet/scripts/run_standalone_eval_cpu.sh create mode 100644 model_zoo/research/cv/centernet/scripts/run_standalone_train_cpu.sh diff --git a/model_zoo/research/cv/centernet/README.md b/model_zoo/research/cv/centernet/README.md index 163cf71031..e242912e6c 100644 --- a/model_zoo/research/cv/centernet/README.md +++ b/model_zoo/research/cv/centernet/README.md @@ -119,19 +119,27 @@ After installing MindSpore via the official website, you can start training and Note: 1.the first run of training will generate the mindrecord file, which will take a long time. 2.MINDRECORD_DATASET_PATH is the mindrecord dataset directory. 
+ 3.LOAD_CHECKPOINT_PATH is the path of the pretrained checkpoint file; if there is none, just set it to "" + 4.RUN_MODE supports validation and testing; set it to "val" or "test" ```shell # create dataset in mindrecord format bash scripts/convert_dataset_to_mindrecord.sh -# standalone training -bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [EPOCH_SIZE] +# standalone training on Ascend +bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH] -# distributed training -bash scripts/run_distributed_train_ascend.sh [MINDRECORD_DATASET_PATH] [RANK_TABLE_FILE] +# standalone training on CPU +bash scripts/run_standalone_train_cpu.sh [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH] -# eval -bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID] +# distributed training on Ascend +bash scripts/run_distributed_train_ascend.sh [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH] [RANK_TABLE_FILE] + +# eval on Ascend +bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID] [RUN_MODE] [DATA_DIR] [LOAD_CHECKPOINT_PATH] + +# eval on CPU +bash scripts/run_standalone_eval_cpu.sh [RUN_MODE] [DATA_DIR] [LOAD_CHECKPOINT_PATH] ``` # [Script Description](#contents) @@ -153,9 +161,11 @@ bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID] │ │ ├──get_distribute_pretrain_cmd.py // script for distributed pretraining │ │ ├──README.md │ ├──convert_dataset_to_mindrecord.sh // shell script for converting coco type dataset to mindrecord - │ ├──run_standalone_train_ascend.sh // shell script for standalone pretrain on ascend - │ ├──run_distributed_train_ascend.sh // shell script for distributed pretrain on ascend + │ ├──run_standalone_train_ascend.sh // shell script for standalone training on ascend + │ ├──run_distributed_train_ascend.sh // shell script for distributed training on ascend │ ├──run_standalone_eval_ascend.sh // shell script for standalone evaluation on ascend + │ ├──run_standalone_train_cpu.sh // shell script for standalone training on cpu + │ 
├──run_standalone_eval_cpu.sh // shell script for standalone evaluation on cpu └── src ├──__init__.py ├──centernet_pose.py // centernet networks, training entry @@ -259,7 +269,6 @@ config for training. ```text config for evaluation. - flip_test whether to use flip test: True | False, default is False soft_nms nms after decode: True | False, default is True keep_res keep original or fix resolution: True | False, default is False multi_scales use multi-scales of image: List, default is [1.0] @@ -350,12 +359,12 @@ bash scripts/convert_dataset_to_mindrecord.sh The command above will run in the background, after converting mindrecord files will be located in path specified by yourself. -### Training +### Standalone Training #### Running on Ascend ```bash -bash scripts/run_standalone_pretrain_ascend.sh 0 1 +bash scripts/run_standalone_train_ascend.sh device_id /path/mindrecord_dataset /path/load_ckpt ``` The command above will run in the background, you can view training logs in training_log.txt. After training finished, you will get some checkpoint files under the script folder by default. The loss values will be displayed as follows: @@ -368,12 +377,31 @@ epoch: 349.0, current epoch percent: 1.00, step: 87500, outputs are (Tensor(shap ... ``` +#### Running on CPU + +```bash +bash scripts/run_standalone_train_cpu.sh /path/mindrecord_dataset /path/load_ckpt +``` + +The command above will run in the background, you can view training logs in training_log.txt. After training finished, you will get some checkpoint files under the script folder by default. The loss values will be displayed as follows (resumed from a pretrained checkpoint, with batch_size set to 8): + +```text +# grep "epoch" training_log.txt +... 
+epoch: 0.0, current epoch percent: 0.00, step: 1, time of per steps: 66.693 s, outputs are 3.645 +epoch: 0.0, current epoch percent: 0.00, step: 2, time of per steps: 46.594 s, outputs are 4.862 +epoch: 0.0, current epoch percent: 0.00, step: 3, time of per steps: 44.718 s, outputs are 3.927 +epoch: 0.0, current epoch percent: 0.00, step: 4, time of per steps: 45.113 s, outputs are 3.910 +epoch: 0.0, current epoch percent: 0.00, step: 5, time of per steps: 45.213 s, outputs are 3.749 +... +``` + ### Distributed Training #### Running on Ascend ```bash -bash scripts/run_distributed_pretrain_ascend.sh /path/coco2017 /path/mindrecord /path/hccl.json +bash scripts/run_distributed_train_ascend.sh /path/mindrecord_dataset /path/load_ckpt /path/hccl.json ``` The command above will run in the background, you can view training logs in LOG*/training_log.txt and LOG*/ms_log/. After training finished, you will get some checkpoint files under the LOG*/ckpt_0 folder by default. The loss value will be displayed as follows: @@ -394,7 +422,11 @@ epoch: 0.0, current epoch percent: 0.002, step: 200, outputs are (Tensor(shape=[ ```bash # Evaluation base on validation dataset will be done automatically, while for test or test-dev dataset, the accuracy should be upload to the CodaLab official website(https://competitions.codalab.org). 
-bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID] +# On Ascend +bash scripts/run_standalone_eval_ascend.sh device_id val(or test) /path/coco_dataset /path/load_ckpt + +# On CPU +bash scripts/run_standalone_eval_cpu.sh val(or test) /path/coco_dataset /path/load_ckpt ``` you can see the MAP result below as below: @@ -439,7 +471,7 @@ python export.py [DEVICE_ID] ## [Performance](#contents) -### Training Performance +### Training Performance On Ascend CenterNet on 11.8K images(The annotation and data format must be the same as coco) @@ -460,7 +492,7 @@ CenterNet on 11.8K images(The annotation and data format must be the same as coc | Checkpoint | 242M (.ckpt file) | | Scripts | | -### Inference Performance +### Inference Performance On Ascend CenterNet on validation(5K images) and test-dev(40K images) diff --git a/model_zoo/research/cv/centernet/eval.py b/model_zoo/research/cv/centernet/eval.py index 369ba3a524..820491fabf 100644 --- a/model_zoo/research/cv/centernet/eval.py +++ b/model_zoo/research/cv/centernet/eval.py @@ -36,6 +36,8 @@ from src.config import dataset_config, net_config, eval_config _current_dir = os.path.dirname(os.path.realpath(__file__)) parser = argparse.ArgumentParser(description='CenterNet evaluation') +parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'], + help='device where the code will be implemented. 
(Default: Ascend)') parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path") parser.add_argument("--data_dir", type=str, default="", help="Dataset directory, " @@ -52,15 +54,20 @@ def predict(): ''' Predict function ''' - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) + context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) + if args_opt.device_target == "Ascend": + context.set_context(device_id=args_opt.device_id) + enable_nms_fp16 = True + else: + enable_nms_fp16 = False logger.info("Begin creating {} dataset".format(args_opt.run_mode)) coco = COCOHP(dataset_config, run_mode=args_opt.run_mode, net_opt=net_config, enable_visual_image=(args_opt.visual_image == "true"), save_path=args_opt.save_result_dir,) - coco.init(args_opt.data_dir, keep_res=eval_config.keep_res, flip_test=eval_config.flip_test) + coco.init(args_opt.data_dir, keep_res=eval_config.keep_res) dataset = coco.create_eval_dataset() - net_for_eval = CenterNetMultiPoseEval(net_config, eval_config.flip_test, eval_config.K) + net_for_eval = CenterNetMultiPoseEval(net_config, eval_config.K, enable_nms_fp16) net_for_eval.set_train(False) param_dict = load_checkpoint(args_opt.load_checkpoint_path) @@ -103,9 +110,7 @@ def predict(): print("Image {}/{} id: {} cost time {} ms".format(index, total_nums, image_id, (end - start) * 1000.)) # post-process - soft_nms = eval_config.soft_nms or len(eval_config.multi_scales) > 0 - detections = merge_outputs(detections, soft_nms) - + detections = merge_outputs(detections, eval_config.soft_nms) # get prediction result pred_json = convert_eval_format(detections, image_id) gt_image_info = coco.coco.loadImgs([image_id]) diff --git a/model_zoo/research/cv/centernet/export.py b/model_zoo/research/cv/centernet/export.py index 1003c216c9..cf63d4973d 100644 --- 
a/model_zoo/research/cv/centernet/export.py +++ b/model_zoo/research/cv/centernet/export.py @@ -31,7 +31,7 @@ args = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id) if __name__ == '__main__': - net = CenterNetMultiPoseEval(net_config, eval_config.flip_test, eval_config.K) + net = CenterNetMultiPoseEval(net_config, eval_config.K) net.set_train(False) param_dict = load_checkpoint(export_config.ckpt_file) diff --git a/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/get_distribute_train_cmd.py b/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/get_distribute_train_cmd.py index df66c98728..95070dba75 100644 --- a/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/get_distribute_train_cmd.py +++ b/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/get_distribute_train_cmd.py @@ -39,8 +39,7 @@ def parse_args(): parser.add_argument("--hyper_parameter_config_dir", type=str, default="", help="Hyper Parameter config path, it is better to use absolute path") parser.add_argument("--mindrecord_dir", type=str, default="", help="Mindrecord dataset directory") - parser.add_argument("--mindrecord_prefix", type=str, default="coco_hp.train.mind", - help="Prefix of MindRecord dataset filename.") + parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path") parser.add_argument("--hccl_config_dir", type=str, default="", help="Hccl config path, it is better to use absolute path") parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh", @@ -72,7 +71,7 @@ def distribute_train(): run_script = args.run_script_dir mindrecord_dir = args.mindrecord_dir - mindrecord_prefix = args.mindrecord_prefix + load_checkpoint_path = args.load_checkpoint_path cf = configparser.ConfigParser() cf.read(args.hyper_parameter_config_dir) cfg = dict(cf.items("config")) @@ -151,7 +150,7 @@ def distribute_train(): " 
'device_num' or 'mindrecord_dir'! ") run_cmd += opt run_cmd += " --mindrecord_dir=" + mindrecord_dir - run_cmd += " --mindrecord_prefix=" + mindrecord_prefix + run_cmd += " --load_checkpoint_path=" + load_checkpoint_path run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + str(rank_size) + ' >./training_log.txt 2>&1 &' diff --git a/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/hyper_parameter_config.ini b/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/hyper_parameter_config.ini index 7f0fa64131..ddf502b700 100644 --- a/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/hyper_parameter_config.ini +++ b/model_zoo/research/cv/centernet/scripts/ascend_distributed_launcher/hyper_parameter_config.ini @@ -5,7 +5,6 @@ enable_save_ckpt=true do_shuffle=true enable_data_sink=true data_sink_steps=50 -load_checkpoint_path="" save_checkpoint_path=./ save_checkpoint_steps=3000 save_checkpoint_num=1 diff --git a/model_zoo/research/cv/centernet/scripts/run_distributed_train_ascend.sh b/model_zoo/research/cv/centernet/scripts/run_distributed_train_ascend.sh index 7e4242f1f1..08115619db 100644 --- a/model_zoo/research/cv/centernet/scripts/run_distributed_train_ascend.sh +++ b/model_zoo/research/cv/centernet/scripts/run_distributed_train_ascend.sh @@ -14,21 +14,26 @@ # limitations under the License. # ============================================================================ -echo "==============================================================================================================" -echo "Please run the scipt as: " -echo "bash run_distributed_train_ascend.sh DATA_DIR MINDRECORD_DIR RANK_TABLE_FILE" -echo "for example: bash run_distributed_train_ascend.sh /path/dataset /path/mindrecord /path/hccl.json" -echo "It is better to use absolute path." 
+echo "================================================================================================================" +echo "Please run the script as: " +echo "bash run_distributed_train_ascend.sh MINDRECORD_DIR LOAD_CHECKPOINT_PATH RANK_TABLE_FILE" +echo "for example: bash run_distributed_train_ascend.sh /path/mindrecord_dataset /path/load_ckpt /path/hccl.json" +echo "if no ckpt, just run: bash run_distributed_train_ascend.sh /path/mindrecord_dataset \"\" /path/hccl.json" +echo "It is better to use the absolute path." echo "For hyper parameter, please note that you should customize the scripts: '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' " -echo "==============================================================================================================" +echo "================================================================================================================" CUR_DIR=`pwd` +MINDRECORD_DIR=$1 +LOAD_CHECKPOINT_PATH=$2 +HCCL_RANK_FILE=$3 python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_train_cmd.py \ --run_script_dir=${CUR_DIR}/train.py \ --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \ - --mindrecord_dir=$1 \ - --hccl_config_dir=$2 \ + --mindrecord_dir=$MINDRECORD_DIR \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ + --hccl_config_dir=$HCCL_RANK_FILE \ --hccl_time_out=1200 \ --cmd_file=distributed_cmd.sh diff --git a/model_zoo/research/cv/centernet/scripts/run_standalone_eval_ascend.sh b/model_zoo/research/cv/centernet/scripts/run_standalone_eval_ascend.sh index ad2c59399d..a3bcdea8d3 100644 --- a/model_zoo/research/cv/centernet/scripts/run_standalone_eval_ascend.sh +++ b/model_zoo/research/cv/centernet/scripts/run_standalone_eval_ascend.sh @@ -16,11 +16,14 @@ echo "==============================================================================================================" echo "Please run the scipt as: " -echo "bash 
run_standalone_eval_ascend.sh DEVICE_ID" -echo "for example: bash run_standalone_eval_ascend.sh 0" +echo "bash run_standalone_eval_ascend.sh DEVICE_ID RUN_MODE DATA_DIR LOAD_CHECKPOINT_PATH" +echo "for example of validation: bash run_standalone_eval_ascend.sh 0 val /path/coco_dataset /path/load_ckpt" +echo "for example of test: bash run_standalone_eval_ascend.sh 0 test /path/coco_dataset /path/load_ckpt" echo "==============================================================================================================" - DEVICE_ID=$1 +RUN_MODE=$2 +DATA_DIR=$3 +LOAD_CHECKPOINT_PATH=$4 mkdir -p ms_log PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) CUR_DIR=`pwd` @@ -42,10 +45,11 @@ else fi python ${PROJECT_DIR}/../eval.py \ + --device_target=Ascend \ --device_id=$DEVICE_ID \ - --load_checkpoint_path="" \ - --data_dir="" \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ + --data_dir=$DATA_DIR \ + --run_mode=$RUN_MODE \ --visual_image=true \ --enable_eval=true \ - --save_result_dir="" \ - --run_mode=val > eval_log.txt 2>&1 & \ No newline at end of file + --save_result_dir=./ > eval_log.txt 2>&1 & diff --git a/model_zoo/research/cv/centernet/scripts/run_standalone_eval_cpu.sh b/model_zoo/research/cv/centernet/scripts/run_standalone_eval_cpu.sh new file mode 100644 index 0000000000..d3efaf0ec1 --- /dev/null +++ b/model_zoo/research/cv/centernet/scripts/run_standalone_eval_cpu.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the scipt as: " +echo "bash run_standalone_eval_cpu.sh RUN_MODE DATA_DIR LOAD_CHECKPOINT_PATH" +echo "for example of validation: bash run_standalone_eval_cpu.sh val /path/coco_dataset /path/load_ckpt" +echo "for example of test: bash run_standalone_eval_cpu.sh test /path/coco_dataset /path/load_ckpt" +echo "==============================================================================================================" +RUN_MODE=$1 +DATA_DIR=$2 +LOAD_CHECKPOINT_PATH=$3 +mkdir -p ms_log +PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +CUR_DIR=`pwd` +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 + +# install nms module from third party +if python -c "import nms" > /dev/null 2>&1 +then + echo "NMS module already exits, no need reinstall." +else + echo "NMS module was not found, install it now..." 
+ git clone https://github.com/xingyizhou/CenterNet.git + cd CenterNet/src/lib/external/ + make + python setup.py install + cd - + rm -rf CenterNet +fi + +python ${PROJECT_DIR}/../eval.py \ + --device_target=CPU \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ + --data_dir=$DATA_DIR \ + --run_mode=$RUN_MODE \ + --visual_image=true \ + --enable_eval=true \ + --save_result_dir=./ > eval_log.txt 2>&1 & diff --git a/model_zoo/research/cv/centernet/scripts/run_standalone_train_ascend.sh b/model_zoo/research/cv/centernet/scripts/run_standalone_train_ascend.sh index ff2f7e7aef..b97a8cf255 100644 --- a/model_zoo/research/cv/centernet/scripts/run_standalone_train_ascend.sh +++ b/model_zoo/research/cv/centernet/scripts/run_standalone_train_ascend.sh @@ -16,12 +16,14 @@ echo "==============================================================================================================" echo "Please run the scipt as: " -echo "bash run_standalone_pretrain_ascend.sh DEVICE_ID EPOCH_SIZE" -echo "for example: bash run_standalone_pretrain_ascend.sh 0 350" +echo "bash run_standalone_train_ascend.sh DEVICE_ID MINDRECORD_DIR LOAD_CHECKPOINT_PATH" +echo "for example: bash run_standalone_train_ascend.sh 0 /path/mindrecord_dataset /path/load_ckpt" +echo "if no ckpt, just run: bash run_standalone_train_ascend.sh 0 /path/mindrecord_dataset \"\" " echo "==============================================================================================================" DEVICE_ID=$1 -EPOCH_SIZE=$2 +MINDRECORD_DIR=$2 +LOAD_CHECKPOINT_PATH=$3 mkdir -p ms_log PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) @@ -33,16 +35,16 @@ python ${PROJECT_DIR}/../train.py \ --distribute=false \ --need_profiler=false \ --profiler_path=./profiler \ - --epoch_size=$EPOCH_SIZE \ --device_id=$DEVICE_ID \ --enable_save_ckpt=true \ --do_shuffle=true \ --enable_data_sink=true \ --data_sink_steps=50 \ - --load_checkpoint_path="" \ + --epoch_size=350 \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ 
--save_checkpoint_steps=10000 \ --save_checkpoint_num=1 \ - --mindrecord_dir="" \ + --mindrecord_dir=$MINDRECORD_DIR \ --mindrecord_prefix="coco_hp.train.mind" \ --visual_image=false \ --save_result_dir="" > training_log.txt 2>&1 & \ No newline at end of file diff --git a/model_zoo/research/cv/centernet/scripts/run_standalone_train_cpu.sh b/model_zoo/research/cv/centernet/scripts/run_standalone_train_cpu.sh new file mode 100644 index 0000000000..d9117f38e1 --- /dev/null +++ b/model_zoo/research/cv/centernet/scripts/run_standalone_train_cpu.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +echo "==============================================================================================================" +echo "Please run the scipt as: " +echo "bash run_standalone_train_cpu.sh MINDRECORD_DIR LOAD_CHECKPOINT_PATH" +echo "for example: bash run_standalone_train_cpu.sh /path/mindrecord_dataset /path/load_ckpt" +echo "if no ckpt, just run: bash run_standalone_train_cpu.sh /path/mindrecord_dataset \"\" " +echo "==============================================================================================================" + +MINDRECORD_DIR=$1 +LOAD_CHECKPOINT_PATH=$2 + +mkdir -p ms_log +PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) +CUR_DIR=`pwd` +export GLOG_log_dir=${CUR_DIR}/ms_log +export GLOG_logtostderr=0 + +python ${PROJECT_DIR}/../train.py \ + --device_target=CPU \ + --enable_save_ckpt=true \ + --do_shuffle=true \ + --epoch_size=1 \ + --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \ + --save_checkpoint_steps=1000 \ + --save_checkpoint_num=1 \ + --mindrecord_dir=$MINDRECORD_DIR \ + --mindrecord_prefix="coco_hp.train.mind" \ + --visual_image=false \ + --save_result_dir="" > training_log.txt 2>&1 & \ No newline at end of file diff --git a/model_zoo/research/cv/centernet/src/__init__.py b/model_zoo/research/cv/centernet/src/__init__.py index 00522dc2e5..e0c5fa0d19 100644 --- a/model_zoo/research/cv/centernet/src/__init__.py +++ b/model_zoo/research/cv/centernet/src/__init__.py @@ -15,7 +15,7 @@ """CenterNet Init.""" from .centernet_pose import GatherMultiPoseFeatureCell, CenterNetMultiPoseLossCell, \ - CenterNetWithLossScaleCell, CenterNetMultiPoseEval + CenterNetWithLossScaleCell, CenterNetMultiPoseEval, CenterNetWithoutLossScaleCell from .dataset import COCOHP from .visual import visual_allimages, visual_image from .decode import MultiPoseDecode @@ -23,6 +23,7 @@ from .post_process import convert_eval_format, to_float, resize_detection, post_ __all__ = [ 
"GatherMultiPoseFeatureCell", "CenterNetMultiPoseLossCell", "CenterNetWithLossScaleCell", \ - "CenterNetMultiPoseEval", "COCOHP", "visual_allimages", "visual_image", "MultiPoseDecode", \ - "convert_eval_format", "to_float", "resize_detection", "post_process", "merge_outputs" + "CenterNetMultiPoseEval", "CenterNetWithoutLossScaleCell", "COCOHP", "visual_allimages", \ + "visual_image", "MultiPoseDecode", "convert_eval_format", "to_float", "resize_detection", \ + "post_process", "merge_outputs" ] diff --git a/model_zoo/research/cv/centernet/src/centernet_pose.py b/model_zoo/research/cv/centernet/src/centernet_pose.py index 0a49dc8cb2..c6c560e6a7 100644 --- a/model_zoo/research/cv/centernet/src/centernet_pose.py +++ b/model_zoo/research/cv/centernet/src/centernet_pose.py @@ -197,6 +197,46 @@ class CenterNetMultiPoseLossCell(nn.Cell): return total_loss +class CenterNetWithoutLossScaleCell(nn.Cell): + """ + Encapsulation class of centernet training. + + Append an optimizer to the training network after that the construct + function can be called to create the backward graph. + + Args: + network (Cell): The training network. Note that loss function should have been added. + optimizer (Optimizer): Optimizer for updating the weights. + + Returns: + Tuple of Tensors, the loss, overflow flag and scaling sens of the network. 
+ """ + def __init__(self, network, optimizer): + super(CenterNetWithoutLossScaleCell, self).__init__(auto_prefix=False) + self.image = ImagePreProcess() + self.network = network + self.network.set_grad() + self.weights = optimizer.parameters + self.optimizer = optimizer + self.grad = ops.GradOperation(get_by_list=True, sens_param=False) + + @ops.add_flags(has_effect=True) + def construct(self, image, hm, reg_mask, ind, wh, kps, kps_mask, reg, + hm_hp, hp_offset, hp_ind, hp_mask): + """Defines the computation performed.""" + image = self.image(image) + weights = self.weights + loss = self.network(image, hm, reg_mask, ind, wh, kps, kps_mask, reg, + hm_hp, hp_offset, hp_ind, hp_mask) + + grads = self.grad(self.network, weights)(image, hm, reg_mask, ind, wh, kps, + kps_mask, reg, hm_hp, hp_offset, + hp_ind, hp_mask) + succ = self.optimizer(grads) + ret = loss + return ops.depend(ret, succ) + + class CenterNetWithLossScaleCell(nn.Cell): """ Encapsulation class of centernet training. @@ -279,17 +319,16 @@ class CenterNetMultiPoseEval(nn.Cell): Args: net_config: The config info of CenterNet network. - flip_test(bool): Flip data augmentation or not. Default: False. K(number): Max number of output objects. Default: 100. + enable_nms_fp16(bool): Use float16 data for max_pool, adaption for CPU. Default: True. 
Returns: Tensor, detection of images(bboxes, score, keypoints and category id of each objects) """ - def __init__(self, net_config, flip_test=False, K=100): + def __init__(self, net_config, K=100, enable_nms_fp16=True): super(CenterNetMultiPoseEval, self).__init__() self.network = GatherMultiPoseFeatureCell(net_config) - self.decode = MultiPoseDecode(net_config, flip_test, K) - self.flip_test = flip_test + self.decode = MultiPoseDecode(net_config, K, enable_nms_fp16) self.shape = ops.Shape() self.reshape = ops.Reshape() diff --git a/model_zoo/research/cv/centernet/src/config.py b/model_zoo/research/cv/centernet/src/config.py index 49f105d0b2..573d8212f4 100644 --- a/model_zoo/research/cv/centernet/src/config.py +++ b/model_zoo/research/cv/centernet/src/config.py @@ -104,8 +104,7 @@ train_config = edict({ eval_config = edict({ - 'flip_test': False, - 'soft_nms': False, + 'soft_nms': True, 'keep_res': True, 'multi_scales': [1.0], 'pad': 31, diff --git a/model_zoo/research/cv/centernet/src/dataset.py b/model_zoo/research/cv/centernet/src/dataset.py index 0b27397d47..b3f5f8eb61 100644 --- a/model_zoo/research/cv/centernet/src/dataset.py +++ b/model_zoo/research/cv/centernet/src/dataset.py @@ -17,7 +17,6 @@ Data operations, will be used in train.py """ import os -import copy import math import argparse import cv2 @@ -66,7 +65,7 @@ class COCOHP(ds.Dataset): if not os.path.exists(self.save_path): os.makedirs(self.save_path) - def init(self, data_dir, keep_res=False, flip_test=False): + def init(self, data_dir, keep_res=False): """initailize additional info""" logger.info('Initializing coco 2017 {} data.'.format(self.run_mode)) if not os.path.isdir(data_dir): @@ -94,7 +93,6 @@ class COCOHP(ds.Dataset): self.images = image_ids self.num_samples = len(self.images) self.keep_res = keep_res - self.flip_test = flip_test if self.run_mode != "train": self.pad = 31 logger.info('Loaded {} {} samples'.format(self.run_mode, self.num_samples)) @@ -167,7 +165,7 @@ class 
COCOHP(ds.Dataset): ret = (img, image_id) return ret - def pre_process_for_test(self, image, img_id, scale, meta=None): + def pre_process_for_test(self, image, img_id, scale): """image pre-process for evaluation""" b, h, w, ch = image.shape assert b == 1, "only single image was supported here" @@ -191,17 +189,8 @@ class COCOHP(ds.Dataset): flags=cv2.INTER_LINEAR) inp_img = (inp_image.astype(np.float32) / 255. - self.data_opt.mean) / self.data_opt.std - h, w, ch = inp_img.shape - images = copy.deepcopy(inp_img) - if self.flip_test: - flip_image = inp_img[:, ::-1, :] - inp_img = inp_img.reshape((1, h, w, ch)) - flip_image = flip_image.reshape((1, h, w, ch)) - # (2, h, w, c) - images = np.concatenate((inp_img, flip_image), axis=0) - else: - images = images.reshape((1, h, w, ch)) - images = images.transpose(0, 3, 1, 2) + eval_image = inp_img.reshape((1,) + inp_img.shape) + eval_image = eval_image.transpose(0, 3, 1, 2) meta = {'c': c, 's': s, 'out_height': inp_height // self.net_opt.down_ratio, @@ -244,7 +233,7 @@ class COCOHP(ds.Dataset): image_name = "gt_" + self.run_mode + "_image_" + str(img_id) + "_scale_" + str(scale) + ".png" cv2.imwrite("{}/{}".format(self.save_path, image_name), inp_image) - return images, meta + return eval_image, meta def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id): """image pre-process and augmentation""" diff --git a/model_zoo/research/cv/centernet/src/decode.py b/model_zoo/research/cv/centernet/src/decode.py index 2b33a26387..c41a644d31 100644 --- a/model_zoo/research/cv/centernet/src/decode.py +++ b/model_zoo/research/cv/centernet/src/decode.py @@ -30,25 +30,32 @@ class NMS(nn.Cell): Args: kernel(int): Maxpooling kernel size. Default: 3. + enable_nms_fp16(bool): Use float16 data for max_pool, adaption for CPU. Default: True. Returns: Tensor, heatmap after non-maximum suppression. 
""" - def __init__(self, kernel=3): + def __init__(self, kernel=3, enable_nms_fp16=True): super(NMS, self).__init__() self.pad = (kernel - 1) // 2 self.cast = ops.Cast() self.dtype = ops.DType() self.equal = ops.Equal() self.max_pool = nn.MaxPool2d(kernel, stride=1, pad_mode="same") + self.enable_fp16 = enable_nms_fp16 def construct(self, heat): + """Non-maximum suppression""" dtype = self.dtype(heat) - heat = self.cast(heat, mstype.float16) - heat_max = self.max_pool(heat) - keep = self.equal(heat, heat_max) - keep = self.cast(keep, dtype) - heat = self.cast(heat, dtype) + if self.enable_fp16: + heat = self.cast(heat, mstype.float16) + heat_max = self.max_pool(heat) + keep = self.equal(heat, heat_max) + keep = self.cast(keep, dtype) + heat = self.cast(heat, dtype) + else: + heat_max = self.max_pool(heat) + keep = self.equal(heat, heat_max) heat = heat * keep return heat @@ -127,18 +134,24 @@ class GatherFeatureByInd(nn.Cell): """ Gather features by index - Args: None + Args: + enable_cpu_gather (bool): Use cpu operator GatherD to gather feature or not, adaption for CPU. Default: True. 
Returns: Tensor """ - def __init__(self): + def __init__(self, enable_cpu_gatherd=True): super(GatherFeatureByInd, self).__init__() self.tile = ops.Tile() self.shape = ops.Shape() self.concat = ops.Concat(axis=1) self.reshape = ops.Reshape() - self.gather_nd = ops.GatherNd() + self.enable_cpu_gatherd = enable_cpu_gatherd + if self.enable_cpu_gatherd: + self.gather_nd = ops.GatherD() + self.expand_dims = ops.ExpandDims() + else: + self.gather_nd = ops.GatherNd() def construct(self, feat, ind): """gather by index""" @@ -147,18 +160,24 @@ class GatherFeatureByInd(nn.Cell): b, J, K = self.shape(ind) feat = self.reshape(feat, (b, J, K, -1)) _, _, _, N = self.shape(feat) - ind = self.reshape(ind, (-1, 1)) - ind_b = nn.Range(0, b * J, 1)() - ind_b = self.reshape(ind_b, (-1, 1)) - ind_b = self.tile(ind_b, (1, K)) - ind_b = self.reshape(ind_b, (-1, 1)) - index = self.concat((ind_b, ind)) - # (b, N, 2) - index = self.reshape(index, (-1, K, 2)) - # (b, N, c) - feat = self.reshape(feat, (-1, K, N)) - feat = self.gather_nd(feat, index) - feat = self.reshape(feat, (b, J, K, -1)) + if self.enable_cpu_gatherd: + # (b, J, K, N) + index = self.expand_dims(ind, -1) + index = self.tile(index, (1, 1, 1, N)) + feat = self.gather_nd(feat, 2, index) + else: + ind = self.reshape(ind, (-1, 1)) + ind_b = nn.Range(0, b * J, 1)() + ind_b = self.reshape(ind_b, (-1, 1)) + ind_b = self.tile(ind_b, (1, K)) + ind_b = self.reshape(ind_b, (-1, 1)) + index = self.concat((ind_b, ind)) + # (b*J, K, 2) + index = self.reshape(index, (-1, K, 2)) + # (b*J, K) + feat = self.reshape(feat, (-1, K, N)) + feat = self.gather_nd(feat, index) + feat = self.reshape(feat, (b, J, K, -1)) return feat @@ -285,17 +304,16 @@ class MultiPoseDecode(nn.Cell): Args: net_config(edict): config info for CenterNet network. - flip_test(bool): flip test of not. Default: False. K(int): maximum objects number. Default: 100. + enable_nms_fp16(bool): Use float16 data for max_pool, adaption for CPU. Default: True. 
Returns: Tensor, multi-objects detections. """ - def __init__(self, net_config, flip_test=False, K=100): + def __init__(self, net_config, K=100, enable_nms_fp16=True): super(MultiPoseDecode, self).__init__() self.K = K - self.flip_test = flip_test - self.nms = NMS() + self.nms = NMS(enable_nms_fp16=enable_nms_fp16) self.shape = ops.Shape() self.gather_topk = GatherTopK() self.gather_topk_channel = GatherTopKChannel() @@ -336,8 +354,6 @@ class MultiPoseDecode(nn.Cell): def construct(self, feature): """gather detections""" heat = feature[0] - if self.flip_test: - heat = self.flip_tensor(heat) K = self.K b, _, _, _ = self.shape(heat) heat = self.nms(heat) @@ -346,8 +362,6 @@ class MultiPoseDecode(nn.Cell): xs = self.reshape(xs, (b, K, 1)) kps = feature[1] - if self.flip_test: - kps = self.flip_lr_off(kps) num_joints = self.shape(kps)[1] / 2 # (b, K, num_joints*2) kps = self.trans_gather_feature(kps, inds) @@ -365,15 +379,11 @@ class MultiPoseDecode(nn.Cell): kps = self.reshape(kps, (b, K, num_joints * 2)) wh = feature[2] - if self.flip_test: - wh = self.flip_tensor(wh) wh = self.trans_gather_feature(wh, inds) ws, hs = self.half(wh) if self.reg_offset: reg = feature[self.reg_ind] - if self.flip_test: - reg, _ = self.half_first(reg) reg = self.trans_gather_feature(reg, inds) reg = self.reshape(reg, (b, K, 2)) reg_w, reg_h = self.half(reg) @@ -387,16 +397,12 @@ class MultiPoseDecode(nn.Cell): if self.hm_hp: hm_hp = feature[self.hm_hp_ind] - if self.flip_test: - hm_hp = self.flip_lr(hm_hp) hm_hp = self.nms(hm_hp) # (b, num_joints, K) hm_score, hm_inds, hm_ys, hm_xs = self.gather_topk_channel(hm_hp, K=K) if self.reg_hp_offset: hp_offset = feature[self.reg_hp_ind] - if self.flip_test: - hp_offset, _ = self.half_first(hp_offset) hp_offset = self.trans_gather_feature(hp_offset, self.reshape(hm_inds, (b, -1))) hp_offset = self.reshape(hp_offset, (b, num_joints, K, 2)) hp_ws, hp_hs = self.half(hp_offset) diff --git a/model_zoo/research/cv/centernet/src/utils.py 
b/model_zoo/research/cv/centernet/src/utils.py index 307003d2d0..f063ff1ab7 100644 --- a/model_zoo/research/cv/centernet/src/utils.py +++ b/model_zoo/research/cv/centernet/src/utils.py @@ -17,6 +17,7 @@ Functional Cells to be used. """ import math +import time import numpy as np import mindspore.nn as nn import mindspore.ops as ops @@ -119,33 +120,46 @@ class GatherFeature(nn.Cell): """ Gather feature at specified position - Args: None + Args: + enable_cpu_gather (bool): Use operator GatherD instead of GatherNd to gather features, adaptation for CPU. Default: True. Returns: Tensor, feature at spectified position """ - def __init__(self): + def __init__(self, enable_cpu_gather=True): super(GatherFeature, self).__init__() self.tile = ops.Tile() self.shape = ops.Shape() self.concat = ops.Concat(axis=1) self.reshape = ops.Reshape() - self.gather_nd = ops.GatherNd() + self.enable_cpu_gather = enable_cpu_gather + if self.enable_cpu_gather: + self.gather_nd = ops.GatherD() + self.expand_dims = ops.ExpandDims() + else: + self.gather_nd = ops.GatherNd() def construct(self, feat, ind): """gather by specified index""" - # (b, N)->(b*N, 1) - b, N = self.shape(ind) - ind = self.reshape(ind, (-1, 1)) - ind_b = nn.Range(0, b, 1)() - ind_b = self.reshape(ind_b, (-1, 1)) - ind_b = self.tile(ind_b, (1, N)) - ind_b = self.reshape(ind_b, (-1, 1)) - index = self.concat((ind_b, ind)) - # (b, N, 2) - index = self.reshape(index, (b, N, -1)) - # (b, N, c) - feat = self.gather_nd(feat, index) + if self.enable_cpu_gather: + _, _, c = self.shape(feat) + # (b, N, c) + index = self.expand_dims(ind, -1) + index = self.tile(index, (1, 1, c)) + feat = self.gather_nd(feat, 1, index) + else: + # (b, N)->(b*N, 1) + b, N = self.shape(ind) + ind = self.reshape(ind, (-1, 1)) + ind_b = nn.Range(0, b, 1)() + ind_b = self.reshape(ind_b, (-1, 1)) + ind_b = self.tile(ind_b, (1, N)) + ind_b = self.reshape(ind_b, (-1, 1)) + index = self.concat((ind_b, ind)) + # (b, N, 2) + index = self.reshape(index, (b, N, -1)) + # (b, 
N, c) + feat = self.gather_nd(feat, index) return feat @@ -477,11 +491,19 @@ class LossCallBack(Callback): Args: dataset_size (int): Dataset size. Default: -1. + enable_static_time (bool): Print the time cost of each step, adaptation for CPU. Default: False. """ - def __init__(self, dataset_size=-1): + def __init__(self, dataset_size=-1, enable_static_time=False): super(LossCallBack, self).__init__() self._dataset_size = dataset_size + self._enable_static_time = enable_static_time + + def step_begin(self, run_context): + """ + Get beginning time of each step + """ + self._begin_time = time.time() def step_end(self, run_context): """ @@ -493,11 +515,19 @@ class LossCallBack(Callback): if percent == 0: percent = 1 epoch_num -= 1 - print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}" - .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs))) + if self._enable_static_time: + cur_time = time.time() + time_per_step = cur_time - self._begin_time + print("epoch: {}, current epoch percent: {}, step: {}, time per step: {} s, outputs are {}" + .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, "%.3f" % time_per_step, + str(cb_params.net_outputs)), flush=True) + else: + print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}" + .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, + str(cb_params.net_outputs)), flush=True) else: print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num, - str(cb_params.net_outputs))) + str(cb_params.net_outputs)), flush=True) class CenterNetPolynomialDecayLR(LearningRateSchedule): diff --git a/model_zoo/research/cv/centernet/train.py b/model_zoo/research/cv/centernet/train.py index 46784be702..11a4066e18 100644 --- a/model_zoo/research/cv/centernet/train.py +++ b/model_zoo/research/cv/centernet/train.py @@ -31,12 +31,15 @@ from mindspore.common import set_seed from mindspore.profiler import Profiler from src.dataset 
import COCOHP from src import CenterNetMultiPoseLossCell, CenterNetWithLossScaleCell +from src import CenterNetWithoutLossScaleCell from src.utils import LossCallBack, CenterNetPolynomialDecayLR, CenterNetMultiEpochsDecayLR from src.config import dataset_config, net_config, train_config _current_dir = os.path.dirname(os.path.realpath(__file__)) parser = argparse.ArgumentParser(description='CenterNet training') +parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'], + help='device where the code will be implemented. (Default: Ascend)') parser.add_argument("--distribute", type=str, default="false", choices=["true", "false"], help="Run distribute, default is false.") parser.add_argument("--need_profiler", type=str, default="false", choices=["true", "false"], @@ -125,26 +128,32 @@ def _get_optimizer(network, dataset_size): def train(): """training CenterNet""" - context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id) - context.set_context(enable_auto_mixed_precision=False) + context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target) context.set_context(reserve_class_name_in_scope=False) context.set_context(save_graphs=False) ckpt_save_dir = args_opt.save_checkpoint_path - if args_opt.distribute == "true": - D.init() - device_num = args_opt.device_num - rank = args_opt.device_id % device_num - ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/' - - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, - device_num=device_num) - _set_parallel_all_reduce_split() - else: - rank = 0 - device_num = 1 + rank = 0 + device_num = 1 num_workers = 8 + if args_opt.device_target == "Ascend": + context.set_context(enable_auto_mixed_precision=False) + context.set_context(device_id=args_opt.device_id) + if args_opt.distribute == "true": + D.init() + device_num = 
args_opt.device_num + rank = args_opt.device_id % device_num + ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/' + + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, + device_num=device_num) + _set_parallel_all_reduce_split() + else: + args_opt.distribute = "false" + args_opt.need_profiler = "false" + args_opt.enable_data_sink = "false" + # Start create dataset! # mindrecord files will be generated at args_opt.mindrecord_dir such as centernet.mindrecord0, 1, ... file_num. logger.info("Begin creating dataset for CenterNet") @@ -167,7 +176,8 @@ def train(): optimizer = _get_optimizer(net_with_loss, dataset_size) - callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(dataset_size)] + enable_static_time = args_opt.device_target == "CPU" + callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(dataset_size, enable_static_time)] if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0: config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) @@ -178,12 +188,13 @@ def train(): if args_opt.load_checkpoint_path: param_dict = load_checkpoint(args_opt.load_checkpoint_path) load_param_into_net(net_with_loss, param_dict) - - net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer, - sens=train_config.loss_scale_value) + if args_opt.device_target == "Ascend": + net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer, + sens=train_config.loss_scale_value) + else: + net_with_grads = CenterNetWithoutLossScaleCell(net_with_loss, optimizer=optimizer) model = Model(net_with_grads) - model.train(new_repeat_count, dataset, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)