DeepLabV3: add support for CPU training

5 years ago · 2599f14916
--- a/model_zoo/official/cv/deeplabv3/scripts/run_standalone_train_cpu.sh
+++ b/model_zoo/official/cv/deeplabv3/scripts/run_standalone_train_cpu.sh
@@ -0,0 +1,45 @@
 #!/bin/bash
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================

 # Launch a single-process DeepLabV3 training run on CPU in the background.
 #
 # Usage: edit the /PATH/TO/... placeholders below, then run this script.
 # All training output (stdout and stderr) is written to
 # ${train_path}/device${DEVICE_ID}/log; checkpoints go to ${train_path}/ckpt.

 # DEVICE_ID also names the per-device working directory created below.
 export DEVICE_ID=0
 # NOTE(review): presumably silences device-side log printing to stdout;
 # kept as in the other launch scripts -- confirm whether it matters on CPU.
 export SLOG_PRINT_TO_STDOUT=0
 train_path=/PATH/TO/EXPERIMENTS_DIR
 train_code_path=/PATH/TO/MODEL_ZOO_CODE

 # Start from a clean experiment directory.
 # WARNING: everything under ${train_path} from a previous run is deleted.
 # The ':?' expansion aborts the script instead of running 'rm -rf' on an
 # empty path if train_path is ever unset or blank.
 if [ -d "${train_path}" ]; then
   rm -rf -- "${train_path:?}"
 fi

 # Create the working directory and the checkpoint directory; stop here if
 # either cannot be created rather than launching training into a bad layout.
 mkdir -p -- "${train_path}/device${DEVICE_ID}" "${train_path}/ckpt" || exit
 cd -- "${train_path}/device${DEVICE_ID}" || exit

 # Run training in the background; the relative 'log' file lands in the
 # device directory entered above.
 python "${train_code_path}/train.py" --data_file=/PATH/TO/MINDRECORD_NAME \
   --device_target=CPU \
   --train_dir="${train_path}/ckpt" \
   --train_epochs=200 \
   --batch_size=32 \
   --crop_size=513 \
   --base_lr=0.015 \
   --lr_type=cos \
   --min_scale=0.5 \
   --max_scale=2.0 \
   --ignore_label=255 \
   --num_classes=21 \
   --model=deeplab_v3_s16 \
   --ckpt_pre_trained=/PATH/TO/PRETRAIN_MODEL \
   --save_steps=1500 \
   --keep_checkpoint_max=200 >log 2>&1 &
--- a/model_zoo/official/cv/deeplabv3/train.py
+++ b/model_zoo/official/cv/deeplabv3/train.py
@@ -32,8 +32,6 @@ from src.nets import net_factory
 from src.utils import learning_rates

 set_seed(1)
 context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
                    device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))


 class BuildTrainNetwork(nn.Cell):
@@ -77,6 +75,8 @@ def parse_args():
    parser.add_argument('--ckpt_pre_trained', type=str, default='', help='pretrained model')

    # train
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument('--is_distributed', action='store_true', help='distributed training')
    parser.add_argument('--rank', type=int, default=0, help='local rank of distributed')
    parser.add_argument('--group_size', type=int, default=1, help='world size of distributed')
@@ -90,6 +90,12 @@ def parse_args():
 def train():
    args = parse_args()

    if args.device_target == "CPU":
        context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU")
    else:
        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
                            device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))

    # init multicards training
    if args.is_distributed:
        init()
@@ -150,7 +156,8 @@ def train():

    # loss scale
    manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
    model = Model(train_net, optimizer=opt, amp_level="O3", loss_scale_manager=manager_loss_scale)
    amp_level = "O0" if args.device_target == "CPU" else "O3"
    model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale)

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)