@@ -14,13 +14,20 @@
 # limitations under the License.
 # ============================================================================
-echo "=============================================================================================================="
+echo "================================================================================================================="
 echo "Please run the script as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: sh run_distribute_train.sh 8 150 coco /data/hccl.json"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_distribute_train.sh 8 350 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.4 as default, if you want other lr, please change the value in this script."
-echo "=============================================================================================================="
+echo "================================================================================================================="
+
+if [ $# != 4 ] && [ $# != 6 ]
+then
+    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [DATASET] \
+[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi
 
 # Before starting distributed training, first create mindrecord files.
 python train.py --only_create_dataset=1
@@ -30,6 +37,8 @@ echo "After running the script, the network runs in the background. The log will
 export RANK_SIZE=$1
 EPOCH_SIZE=$2
 DATASET=$3
+PRE_TRAINED=$5
+PRE_TRAINED_EPOCH_SIZE=$6
 export MINDSPORE_HCCL_CONFIG_PATH=$4
@@ -43,12 +52,29 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
-    python ../train.py \
-    --distribute=1 \
-    --lr=0.4 \
-    --dataset=$DATASET \
-    --device_num=$RANK_SIZE \
-    --device_id=$DEVICE_ID \
-    --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    if [ $# == 4 ]
+    then
+        python ../train.py \
+        --distribute=1 \
+        --lr=0.4 \
+        --dataset=$DATASET \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    fi
+    if [ $# == 6 ]
+    then
+        python ../train.py \
+        --distribute=1 \
+        --lr=0.4 \
+        --dataset=$DATASET \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --pre_trained=$PRE_TRAINED \
+        --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+        --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    fi
     cd ../
 done

@@ -88,6 +88,7 @@ def main():
     parser.add_argument("--epoch_size", type=int, default=70, help="Epoch size, default is 70.")
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
     parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained Checkpoint file path.")
+    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.")
     parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
     parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
     args_opt = parser.parse_args()
@@ -150,17 +151,20 @@ def main():
     ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
     ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=None, config=ckpt_config)
 
-    lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=args_opt.lr,
-                       warmup_epochs=max(args_opt.epoch_size // 20, 1),
-                       total_epochs=args_opt.epoch_size,
-                       steps_per_epoch=dataset_size))
-    opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
-    net = TrainingWrapper(net, opt, loss_scale)
     if args_opt.pre_trained:
+        if args_opt.pre_trained_epoch_size <= 0:
+            raise KeyError("pre_trained_epoch_size must be greater than 0.")
         param_dict = load_checkpoint(args_opt.pre_trained)
         load_param_into_net(net, param_dict)
+    lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
+                       lr_init=0, lr_end=0, lr_max=args_opt.lr,
+                       warmup_epochs=max(350 // 20, 1),
+                       total_epochs=350,
+                       steps_per_epoch=dataset_size))
+    opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
+    net = TrainingWrapper(net, opt, loss_scale)
 
     callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
     model = Model(net)
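
In this hunk, checkpoint loading now happens before the optimizer and TrainingWrapper are built, and the learning-rate schedule is constructed with global_step=args_opt.pre_trained_epoch_size * dataset_size, so a resumed run continues the curve where the checkpoint stopped instead of repeating warmup. Note the schedule length is now pinned to 350 epochs (warmup max(350 // 20, 1) = 17 epochs) regardless of --epoch_size. The repo's get_lr is defined elsewhere; the sketch below only matches the call signature in the diff and assumes a linear-warmup-then-cosine-decay shape, which may differ from the real implementation.

```python
# A plausible sketch of get_lr, NOT the repo's implementation: linear
# warmup from lr_init to lr_max, then cosine decay to lr_end, with
# global_step slicing off steps a pre-trained checkpoint already consumed.
import math
import numpy as np

def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs,
           total_epochs, steps_per_epoch):
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs
    lr_each_step = []
    for step in range(total_steps):
        if step < warmup_steps:
            # linear warmup from lr_init up to lr_max
            lr = lr_init + (lr_max - lr_init) * step / warmup_steps
        else:
            # cosine decay from lr_max down to lr_end
            progress = (step - warmup_steps) / (total_steps - warmup_steps)
            lr = lr_end + (lr_max - lr_end) * 0.5 * (1 + math.cos(math.pi * progress))
        lr_each_step.append(lr)
    # a resumed run starts partway along the same precomputed curve
    return np.array(lr_each_step[global_step:], dtype=np.float32)
```

Under this reading, --pre_trained_epoch_size=200 drops the first 200 * dataset_size rates, landing well past the 17-epoch warmup and directly on the decay portion of the curve.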
@@ -14,18 +14,27 @@
 # limitations under the License.
 # ============================================================================
-echo "=============================================================================================================="
+echo "======================================================================================================================================================="
 echo "Please run the script as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: sh run_distribute_train.sh 8 100 /data/Mindrecord_train /data /data/train.txt /data/hccl.json"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
-echo "=============================================================================================================="
+echo "======================================================================================================================================================="
+
+if [ $# != 6 ] && [ $# != 8 ]
+then
+    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi
 
 EPOCH_SIZE=$2
 MINDRECORD_DIR=$3
 IMAGE_DIR=$4
 ANNO_PATH=$5
+PRE_TRAINED=$7
+PRE_TRAINED_EPOCH_SIZE=$8
 
 # Before starting distributed training, first create mindrecord files.
 python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR \
@@ -51,14 +60,34 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
-    taskset -c $cmdopt python ../train.py \
-    --distribute=1 \
-    --lr=0.005 \
-    --device_num=$RANK_SIZE \
-    --device_id=$DEVICE_ID \
-    --mindrecord_dir=$MINDRECORD_DIR \
-    --image_dir=$IMAGE_DIR \
-    --epoch_size=$EPOCH_SIZE \
-    --anno_path=$ANNO_PATH > log.txt 2>&1 &
+    if [ $# == 6 ]
+    then
+        taskset -c $cmdopt python ../train.py \
+        --distribute=1 \
+        --lr=0.005 \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --mindrecord_dir=$MINDRECORD_DIR \
+        --image_dir=$IMAGE_DIR \
+        --epoch_size=$EPOCH_SIZE \
+        --anno_path=$ANNO_PATH > log.txt 2>&1 &
+    fi
+    if [ $# == 8 ]
+    then
+        taskset -c $cmdopt python ../train.py \
+        --distribute=1 \
+        --lr=0.005 \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --mindrecord_dir=$MINDRECORD_DIR \
+        --image_dir=$IMAGE_DIR \
+        --epoch_size=$EPOCH_SIZE \
+        --pre_trained=$PRE_TRAINED \
+        --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+        --anno_path=$ANNO_PATH > log.txt 2>&1 &
+    fi
     cd ../
 done

@@ -14,10 +14,25 @@
 # limitations under the License.
 # ============================================================================
-echo "=============================================================================================================="
+echo "========================================================================================================================================="
 echo "Please run the script as: "
-echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH"
-echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt"
-echo "=============================================================================================================="
+echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)"
+echo "========================================================================================================================================="
 
-python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
+if [ $# != 5 ] && [ $# != 7 ]
+then
+    echo "Usage: sh run_standalone_train.sh [DEVICE_ID] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi
+
+if [ $# == 5 ]
+then
+    python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
+fi
+
+if [ $# == 7 ]
+then
+    python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7
+fi

@@ -71,6 +71,7 @@ def main():
     parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10")
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
     parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path")
+    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size")
     parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
     parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
     parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
@@ -133,14 +134,19 @@ def main():
     ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
     ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)
 
-    lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=0, global_step=args_opt.epoch_size * dataset_size,
-                       decay_step=1000, decay_rate=0.95, steps=True))
-    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
-    net = TrainingWrapper(net, opt, loss_scale)
     if args_opt.pre_trained:
+        if args_opt.pre_trained_epoch_size <= 0:
+            raise KeyError("pre_trained_epoch_size must be greater than 0.")
         param_dict = load_checkpoint(args_opt.pre_trained)
         load_param_into_net(net, param_dict)
+    total_epoch_size = 60
+    if args_opt.distribute:
+        total_epoch_size = 160
+    lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
+                       global_step=total_epoch_size * dataset_size,
+                       decay_step=1000, decay_rate=0.95, steps=True))
+    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
+    net = TrainingWrapper(net, opt, loss_scale)
 
     callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
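
Here the resume logic is the same, but the schedule is a staircase exponential decay: start_step skips the rates a checkpoint has already consumed, and the schedule length is pinned to 60 epochs (160 when --distribute is set) rather than --epoch_size. Again, get_lr lives elsewhere in the repo; the sketch below matches the call signature in the diff, but its body is an assumption (reading steps=True as "decay in discrete steps").

```python
# A plausible sketch of get_lr, NOT the repo's implementation: staircase
# exponential decay, where start_step skips the part of the schedule a
# pre-trained checkpoint has already trained through.
import numpy as np

def get_lr(learning_rate, start_step, global_step, decay_step, decay_rate,
           steps=False):
    lr_each_step = []
    for step in range(global_step):
        # steps=True: drop by decay_rate once every decay_step steps;
        # otherwise decay smoothly with a fractional exponent.
        exponent = step // decay_step if steps else step / decay_step
        lr_each_step.append(learning_rate * decay_rate ** exponent)
    return np.array(lr_each_step[start_step:], dtype=np.float32)
```

Under this reading, resuming from a 100-epoch checkpoint with dataset_size = 1000 starts at 0.005 * 0.95 ** 100 (about 3e-5) instead of restarting from 0.005.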