diff --git a/model_zoo/official/cv/vgg16/scripts/run_distribute_train.sh b/model_zoo/official/cv/vgg16/scripts/run_distribute_train.sh index ea469391c7..4b64e18ce2 100644 --- a/model_zoo/official/cv/vgg16/scripts/run_distribute_train.sh +++ b/model_zoo/official/cv/vgg16/scripts/run_distribute_train.sh @@ -52,21 +52,25 @@ cpus=`cat /proc/cpuinfo| grep "processor"| wc -l` avg=`expr $cpus \/ $RANK_SIZE` gap=`expr $avg \- 1` +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +src_dir=$script_dir/.. + +start_idx=0 for((i=0;i env.log - taskset -c $cmdopt python train.py --data_path=$2 --device_target="Ascend" --device_id=$i --is_distributed=1 --dataset=$dataset_type &> log & + taskset -c $cmdopt python train.py --data_path=$2 --device_target="Ascend" --device_id=$DEVICE_ID --is_distributed=1 --dataset=$dataset_type &> log & cd .. done diff --git a/model_zoo/official/cv/vgg16/src/config.py b/model_zoo/official/cv/vgg16/src/config.py index 600829be4a..77eb253efd 100644 --- a/model_zoo/official/cv/vgg16/src/config.py +++ b/model_zoo/official/cv/vgg16/src/config.py @@ -47,14 +47,14 @@ cifar_cfg = edict({ # config for vgg16, imagenet2012 imagenet_cfg = edict({ "num_classes": 1000, - "lr": 0.01, + "lr": 0.04, "lr_init": 0.01, "lr_max": 0.1, "lr_epochs": '30,60,90,120', "lr_scheduler": 'cosine_annealing', "warmup_epochs": 0, - "batch_size": 32, - "max_epoch": 150, + "batch_size": 64, + "max_epoch": 90, "momentum": 0.9, "weight_decay": 1e-4, "loss_scale": 1024, diff --git a/model_zoo/official/cv/vgg16/train.py b/model_zoo/official/cv/vgg16/train.py index bae5d0adee..14c2248471 100644 --- a/model_zoo/official/cv/vgg16/train.py +++ b/model_zoo/official/cv/vgg16/train.py @@ -61,7 +61,7 @@ def parse_args(cloud_args=None): parser.add_argument('--lr_gamma', type=float, default=0.1, help='decrease lr by a factor of exponential lr_scheduler') parser.add_argument('--eta_min', type=float, default=0., help='eta_min in cosine_annealing scheduler') - parser.add_argument('--T_max', type=int, default=150, help='T-max in cosine_annealing scheduler') + parser.add_argument('--T_max', type=int, default=90, help='T-max in cosine_annealing scheduler') # logging and checkpoint related parser.add_argument('--log_interval', type=int, default=100, help='logging interval') @@ -140,7 +140,7 @@ if __name__ == '__main__': device_num = args.group_size context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, - gradients_mean=True, all_reduce_fusion_config=[3, 10, 12, 15]) + gradients_mean=True, all_reduce_fusion_config=[2, 18]) else: if args.device_target == "Ascend": context.set_context(device_id=args.device_id)