
extend HCCL connection timeout and modify the learning-rate schedule

tags/v1.1.0
zhouyaqiang committed 5 years ago
commit 64a3373f31
3 changed files with 6 additions and 4 deletions
1. model_zoo/official/cv/inceptionv3/README.md: +1 -0
2. model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh: +1 -1
3. model_zoo/official/cv/inceptionv3/src/lr_generator.py: +4 -3

model_zoo/official/cv/inceptionv3/README.md (+1 -0)

@@ -133,6 +133,7 @@ sh scripts/run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
 > Notes:
 RANK_TABLE_FILE can refer to [Link](https://www.mindspore.cn/tutorial/training/en/master/advanced_use/distributed_training_ascend.html), and the device_ip can be obtained from [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+For large models like InceptionV3, it is better to export the environment variable `export HCCL_CONNECT_TIMEOUT=600` to extend the HCCL connection-check timeout from the default 120 seconds to 600 seconds. Otherwise, the connection could time out, since compilation time increases with model size.
 
 > This is a processor-core binding operation based on `device_num` and the total number of processor cores. If you do not want this, remove the `taskset` operations in `scripts/run_distribute_train.sh`.
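As a side note on the README change above: the same timeout can also be set from Python before MindSpore initializes device communication. The sketch below is illustrative only and is not part of this commit, which sets the variable via `export` in the launch script; the requirement that it run before communication init is an assumption.

```python
# Minimal sketch: extend the HCCL connection-check timeout programmatically.
# Assumption: this must run before MindSpore initializes distributed
# communication; the commit itself uses `export HCCL_CONNECT_TIMEOUT=600`
# in scripts/run_distribute_train.sh instead.
import os

os.environ["HCCL_CONNECT_TIMEOUT"] = "600"  # seconds; default is 120
```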




model_zoo/official/cv/inceptionv3/scripts/run_distribute_train.sh (+1 -1)

@@ -17,7 +17,7 @@
 DATA_DIR=$2
 export RANK_TABLE_FILE=$1
 export RANK_SIZE=8
-
+export HCCL_CONNECT_TIMEOUT=600
 
 cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
 echo "the number of logical core" $cores


model_zoo/official/cv/inceptionv3/src/lr_generator.py (+4 -3)

@@ -45,7 +45,7 @@ def _generate_steps_lr(lr_init, lr_max, total_steps, warmup_steps, global_step=0
         else:
             lr = lr_max * 0.001
         lr_each_step.append(lr)
-    lr_each_step = np.array(lr_each_step).astype(np.float32)
+    lr_each_step = np.array(lr_each_step).astype(np.float32)[global_step:]
     return lr_each_step




@@ -81,7 +81,7 @@ def _generate_exponential_lr(lr_init, lr_max, total_steps, warmup_steps, steps_p
     return lr_each_step
 
 
-def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps):
+def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, global_step=0):
     """
     Applies cosine decay to generate learning rate array.


@@ -105,6 +105,7 @@ def _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps):
         cosine_decay = 0.5 * (1 + math.cos(math.pi * (i-warmup_steps) / decay_steps))
         lr = (lr_max-lr_end)*cosine_decay + lr_end
         lr_each_step.append(lr)
+    lr_each_step = np.array(lr_each_step).astype(np.float32)[global_step:]
     return lr_each_step




@@ -155,7 +156,7 @@ def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch
elif lr_decay_mode == 'steps_decay': elif lr_decay_mode == 'steps_decay':
lr_each_step = _generate_exponential_lr(lr_init, lr_max, total_steps, warmup_steps, steps_per_epoch) lr_each_step = _generate_exponential_lr(lr_init, lr_max, total_steps, warmup_steps, steps_per_epoch)
elif lr_decay_mode == 'cosine': elif lr_decay_mode == 'cosine':
lr_each_step = _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps)
lr_each_step = _generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, global_step)
else: else:
lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps) lr_each_step = _generate_liner_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps)
learning_rate = np.array(lr_each_step).astype(np.float32) learning_rate = np.array(lr_each_step).astype(np.float32)
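To make the learning-rate change concrete: the `global_step` parameter lets a resumed run skip the schedule entries already consumed by a previous run. Below is a minimal, self-contained sketch in the shape of `_generate_cosine_lr`; the warmup formula is an assumption for illustration, and only the final `[global_step:]` slice reflects what this commit adds.

```python
import math
import numpy as np

def generate_cosine_lr(lr_init, lr_end, lr_max, total_steps, warmup_steps, global_step=0):
    """Cosine-decay LR schedule that drops entries already consumed when resuming."""
    lr_each_step = []
    decay_steps = total_steps - warmup_steps
    for i in range(total_steps):
        if i < warmup_steps:
            # Linear warmup from lr_init to lr_max (assumed form, for illustration).
            lr = lr_init + (lr_max - lr_init) * (i + 1) / warmup_steps
        else:
            # Cosine decay, as in the hunk above.
            cosine_decay = 0.5 * (1 + math.cos(math.pi * (i - warmup_steps) / decay_steps))
            lr = (lr_max - lr_end) * cosine_decay + lr_end
        lr_each_step.append(lr)
    # The commit's change: slice off the first `global_step` entries so a
    # resumed run continues the schedule exactly where it stopped.
    return np.array(lr_each_step).astype(np.float32)[global_step:]

# A run resumed at step 1000 sees the same values a fresh run sees from step 1000 on.
full = generate_cosine_lr(0.0, 0.0, 0.1, total_steps=5000, warmup_steps=500)
resumed = generate_cosine_lr(0.0, 0.0, 0.1, total_steps=5000, warmup_steps=500, global_step=1000)
assert np.allclose(full[1000:], resumed)
```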

