Merge pull request !3444 from zhouyuanshen/mastertags/v0.7.0-beta
| @@ -93,7 +93,8 @@ sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL] | |||||
| sh run_standalone_train.sh [PRETRAINED_MODEL] | sh run_standalone_train.sh [PRETRAINED_MODEL] | ||||
| ``` | ``` | ||||
| > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). | |||||
| > rank_table.json, which is specified by MINDSPORE_HCCL_CONFIG_PATH, is needed when you are running a distributed task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). | |||||
| > As for PRETRAINED_MODEL, if not set, the model will be trained from the very beginning. Ready-made pretrained_models are not available now. Stay tuned. | |||||
| #### Result | #### Result | ||||
| @@ -14,7 +14,7 @@ | |||||
| # limitations under the License. | # limitations under the License. | ||||
| # ============================================================================ | # ============================================================================ | ||||
| if [ $# != 2 ] | |||||
| if [ $# -lt 1 ] || [ $# -gt 2 ] | |||||
| then | then | ||||
| echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]" | echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]" | ||||
| exit 1 | exit 1 | ||||
| @@ -27,11 +27,9 @@ get_real_path(){ | |||||
| echo "$(realpath -m $PWD/$1)" | echo "$(realpath -m $PWD/$1)" | ||||
| fi | fi | ||||
| } | } | ||||
| PATH1=$(get_real_path $1) | |||||
| PATH2=$(get_real_path $2) | |||||
| PATH1=$(get_real_path $1) | |||||
| echo $PATH1 | echo $PATH1 | ||||
| echo $PATH2 | |||||
| if [ ! -f $PATH1 ] | if [ ! -f $PATH1 ] | ||||
| then | then | ||||
| @@ -39,10 +37,15 @@ then | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| if [ ! -f $PATH2 ] | |||||
| then | |||||
| echo "error: PRETRAINED_PATH=$PATH2 is not a file" | |||||
| exit 1 | |||||
| if [ $# == 2 ] | |||||
| then | |||||
| PATH2=$(get_real_path $2) | |||||
| echo $PATH2 | |||||
| if [ ! -f $PATH2 ] | |||||
| then | |||||
| echo "error: PRETRAINED_PATH=$PATH2 is not a file" | |||||
| exit 1 | |||||
| fi | |||||
| fi | fi | ||||
| ulimit -u unlimited | ulimit -u unlimited | ||||
| @@ -63,7 +66,11 @@ do | |||||
| cd ./train_parallel$i || exit | cd ./train_parallel$i || exit | ||||
| echo "start training for rank $RANK_ID, device $DEVICE_ID" | echo "start training for rank $RANK_ID, device $DEVICE_ID" | ||||
| env > env.log | env > env.log | ||||
| python train.py --do_train=True --device_id=$i --rank_id=$i --run_distribute=True --device_num=$DEVICE_NUM \ | |||||
| --pre_trained=$PATH2 &> log & | |||||
| if [ $# == 2 ] | |||||
| then | |||||
| python train.py --do_train=True --device_id=$i --rank_id=$i --run_distribute=True --device_num=$DEVICE_NUM --pre_trained=$PATH2 &> log & | |||||
| else | |||||
| python train.py --do_train=True --device_id=$i --rank_id=$i --run_distribute=True --device_num=$DEVICE_NUM &> log & | |||||
| fi | |||||
| cd .. | cd .. | ||||
| done | done | ||||
| @@ -14,7 +14,7 @@ | |||||
| # limitations under the License. | # limitations under the License. | ||||
| # ============================================================================ | # ============================================================================ | ||||
| if [ $# != 1 ] | |||||
| if [ $# -gt 1 ] | |||||
| then | then | ||||
| echo "Usage: sh run_standalone_train.sh [PRETRAINED_PATH]" | echo "Usage: sh run_standalone_train.sh [PRETRAINED_PATH]" | ||||
| exit 1 | exit 1 | ||||
| @@ -27,13 +27,17 @@ get_real_path(){ | |||||
| echo "$(realpath -m $PWD/$1)" | echo "$(realpath -m $PWD/$1)" | ||||
| fi | fi | ||||
| } | } | ||||
| PATH1=$(get_real_path $1) | |||||
| echo $PATH1 | |||||
| if [ ! -f $PATH1 ] | |||||
| then | |||||
| echo "error: PRETRAINED_PATH=$PATH1 is not a file" | |||||
| exit 1 | |||||
| if [ $# == 1 ] | |||||
| then | |||||
| PATH1=$(get_real_path $1) | |||||
| echo $PATH1 | |||||
| if [ ! -f $PATH1 ] | |||||
| then | |||||
| echo "error: PRETRAINED_PATH=$PATH1 is not a file" | |||||
| exit 1 | |||||
| fi | |||||
| fi | fi | ||||
| ulimit -u unlimited | ulimit -u unlimited | ||||
| @@ -53,5 +57,10 @@ cp -r ../src ./train | |||||
| cd ./train || exit | cd ./train || exit | ||||
| echo "start training for device $DEVICE_ID" | echo "start training for device $DEVICE_ID" | ||||
| env > env.log | env > env.log | ||||
| python train.py --do_train=True --device_id=$DEVICE_ID --pre_trained=$PATH1 &> log & | |||||
| if [ $# == 1 ] | |||||
| then | |||||
| python train.py --do_train=True --device_id=$DEVICE_ID --pre_trained=$PATH1 &> log & | |||||
| else | |||||
| python train.py --do_train=True --device_id=$DEVICE_ID &> log & | |||||
| fi | |||||
| cd .. | cd .. | ||||