resnet50_quant modify readmetags/v1.0.0
| @@ -70,7 +70,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil | |||
| ├── mobileNetv2_quant | |||
| ├── Readme.md # descriptions about MobileNetV2-Quant | |||
| ├── scripts | |||
| │ ├──run_train_quant.sh # shell script for train on Ascend | |||
| │ ├──run_train.sh # shell script for train on Ascend and GPU | |||
| │ ├──run_infer_quant.sh # shell script for evaluation on Ascend | |||
| ├── src | |||
| │ ├──config.py # parameter configuration | |||
| @@ -91,19 +91,22 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil | |||
| You can start training using python or shell scripts. The usage of the shell scripts is as follows: | |||
| - Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] | |||
| - bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional) | |||
| - bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional) | |||
| ### Launch | |||
| ``` | |||
| # training example | |||
| shell: | |||
| Ascend: sh run_train_quant.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt | |||
| ``` bash | |||
| # training example | |||
| >>> bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt | |||
| >>> bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt | |||
| ``` | |||
| ### Result | |||
| Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. | |||
| Training results will be stored in the example path. Checkpoints trained on `Ascend` will be stored at `./train/device$i/checkpoint` by default, and the training log will be redirected to `./train/device$i/train.log`. Checkpoints trained on `GPU` will be stored in `./train/checkpoint/ckpt_$i` by default, and the training log will be redirected to `./train/train.log`. | |||
| `train.log` is as follows: | |||
| ``` | |||
| epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] | |||
| @@ -0,0 +1,300 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
#######################################
# Convert a path to an absolute path.
# Arguments:
#   $1 - absolute or relative path (it does not have to exist)
# Outputs:
#   The absolute path on stdout. A path that already starts with "/" is
#   printed unchanged; anything else is resolved against the current
#   working directory with `realpath -m`.
#######################################
get_real_path(){
  if [ "${1:0:1}" == "/" ]; then
    printf '%s\n' "$1"
  else
    # Quoted so a path containing whitespace or glob characters stays a
    # single argument instead of being split into several paths.
    realpath -m -- "$PWD/$1"
  fi
}
| # check_and_get_Ascend_device(){ | |||
| # #device_list=(${1//,/ }) | |||
| # IFS=',' read -ra device_list <<<"$1" | |||
| # last_device_id=0 | |||
| # first_device_id=8 | |||
| # device_used=(0 0 0 0 0 0 0 0) | |||
| # for var in "${device_list[@]}" | |||
| # do | |||
| # if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ] | |||
| # then | |||
| # echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!" | |||
| # exit 1 | |||
| # fi | |||
| # if [ ${device_used[$((var))]} -eq 0 ] | |||
| # then | |||
| # device_used[ $((var)) ]=1 | |||
| # else | |||
| # echo "error: device id is duplicate, please check your device id list!" | |||
| # exit 1 | |||
| # fi | |||
| # if [ ${last_device_id} \< $((var)) ] | |||
| # then | |||
| # last_device_id=$((var)) | |||
| # fi | |||
| # if [ ${first_device_id} \> $((var)) ] | |||
| # then | |||
| # first_device_id=$((var)) | |||
| # fi | |||
| # done | |||
| # device_num=`expr ${last_device_id} - ${first_device_id} + 1` | |||
| # if [ ${device_num} != ${#device_list[*]} ] | |||
| # then | |||
| # echo "error: the Ascend chips used must be continuous, please check your device id list!" | |||
| # exit 1 | |||
| # fi | |||
| # if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ] | |||
| # then | |||
| # if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ] | |||
| # then | |||
| # echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips." | |||
| # exit 1 | |||
| # fi | |||
| # fi | |||
| # echo "${first_device_id},`expr ${last_device_id} + 1`" | |||
| # } | |||
| # get_hccl_name(){ | |||
| # server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:") | |||
| # device_num=`expr $2 - $1` | |||
| # device_id_list="" | |||
| # for(( i=$1 ; i < $2 ; i++ )) | |||
| # do | |||
| # device_id_list=${device_id_list}$i | |||
| # done | |||
| # hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json" | |||
| # echo ${hccl_name} | |||
| # } | |||
#######################################
# Count the distinct GPU ids in a comma-separated device id list.
# Arguments:
#   $1 - device id list such as "0,1,2"; every id must be a single
#        digit in the range [0,8). Duplicate ids are counted once.
# Outputs:
#   The number of distinct ids on stdout; the error message on stderr.
# Returns:
#   0 on success, 1 when an id is not an integer in [0,8).
#######################################
get_gpu_device_num(){
  local -a ids=()
  local -a seen=(0 0 0 0 0 0 0 0)
  local id
  local count=0

  IFS=',' read -ra ids <<<"$1"
  for id in "${ids[@]}"; do
    # Validate the text itself: inside $(( )) a non-numeric token would be
    # treated as a variable name and silently evaluate to 0.
    if [[ ! "${id}" =~ ^[0-7]$ ]]; then
      # The message goes to stderr because callers capture stdout as the
      # device count; on stdout it would be mistaken for the result.
      echo "error: device id=${id} is incorrect, device id must be in range [0,8), please check your device id list!" >&2
      return 1
    fi
    if [ "${seen[id]}" -eq 0 ]; then
      seen[id]=1
      count=$((count+1))
    fi
  done
  echo "${count}"
}
#######################################
# Launch training on Ascend: one background `python train.py` process per
# device, each in its own ./train/device$i working directory.
#
# The device ids are read from the third "_"-separated field of the rank
# table file name (for example "0123" in hccl_4p_0123_x.x.x.x.json): its
# first digit becomes the first DEVICE_ID and its length the device count.
#
# Arguments:
#   $1 - device target, forwarded to train.py as --device_target
#   $2 - RANK_TABLE_FILE
#   $3 - DATASET_PATH (must be a directory)
#   $4 - PRETRAINED_CKPT_PATH (optional, must be a file)
# Globals:
#   Sets PATH1, PATH2, PATH3; exports DEVICE_NUM, RANK_SIZE,
#   RANK_TABLE_FILE, SERVER_ID, DEVICE_ID, RANK_ID.
# Side effects:
#   Removes and recreates ./train, copies ../*.py, *.sh and ../src into
#   every ./train/device$i, writes env.log and train.log there.
# Exits with status 1 when the arguments are invalid.
#######################################
run_ascend(){
  if [ $# -gt 4 ] || [ $# -lt 3 ]; then
    echo "Usage: bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" >&2
    exit 1
  fi

  PATH1=$(get_real_path "$2")
  PATH2=$(get_real_path "$3")
  if [ $# == 4 ]; then
    PATH3=$(get_real_path "$4")
  fi

  if [ ! -f "$PATH1" ]; then
    echo "error: RANK_TABLE_FILE=$PATH1 is not a file" >&2
    exit 1
  fi

  if [ ! -d "$PATH2" ]; then
    echo "error: DATASET_PATH=$PATH2 is not a directory" >&2
    exit 1
  fi

  if [ $# == 4 ] && [ ! -f "$PATH3" ]; then
    echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" >&2
    exit 1
  fi

  rank_file_name=${2##*/}
  IFS='_' read -ra array <<<"${rank_file_name}"
  device_id_list=${array[2]}
  # Without this check a file name that does not follow the expected pattern
  # yields a wrong (or zero) device count and the loop below misbehaves
  # silently.
  if [[ ! "${device_id_list}" =~ ^[0-9]+$ ]]; then
    echo "error: cannot read the device id list from RANK_TABLE_FILE name '${rank_file_name}', expected a name like hccl_4p_0123_x.x.x.x.json" >&2
    exit 1
  fi
  first_device=${device_id_list:0:1}
  device_num=${#device_id_list}

  ulimit -u unlimited

  export DEVICE_NUM=${device_num}
  export RANK_SIZE=${device_num}
  export RANK_TABLE_FILE=$PATH1
  export SERVER_ID=0
  rank_start=$((DEVICE_NUM * SERVER_ID))

  # Build the train.py arguments once; the optional checkpoint is appended
  # only when it was given. (The original spelled the flag "--device_traget"
  # on the checkpoint branch, which train.py would not recognise.)
  local -a train_args=(--device_target="$1" --dataset_path="$PATH2")
  if [ $# == 4 ]; then
    train_args+=(--pre_trained="$PATH3")
  fi

  rm -rf ./train
  mkdir ./train
  for((i=0; i<device_num; i++)); do
    export DEVICE_ID=$((first_device+i))
    export RANK_ID=$((rank_start + i))
    mkdir "./train/device$i"
    cp ../*.py "./train/device$i"
    cp ./*.sh "./train/device$i"
    cp -r ../src "./train/device$i"
    cd "./train/device$i" || exit
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    python train.py "${train_args[@]}" &> train.log &
    cd ../.. || exit
  done
}
#######################################
# Launch training on GPU through mpirun, in the background, from a fresh
# ./train working directory.
#
# Arguments:
#   $1 - device target, forwarded to train.py as --device_target
#   $2 - DEVICE_ID_LIST, comma-separated; exported as CUDA_VISIBLE_DEVICES
#   $3 - DATASET_PATH (must be a directory)
#   $4 - PRETRAINED_CKPT_PATH (optional, must be a file)
# Globals:
#   Sets PATH1, PATH2; exports DEVICE_NUM, RANK_SIZE, CUDA_VISIBLE_DEVICES.
# Side effects:
#   Removes and recreates ./train, copies ../*.py, *.sh and ../src into it,
#   writes env.log and train.log there.
# Exits with status 1 when the arguments are invalid.
#######################################
run_gpu(){
  # "$@" still contains the device target as $1, so the valid counts are
  # 3 (no checkpoint) and 4 (with checkpoint).
  if [ $# -gt 4 ] || [ $# -lt 3 ]; then
    echo "Usage: bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" >&2
    exit 1
  fi

  PATH1=$(get_real_path "$3")
  if [ $# == 4 ]; then
    PATH2=$(get_real_path "$4")
  fi

  if [ ! -d "$PATH1" ]; then
    echo "error: DATASET_PATH=$PATH1 is not a directory" >&2
    exit 1
  fi

  if [ $# == 4 ] && [ ! -f "$PATH2" ]; then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" >&2
    exit 1
  fi

  # An `exit` inside $(...) only leaves the subshell, so the result has to be
  # checked here; anything that is not a count in 1..8 is rejected.
  device_num=$(get_gpu_device_num "$2")
  if [[ ! "${device_num}" =~ ^[1-8]$ ]]; then
    if [ -n "${device_num}" ]; then
      echo "${device_num}" >&2
    fi
    echo "error: DEVICE_ID_LIST=$2 is not a valid GPU device id list" >&2
    exit 1
  fi

  ulimit -u unlimited
  export DEVICE_NUM=${device_num}
  export RANK_SIZE=${device_num}
  export CUDA_VISIBLE_DEVICES=$2

  # The optional checkpoint is appended only when it was given. (The original
  # spelled the flag "--device_traget" on the checkpoint branch.)
  local -a train_args=(--device_target="$1" --dataset_path="$PATH1")
  if [ $# == 4 ]; then
    train_args+=(--pre_trained="$PATH2")
  fi

  rm -rf ./train
  mkdir ./train
  cp ../*.py ./train
  cp ./*.sh ./train
  cp -r ../src ./train
  cd ./train || exit
  echo "start training"
  env > env.log
  mpirun --allow-run-as-root -n "${RANK_SIZE}" \
    python train.py "${train_args[@]}" &> train.log &
  cd ..
}
# Entry point: pick the launcher from the first argument (device target).
# "$1" is quoted and defaulted so a missing argument reaches the error branch
# instead of making the test itself fail, and an unsupported target now ends
# the script with a non-zero status.
case "${1:-}" in
  Ascend)
    run_ascend "$@"
    ;;
  GPU)
    run_gpu "$@"
    ;;
  *)
    echo "Unsupported device target: ${1:-}"
    exit 1
    ;;
esac
| @@ -1,96 +0,0 @@ | |||
| #!/usr/bin/env bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
#######################################
# Launch quantization-aware training on Ascend through src/launch.py, in the
# background, from a fresh ../train working directory.
#
# Arguments:
#   $1 - device target, forwarded as --device_target
#   $2 - DEVICE_NUM, an integer from 1 to 8 (forwarded as --nproc_per_node)
#   $3 - SERVER_IP (forwarded as --server_id)
#   $4 - VISIABLE_DEVICES (forwarded as --visible_devices)
#   $5 - DATASET_PATH, an existing directory or file
#   $6 - CKPT_PATH (optional, forwarded as --pre_trained)
# Side effects:
#   Exports PYTHONPATH, removes and recreates ../train, changes into it and
#   writes train.log there.
# Exits with status 1 when DEVICE_NUM or DATASET_PATH is invalid.
#######################################
run_ascend()
{
  # The original joined "< 1" and "> 8" with &&, which can never be true, so
  # no value was ever rejected. The pattern also rejects non-numeric input.
  if [[ ! "$2" =~ ^[1-8]$ ]]; then
    echo "error: DEVICE_NUM=$2 is not in (1-8)"
    exit 1
  fi

  if [ ! -d "$5" ] && [ ! -f "$5" ]; then
    echo "error: DATASET_PATH=$5 is not a directory or file"
    exit 1
  fi

  BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
  export PYTHONPATH=${BASEPATH}:$PYTHONPATH
  if [ -d "../train" ]; then
    rm -rf ../train
  fi
  mkdir ../train
  cd ../train || exit
  python "${BASEPATH}/../src/launch.py" \
    --nproc_per_node="$2" \
    --visible_devices="$4" \
    --server_id="$3" \
    --training_script="${BASEPATH}/../train.py" \
    --dataset_path="$5" \
    --pre_trained="$6" \
    --device_target="$1" &> train.log &
}
#######################################
# Launch quantization-aware training on GPU through mpirun, in the
# background, from a fresh ../train working directory.
#
# Arguments:
#   $1 - device target, forwarded as --device_target
#   $2 - DEVICE_NUM, an integer from 1 to 8 (passed to mpirun -n)
#   $3 - VISIABLE_DEVICES, exported as CUDA_VISIBLE_DEVICES
#   $4 - DATASET_PATH, an existing directory
#   $5 - CKPT_PATH (optional, forwarded as --pre_trained)
# Side effects:
#   Exports PYTHONPATH and CUDA_VISIBLE_DEVICES, removes and recreates
#   ../train, changes into it; the log is written to ../train.log relative
#   to that directory.
# Exits with status 1 when DEVICE_NUM or DATASET_PATH is invalid.
#######################################
run_gpu()
{
  # The original joined "< 1" and "> 8" with &&, which can never be true, so
  # no value was ever rejected. The pattern also rejects non-numeric input.
  if [[ ! "$2" =~ ^[1-8]$ ]]; then
    echo "error: DEVICE_NUM=$2 is not in (1-8)"
    exit 1
  fi

  if [ ! -d "$4" ]; then
    echo "error: DATASET_PATH=$4 is not a directory"
    exit 1
  fi

  BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
  export PYTHONPATH=${BASEPATH}:$PYTHONPATH
  if [ -d "../train" ]; then
    rm -rf ../train
  fi
  mkdir ../train
  cd ../train || exit

  export CUDA_VISIBLE_DEVICES="$3"
  mpirun -n "$2" --allow-run-as-root \
    python "${BASEPATH}/../train.py" \
    --dataset_path="$4" \
    --device_target="$1" \
    --pre_trained="$5" &> ../train.log &
}
# Entry point: check the argument count, then pick the launcher from the
# first argument (device target).
if [ $# -gt 6 ] || [ $# -lt 5 ]; then
  # printf prints one line per argument; bash's echo would have printed the
  # "\n" sequences of the original message literally.
  printf '%s\n' \
    "Usage:" \
    "  Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]" \
    "  GPU: sh run_train_quant.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]"
  exit 1
fi

case "$1" in
  Ascend)
    run_ascend "$@"
    ;;
  GPU)
    run_gpu "$@"
    ;;
  *)
    # An unsupported target is an error, so report it with a non-zero status.
    echo "Unsupported device target."
    exit 1
    ;;
esac
| @@ -1,4 +1,43 @@ | |||
| # Contents | |||
| # ResNet-50_quant Example | |||
| ## Description | |||
| This is an example of training ResNet-50_quant with ImageNet2012 dataset in MindSpore. | |||
| ## Requirements | |||
| - Install [MindSpore](https://www.mindspore.cn/install/en). | |||
| - Download the dataset ImageNet2012 | |||
| > Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows: | |||
| > ``` | |||
| > . | |||
| > ├── ilsvrc # train dataset | |||
| > └── ilsvrc_eval # infer dataset: images should first be sorted into 1000 class directories, just like the training images | |||
| > ``` | |||
| ## Example structure | |||
| ```shell | |||
| resnet50_quant/ | |||
| ├── eval.py | |||
| ├── models | |||
| │ └── resnet_quant.py | |||
| ├── Readme.md | |||
| ├── scripts | |||
| │ ├── run_infer.sh | |||
| │ └── run_train.sh | |||
| ├── src | |||
| │ ├── config.py | |||
| │ ├── crossentropy.py | |||
| │ ├── dataset.py | |||
| │ ├── launch.py | |||
| │ └── lr_generator.py | |||
| └── train.py | |||
| ``` | |||
| - [resnet50 Description](#resnet50-description) | |||
| - [Model Architecture](#model-architecture) | |||
| @@ -88,21 +127,17 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil | |||
| ### Usage | |||
| You can start training using python or shell scripts. The usage of the shell scripts is as follows: | |||
| - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH][CKPT_PATH] | |||
| - Ascend: bash run_train.sh Ascend [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional) | |||
| ### Launch | |||
| ``` | |||
| # training example | |||
| shell: | |||
| Ascend: sh run_train.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/resnet/train/ Resnet50-90_5004.ckpt | |||
| # training example | |||
| Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ | |||
| ``` | |||
| ### Result | |||
| Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. | |||
| Training results will be stored in the example path. Checkpoints will be stored at `./train/device$i/` by default, and the training log will be redirected to `./train/device$i/train.log`, as follows. | |||
| ``` | |||
| epoch: 1 step: 5004, loss is 4.8995576 | |||
| @@ -1,4 +1,4 @@ | |||
| #!/usr/bin/env bash | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| @@ -14,49 +14,259 @@ | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| run_ascend() | |||
| { | |||
| if [ $2 -lt 1 ] && [ $2 -gt 8 ] | |||
| then | |||
| echo "error: DEVICE_NUM=$2 is not in (1-8)" | |||
| exit 1 | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| if [ ! -d $5 ] && [ ! -f $5 ] | |||
| then | |||
| echo "error: DATASET_PATH=$5 is not a directory or file" | |||
| exit 1 | |||
| # check_and_get_Ascend_device(){ | |||
| # #device_list=(${1//,/ }) | |||
| # IFS=',' read -ra device_list <<<"$1" | |||
| # last_device_id=0 | |||
| # first_device_id=8 | |||
| # device_used=(0 0 0 0 0 0 0 0) | |||
| # for var in "${device_list[@]}" | |||
| # do | |||
| # if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ] | |||
| # then | |||
| # echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!" | |||
| # exit 1 | |||
| # fi | |||
| # if [ ${device_used[$((var))]} -eq 0 ] | |||
| # then | |||
| # device_used[ $((var)) ]=1 | |||
| # else | |||
| # echo "error: device id is duplicate, please check your device id list!" | |||
| # exit 1 | |||
| # fi | |||
| # if [ ${last_device_id} \< $((var)) ] | |||
| # then | |||
| # last_device_id=$((var)) | |||
| # fi | |||
| # if [ ${first_device_id} \> $((var)) ] | |||
| # then | |||
| # first_device_id=$((var)) | |||
| # fi | |||
| # done | |||
| # device_num=`expr ${last_device_id} - ${first_device_id} + 1` | |||
| # if [ ${device_num} != ${#device_list[*]} ] | |||
| # then | |||
| # echo "error: the Ascend chips used must be continuous, please check your device id list!" | |||
| # exit 1 | |||
| # fi | |||
| # if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ] | |||
| # then | |||
| # if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ] | |||
| # then | |||
| # echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips." | |||
| # exit 1 | |||
| # fi | |||
| # fi | |||
| # echo "${first_device_id},`expr ${last_device_id} + 1`" | |||
| # } | |||
| # get_hccl_name(){ | |||
| # server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:") | |||
| # device_num=`expr $2 - $1` | |||
| # device_id_list="" | |||
| # for(( i=$1 ; i < $2 ; i++ )) | |||
| # do | |||
| # device_id_list=${device_id_list}$i | |||
| # done | |||
| # hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json" | |||
| # echo ${hccl_name} | |||
| # } | |||
| run_ascend(){ | |||
| if [ $# != 3 ] && [ $# != 4 ] | |||
| then | |||
| echo "Usage: bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n" | |||
| exit 1 | |||
| fi | |||
| # first_last_device=$(check_and_get_Ascend_device $2) | |||
| # #devices=(${first_last_device//,/ }) | |||
| # #IFS=',' read -ra devices <<<"${first_last_device}" | |||
| # first_device=${first_last_device:0:1} | |||
| # last_device=${first_last_device:2:1} | |||
| # device_num=`expr $((last_device)) - $((first_device))` | |||
| # #single ascend or multiple ascend | |||
| # if [ ${device_num} -gt 1 ] | |||
| # then | |||
| # ori_path=$(dirname "$(readlink -f "$0")") | |||
| # #generate hccl config file | |||
| # cd ../../../../utils/hccl_tools/ || exit | |||
| # device_num_arg="[${first_device},${last_device})" | |||
| # python hccl_tools.py --device_num=${device_num_arg} | |||
| # hccl_name=$(get_hccl_name ${first_device} ${last_device}) | |||
| # if [ ! -e ${hccl_name} ] | |||
| # then | |||
| # echo "error: failed to generate the hccl config file!" | |||
| # exit 1 | |||
| # fi | |||
| # mv ${hccl_name} ${ori_path} | |||
| # cd ${ori_path} || exit | |||
| # PATH1=$(get_real_path ${hccl_name}) | |||
| # if [ ! -f $PATH1 ] | |||
| # then | |||
| # echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||
| # exit 1 | |||
| # fi | |||
| # export RANK_TABLE_FILE=$PATH1 | |||
| # fi | |||
| PATH1=$(get_real_path $2) | |||
| PATH2=$(get_real_path $3) | |||
| if [ $# == 4 ] | |||
| then | |||
| PATH3=$(get_real_path $4) | |||
| fi | |||
| BASEPATH=$(cd "`dirname $0`" || exit; pwd) | |||
| export PYTHONPATH=${BASEPATH}:$PYTHONPATH | |||
| if [ -d "../train" ]; | |||
| if [ ! -f $PATH1 ] | |||
| then | |||
| echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||
| exit 1 | |||
| fi | |||
| if [ ! -d $PATH2 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH2 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 4 ] && [ ! -f $PATH3 ] | |||
| then | |||
| rm -rf ../train | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" | |||
| exit 1 | |||
| fi | |||
| mkdir ../train | |||
| cd ../train || exit | |||
| python ${BASEPATH}/../src/launch.py \ | |||
| --nproc_per_node=$2 \ | |||
| --visible_devices=$4 \ | |||
| --server_id=$3 \ | |||
| --training_script=${BASEPATH}/../train.py \ | |||
| --dataset_path=$5 \ | |||
| --pre_trained=$6 \ | |||
| --device_target=$1 &> train.log & # dataset train folder | |||
| rank_file_name=${2##*/} | |||
| IFS='_' read -ra array <<<"${rank_file_name}" | |||
| device_id_list=${array[2]} | |||
| first_device=${device_id_list:0:1} | |||
| device_num=${#device_id_list} | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=${device_num} | |||
| export RANK_SIZE=${device_num} | |||
| export RANK_TABLE_FILE=$PATH1 | |||
| export SERVER_ID=0 | |||
| rank_start=$((DEVICE_NUM * SERVER_ID)) | |||
| rm -rf ./train | |||
| mkdir ./train | |||
| for((i=0; i<${device_num}; i++)) | |||
| do | |||
| export DEVICE_ID=$((first_device+i)) | |||
| export RANK_ID=$((rank_start + i)) | |||
| mkdir ./train/device$i | |||
| cp ../*.py ./train/device$i | |||
| cp *.sh ./train/device$i | |||
| cp -r ../src ./train/device$i | |||
| cp -r ../models ./train/device$i | |||
| cd ./train/device$i || exit | |||
| echo "start training for rank $RANK_ID, device $DEVICE_ID" | |||
| env > env.log | |||
| if [ $# == 3 ] | |||
| then | |||
| python train.py --device_target=$1 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> train.log & | |||
| fi | |||
| if [ $# == 4 ] | |||
| then | |||
| python train.py --device_target=$1 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> train.log & | |||
| fi | |||
| cd ../.. || exit | |||
| done | |||
| } | |||
| if [ $# -gt 6 ] || [ $# -lt 4 ] | |||
| then | |||
| echo "Usage:\n \ | |||
| Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \ | |||
| " | |||
| exit 1 | |||
| fi | |||
| # run_gpu(){ | |||
| # if [ $# -gt 3 ] || [ $# -lt 2 ] | |||
| # then | |||
| # echo "Usage: sh run_train_distribute_quant.sh [GPU] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n " | |||
| # exit 1 | |||
| # fi | |||
| # PATH1=$(get_real_path $2) | |||
| # if [ $# == 3 ] | |||
| # then | |||
| # PATH2=$(get_real_path $3) | |||
| # fi | |||
| # if [ ! -d $PATH1 ] | |||
| # then | |||
| # echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| # exit 1 | |||
| # fi | |||
| # if [ $# == 3 ] && [ ! -f $PATH2 ] | |||
| # then | |||
| # echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| # exit 1 | |||
| # fi | |||
| # ulimit -u unlimited | |||
| # export RANK_SIZE=2 | |||
| # #export CUDA_VISIBLE_DEVICES=1,2 | |||
| # rm -rf ./train_parallel | |||
| # mkdir ./train_parallel | |||
| # cp ../*.py ./train_parallel | |||
| # cp *.sh ./train_parallel | |||
| # cp -r ../src ./train_parallel | |||
| # cp -r ../models ./train_parallel | |||
| # cd ./train_parallel || exit | |||
| # echo "start training" | |||
| # env > env.log | |||
| # if [ $# == 2 ] | |||
| # then | |||
| # mpirun --allow-run-as-root -n $RANK_SIZE | |||
| # python train.py --device_target=$1 --dataset_path=$PATH1 &> log & | |||
| # fi | |||
| # if [ $# == 3 ] | |||
| # then | |||
| # mpirun --allow-run-as-root -n $RANK_SIZE | |||
| # python train.py --device_traget=$1 --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| # fi | |||
| # cd .. | |||
| # } | |||
| if [ $1 = "Ascend" ] ; then | |||
| run_ascend "$@" | |||
| else | |||
| echo "not support platform" | |||
| fi; | |||
| echo "Unsupported device target: $1" | |||
| fi; | |||