add training scripts and modify readme of mobilenetv2_quant and

resnet50_quant modify readme
5 years ago · aaa0436882
--- a/model_zoo/official/cv/mobilenetv2_quant/Readme.md
+++ b/model_zoo/official/cv/mobilenetv2_quant/Readme.md
@@ -70,7 +70,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 ├── mobileNetv2_quant
  ├── Readme.md     # descriptions about MobileNetV2-Quant
  ├── scripts
  │   ├──run_train_quant.sh   # shell script for train on Ascend
  │   ├──run_train.sh   # shell script for train on Ascend and GPU
  │   ├──run_infer_quant.sh    # shell script for evaluation on Ascend
  ├── src
  │   ├──config.py      # parameter configuration
@@ -91,19 +91,22 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil

 You can start training using Python or shell scripts. The shell scripts are used as follows:

 - Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
 - bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
 - bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)


 ### Launch

 ```
 # training example
  shell:
      Ascend: sh run_train_quant.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt
 ``` bash
  # training example
  >>> bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
  >>> bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
 ```

 ### Result

 Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log  will be redirected to `./train/train.log` like followings.
 Training results are stored in the example path. Checkpoints trained on `Ascend` are stored in `./train/device$i/checkpoint` by default, and the training log is redirected to `./train/device$i/train.log`. Checkpoints trained on `GPU` are stored in `./train/checkpointckpt_$i` by default, and the training log is redirected to `./train/train.log`.  
 `train.log` is as follows:

 ```
 epoch: [  0/200], step:[  624/  625], loss:[5.258/5.258], time:[140412.236], lr:[0.100]
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh
@@ -0,0 +1,300 @@
 #!/bin/bash
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================

 # Print the absolute form of the path given as $1.
 # An argument that already starts with "/" is printed unchanged; anything else
 # is resolved against $PWD with `realpath -m`, which does not require the path
 # to exist. Every expansion is quoted so paths containing spaces stay intact.
 get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        printf '%s\n' "$1"
    else
        realpath -m -- "$PWD/$1"
    fi
 }


 # check_and_get_Ascend_device(){

 #     #device_list=(${1//,/ })
 #     IFS=',' read -ra device_list <<<"$1"
 #     last_device_id=0
 #     first_device_id=8
 #     device_used=(0 0 0 0 0 0 0 0)

 #     for var in "${device_list[@]}"
 #     do  
        
 #         if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ]
 #         then 
 #             echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!"
 #             exit 1
 #         fi

 #         if [  ${device_used[$((var))]} -eq 0 ]
 #         then 
 #             device_used[ $((var)) ]=1
 #         else
 #             echo "error: device id is duplicate, please check your device id list!"
 #             exit 1
 #         fi

 #         if [ ${last_device_id} \< $((var)) ]
 #         then 
 #             last_device_id=$((var))
 #         fi
 #         if [ ${first_device_id} \> $((var)) ]
 #         then
 #             first_device_id=$((var))
 #         fi
 #     done

 #     device_num=`expr ${last_device_id} - ${first_device_id} + 1`
 #     if [ ${device_num} != ${#device_list[*]} ]
 #     then 
 #         echo "error: the Ascend chips used must be continuous, please check your device id list!"
 #         exit 1
 #     fi

 #     if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ]
 #     then
 #         if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ]
 #         then
 #             echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips."
 #             exit 1
 #         fi
 #     fi

 #     echo "${first_device_id},`expr ${last_device_id} + 1`"
 # }

 # get_hccl_name(){

 #     server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:")
 #     device_num=`expr $2 - $1`
 #     device_id_list=""

 #     for(( i=$1 ; i < $2 ; i++ ))
 #     do 
 #         device_id_list=${device_id_list}$i
 #     done
 #     hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json"

 #     echo ${hccl_name}
 # }


 # Count the distinct GPU ids in a comma-separated list.
 # Arguments: $1 - device id list such as "0,1,2"; each id must be an integer in [0,8).
 # Outputs:   the number of distinct ids on stdout. Diagnostics go to stderr so a
 #            caller capturing stdout never mistakes an error message for a count.
 # Exits:     with status 1 when an id is not a number or is out of range.
 get_gpu_device_num(){
    local -a device_list=()
    local -a device_used=(0 0 0 0 0 0 0 0)
    local device_num=0
    local var

    IFS=',' read -ra device_list <<<"$1"
    for var in "${device_list[@]}"
    do
        # Reject non-numeric ids explicitly: $((var)) would silently turn them into 0.
        # 10# forces base 10 so an id written as "08" is not parsed as invalid octal.
        if [[ ! "${var}" =~ ^[0-9]+$ ]] || [ "$((10#${var}))" -ge 8 ]
        then
            echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!" >&2
            exit 1
        fi

        var=$((10#${var}))
        # Duplicated ids are counted only once.
        if [ "${device_used[var]}" -eq 0 ]
        then
            device_used[var]=1
            device_num=$((device_num+1))
        fi
    done

    echo "${device_num}"
 }


 # Launch training on Ascend, one background train.py process per device.
 # Arguments:
 #   $1 - device target, forwarded to train.py as --device_target
 #   $2 - rank table file; the device ids are read from the third "_"-separated
 #        field of its file name (e.g. hccl_4p_0123_x.x.x.x.json -> "0123"),
 #        one digit per device
 #   $3 - dataset directory
 #   $4 - (optional) pre-trained checkpoint file
 # Side effects: recreates ./train, exports DEVICE_NUM, RANK_SIZE,
 #   RANK_TABLE_FILE, SERVER_ID, DEVICE_ID and RANK_ID. Sources are copied from
 #   ../ and ../src relative to the current directory.
 run_ascend(){
    if [ $# -gt 4 ] || [ $# -lt 3 ]
    then
        echo "Usage:  bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" >&2
        exit 1
    fi

    local PATH1 PATH2 PATH3=""
    PATH1=$(get_real_path "$2")
    PATH2=$(get_real_path "$3")
    if [ $# == 4 ]
    then
        PATH3=$(get_real_path "$4")
    fi

    if [ ! -f "$PATH1" ]
    then
        echo "error: RANK_TABLE_FILE=$PATH1 is not a file" >&2
        exit 1
    fi

    if [ ! -d "$PATH2" ]
    then
        echo "error: DATASET_PATH=$PATH2 is not a directory" >&2
        exit 1
    fi

    if [ $# == 4 ] && [ ! -f "$PATH3" ]
    then
        echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" >&2
        exit 1
    fi

    # Derive the device list from the rank table file name.
    local rank_file_name=${2##*/}
    local -a name_fields=()
    IFS='_' read -ra name_fields <<<"${rank_file_name}"
    local device_id_list=${name_fields[2]}
    # Without this check a differently named file would yield zero devices and
    # the loop below would silently start nothing.
    if [[ ! "${device_id_list}" =~ ^[0-9]+$ ]]
    then
        echo "error: cannot read device ids from RANK_TABLE_FILE name '${rank_file_name}' (expected something like hccl_4p_0123_x.x.x.x.json)" >&2
        exit 1
    fi
    local first_device=${device_id_list:0:1}
    local device_num=${#device_id_list}

    ulimit -u unlimited
    export DEVICE_NUM=${device_num}
    export RANK_SIZE=${device_num}
    export RANK_TABLE_FILE=$PATH1

    export SERVER_ID=0
    local rank_start=$((DEVICE_NUM * SERVER_ID))

    # Arguments shared by every process; --pre_trained is added only when given.
    local -a train_args=(--device_target="$1" --dataset_path="$PATH2")
    if [ $# == 4 ]
    then
        train_args+=(--pre_trained="$PATH3")
    fi

    rm -rf ./train
    mkdir ./train
    local i
    for ((i = 0; i < device_num; i++))
    do
        # Device ids are taken as consecutive, starting at the first listed id.
        export DEVICE_ID=$((first_device + i))
        export RANK_ID=$((rank_start + i))
        mkdir "./train/device$i"
        cp ../*.py "./train/device$i"
        cp ./*.sh "./train/device$i"
        cp -r ../src "./train/device$i"
        cd "./train/device$i" || exit
        echo "start training for rank $RANK_ID, device $DEVICE_ID"
        env > env.log
        python train.py "${train_args[@]}" &> train.log &
        cd ../.. || exit
    done
 }

 # Launch training on GPU through mpirun.
 # Arguments:
 #   $1 - device target, forwarded to train.py as --device_target
 #   $2 - comma-separated GPU id list, exported as CUDA_VISIBLE_DEVICES
 #   $3 - dataset directory
 #   $4 - (optional) pre-trained checkpoint file
 # Side effects: recreates ./train, exports DEVICE_NUM, RANK_SIZE and
 #   CUDA_VISIBLE_DEVICES. Sources are copied from ../ and ../src relative to
 #   the current directory.
 run_gpu(){
    # $1..$3 are mandatory and $4 is optional, so 3 or 4 arguments are valid.
    if [ $# -gt 4 ] || [ $# -lt 3 ]
    then
        echo "Usage:  bash run_train.sh  [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" >&2
        exit 1
    fi

    local PATH1 PATH2=""
    PATH1=$(get_real_path "$3")
    if [ $# == 4 ]
    then
        PATH2=$(get_real_path "$4")
    fi

    if [ ! -d "$PATH1" ]
    then
        echo "error: DATASET_PATH=$PATH1 is not a directory" >&2
        exit 1
    fi

    if [ $# == 4 ] && [ ! -f "$PATH2" ]
    then
        echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" >&2
        exit 1
    fi

    local device_num
    if ! device_num=$(get_gpu_device_num "$2")
    then
        # The helper may have printed its diagnostic on stdout; pass it on.
        if [ -n "${device_num}" ]
        then
            echo "${device_num}" >&2
        fi
        exit 1
    fi
    if [ "${device_num}" -lt 1 ]
    then
        echo "error: DEVICE_ID_LIST=$2 does not contain any device id" >&2
        exit 1
    fi

    ulimit -u unlimited
    export DEVICE_NUM=${device_num}
    export RANK_SIZE=${device_num}
    export CUDA_VISIBLE_DEVICES=$2

    # --pre_trained is added only when a checkpoint was given.
    local -a train_args=(--device_target="$1" --dataset_path="$PATH1")
    if [ $# == 4 ]
    then
        train_args+=(--pre_trained="$PATH2")
    fi

    rm -rf ./train
    mkdir ./train
    cp ../*.py ./train
    cp ./*.sh ./train
    cp -r ../src ./train
    cd ./train || exit
    echo "start training"
    env > env.log
    mpirun --allow-run-as-root -n "${RANK_SIZE}" \
        python train.py "${train_args[@]}" &> train.log &

    cd .. || exit
 }


 # Dispatch on the device target given as the first argument. "$1" is quoted so
 # that a missing or empty argument reaches the "Unsupported" branch instead of
 # making `[` fail with a syntax error.
 if [ "$1" = "Ascend" ] ; then
    run_ascend "$@"
 elif [ "$1" = "GPU" ] ; then
    run_gpu "$@"
 else
    echo "Unsupported device target: $1"
 fi;
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
@@ -1,96 +0,0 @@
 #!/usr/bin/env bash
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================

 # Launch training on Ascend through src/launch.py.
 # Arguments:
 #   $1 - device target, forwarded as --device_target
 #   $2 - DEVICE_NUM, an integer from 1 to 8, forwarded as --nproc_per_node
 #   $3 - forwarded as --server_id
 #   $4 - visible device list, forwarded as --visible_devices
 #   $5 - dataset path (directory or file)
 #   $6 - (optional) pre-trained checkpoint, forwarded as --pre_trained
 # Side effects: recreates ../train, changes into it and prepends the script
 #   directory to PYTHONPATH.
 run_ascend()
 {
    # The count is invalid when it is below 1 OR above 8 (or not a number).
    if [[ ! "$2" =~ ^[0-9]+$ ]] || [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
    then
        echo "error: DEVICE_NUM=$2 is not in (1-8)"
        exit 1
    fi

    if [ ! -d "$5" ] && [ ! -f "$5" ]
    then
        echo "error: DATASET_PATH=$5 is not a directory or file"
        exit 1
    fi

    BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
    if [ -d "../train" ];
    then
        rm -rf ../train
    fi
    mkdir ../train
    cd ../train || exit
    python "${BASEPATH}/../src/launch.py" \
            --nproc_per_node="$2" \
            --visible_devices="$4" \
            --server_id="$3" \
            --training_script="${BASEPATH}/../train.py" \
            --dataset_path="$5" \
            --pre_trained="$6" \
            --device_target="$1" &> train.log &  # dataset train folder
 }

 # Launch training on GPU through mpirun.
 # Arguments:
 #   $1 - device target, forwarded as --device_target
 #   $2 - DEVICE_NUM, an integer from 1 to 8, passed to mpirun -n
 #   $3 - visible device list, exported as CUDA_VISIBLE_DEVICES
 #   $4 - dataset directory
 #   $5 - pre-trained checkpoint, forwarded as --pre_trained
 # Side effects: recreates ../train, changes into it and prepends the script
 #   directory to PYTHONPATH.
 run_gpu()
 {
    # The count is invalid when it is below 1 OR above 8 (or not a number).
    if [[ ! "$2" =~ ^[0-9]+$ ]] || [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
    then
        echo "error: DEVICE_NUM=$2 is not in (1-8)"
        exit 1
    fi

    if [ ! -d "$4" ]
    then
        echo "error: DATASET_PATH=$4 is not a directory"
        exit 1
    fi

    BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
    if [ -d "../train" ];
    then
        rm -rf ../train
    fi
    mkdir ../train
    cd ../train || exit

    export CUDA_VISIBLE_DEVICES="$3"
    # NOTE(review): the log lands one level above the freshly created ../train
    # directory, unlike the Ascend path which logs inside it - confirm intent.
    mpirun -n "$2" --allow-run-as-root \
    python "${BASEPATH}/../train.py" \
        --dataset_path="$4" \
        --device_target="$1" \
        --pre_trained="$5"  &> ../train.log &  # dataset train folder
 }

 # Entry point: require 5 or 6 arguments, then dispatch on the device target.
 if [ $# -gt 6 ] || [ $# -lt 5 ]
 then
    # printf emits real line breaks; bash's builtin echo would print "\n" literally.
    printf '%s\n' \
        "Usage:" \
        "  Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]" \
        "  GPU: sh run_train_quant.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]"
    exit 1
 fi

 # "$1" is quoted so an empty or space-containing value cannot break the test.
 if [ "$1" = "Ascend" ] ; then
    run_ascend "$@"
 elif [ "$1" = "GPU" ] ; then
    run_gpu "$@"
 else
    echo "Unsupported device target."
 fi;

--- a/model_zoo/official/cv/resnet50_quant/Readme.md
+++ b/model_zoo/official/cv/resnet50_quant/Readme.md
@@ -1,4 +1,43 @@
 # Contents
 # ResNet-50_quant Example

 ## Description

 This is an example of training ResNet-50_quant with ImageNet2012 dataset in MindSpore.

 ## Requirements

 - Install [MindSpore](https://www.mindspore.cn/install/en).

 - Download the dataset ImageNet2012 

 > Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows:
 > ```
 > .  
 > ├── ilsvrc                  # train dataset
 > └── ilsvrc_eval             # infer dataset: images should be classified into 1000 directories firstly, just like train images
 > ```


 ## Example structure

 ```shell
 resnet50_quant/
  ├── eval.py
  ├── models
  │   └── resnet_quant.py
  ├── Readme.md
  ├── scripts
  │   ├── run_infer.sh
  │   └── run_train.sh
  ├── src
  │   ├── config.py
  │   ├── crossentropy.py
  │   ├── dataset.py
  │   ├── launch.py
  │   └── lr_generator.py
  └── train.py
 ```

 - [resnet50 Description](#resnet50-description)
 - [Model Architecture](#model-architecture)
@@ -88,21 +127,17 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil

 ### Usage


 You can start training using Python or shell scripts. The shell scripts are used as follows:

 - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH][CKPT_PATH]
 - Ascend: sh run_train.sh Ascend [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
 ### Launch

 ```
 # training example
  shell:
      Ascend: sh run_train.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/resnet/train/ Resnet50-90_5004.ckpt
  # training example
  Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ 
 ```

 ### Result

 Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log  will be redirected to `./train/train.log` like followings.
 Training results are stored in the example path. Checkpoints are stored in `./train/device$i/` by default, and the training log is redirected to `./train/device$i/train.log`, as shown below.

 ```
 epoch: 1 step: 5004, loss is 4.8995576
--- a/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh
+++ b/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env bash
 #!/bin/bash
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,49 +14,259 @@
 # limitations under the License.
 # ============================================================================

 run_ascend()
 {
    if [ $2 -lt 1 ] && [ $2 -gt 8 ]
    then
        echo "error: DEVICE_NUM=$2 is not in (1-8)"
    exit 1
 # Print the absolute form of the path given as $1.
 # An argument that already starts with "/" is printed unchanged; anything else
 # is resolved against $PWD with `realpath -m`, which does not require the path
 # to exist. Every expansion is quoted so paths containing spaces stay intact.
 get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        printf '%s\n' "$1"
    else
        realpath -m -- "$PWD/$1"
    fi
 }

    if [ ! -d $5 ] && [ ! -f $5 ]
    then
        echo "error: DATASET_PATH=$5 is not a directory or file"
    exit 1
 # check_and_get_Ascend_device(){

 #     #device_list=(${1//,/ })
 #     IFS=',' read -ra device_list <<<"$1"
 #     last_device_id=0
 #     first_device_id=8
 #     device_used=(0 0 0 0 0 0 0 0)

 #     for var in "${device_list[@]}"
 #     do  
        
 #         if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ]
 #         then 
 #             echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!"
 #             exit 1
 #         fi

 #         if [  ${device_used[$((var))]} -eq 0 ]
 #         then 
 #             device_used[ $((var)) ]=1
 #         else
 #             echo "error: device id is duplicate, please check your device id list!"
 #             exit 1
 #         fi

 #         if [ ${last_device_id} \< $((var)) ]
 #         then 
 #             last_device_id=$((var))
 #         fi
 #         if [ ${first_device_id} \> $((var)) ]
 #         then
 #             first_device_id=$((var))
 #         fi
 #     done

 #     device_num=`expr ${last_device_id} - ${first_device_id} + 1`
 #     if [ ${device_num} != ${#device_list[*]} ]
 #     then 
 #         echo "error: the Ascend chips used must be continuous, please check your device id list!"
 #         exit 1
 #     fi

 #     if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ]
 #     then
 #         if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ]
 #         then
 #             echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips."
 #             exit 1
 #         fi
 #     fi

 #     echo "${first_device_id},`expr ${last_device_id} + 1`"
 # }

 # get_hccl_name(){

 #     server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:")
 #     device_num=`expr $2 - $1`
 #     device_id_list=""

 #     for(( i=$1 ; i < $2 ; i++ ))
 #     do 
 #         device_id_list=${device_id_list}$i
 #     done
 #     hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json"

 #     echo ${hccl_name}
 # }


 # Launch distributed training on Ascend, one background train.py process per device.
 # Arguments:
 #   $1 - device target, forwarded to train.py as --device_target
 #   $2 - rank table file; the device ids are read from the third "_"-separated
 #        field of its file name (e.g. hccl_4p_0123_x.x.x.x.json -> "0123"),
 #        one digit per device
 #   $3 - dataset directory
 #   $4 - (optional) pre-trained checkpoint file
 # Side effects: recreates ./train, exports DEVICE_NUM, RANK_SIZE,
 #   RANK_TABLE_FILE, SERVER_ID, DEVICE_ID and RANK_ID. Sources are copied from
 #   ../, ../src and ../models relative to the current directory.
 run_ascend(){

    if [ $# != 3 ] && [ $# != 4 ]
    then
        echo "Usage: bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" >&2
        exit 1
    fi

    local PATH1 PATH2 PATH3=""
    PATH1=$(get_real_path "$2")
    PATH2=$(get_real_path "$3")
    if [ $# == 4 ]
    then
        PATH3=$(get_real_path "$4")
    fi

    if [ ! -f "$PATH1" ]
    then
        echo "error: RANK_TABLE_FILE=$PATH1 is not a file" >&2
        exit 1
    fi

    if [ ! -d "$PATH2" ]
    then
        echo "error: DATASET_PATH=$PATH2 is not a directory" >&2
        exit 1
    fi

    if [ $# == 4 ] && [ ! -f "$PATH3" ]
    then
        echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" >&2
        exit 1
    fi

    # Derive the device list from the rank table file name.
    local rank_file_name=${2##*/}
    local -a name_fields=()
    IFS='_' read -ra name_fields <<<"${rank_file_name}"
    local device_id_list=${name_fields[2]}
    # Without this check a differently named file would yield zero devices and
    # the loop below would silently start nothing.
    if [[ ! "${device_id_list}" =~ ^[0-9]+$ ]]
    then
        echo "error: cannot read device ids from RANK_TABLE_FILE name '${rank_file_name}' (expected something like hccl_4p_0123_x.x.x.x.json)" >&2
        exit 1
    fi
    local first_device=${device_id_list:0:1}
    local device_num=${#device_id_list}

    ulimit -u unlimited
    export DEVICE_NUM=${device_num}
    export RANK_SIZE=${device_num}
    export RANK_TABLE_FILE=$PATH1

    export SERVER_ID=0
    local rank_start=$((DEVICE_NUM * SERVER_ID))

    # Arguments shared by every process; --pre_trained is added only when given.
    local -a train_args=(
        --device_target="$1"
        --run_distribute=True
        --device_num="$DEVICE_NUM"
        --dataset_path="$PATH2"
    )
    if [ $# == 4 ]
    then
        train_args+=(--pre_trained="$PATH3")
    fi

    rm -rf ./train
    mkdir ./train
    local i
    for ((i = 0; i < device_num; i++))
    do
        # Device ids are taken as consecutive, starting at the first listed id.
        export DEVICE_ID=$((first_device + i))
        export RANK_ID=$((rank_start + i))
        mkdir "./train/device$i"
        cp ../*.py "./train/device$i"
        cp ./*.sh "./train/device$i"
        cp -r ../src "./train/device$i"
        cp -r ../models "./train/device$i"
        cd "./train/device$i" || exit
        echo "start training for rank $RANK_ID, device $DEVICE_ID"
        env > env.log
        python train.py "${train_args[@]}" &> train.log &
        cd ../.. || exit
    done
 }

 if [ $# -gt 6 ] || [ $# -lt 4 ]
 then
    echo "Usage:\n \
          Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
          "
 exit 1
 fi
 # run_gpu(){

 #     if [ $# -gt 3 ] || [ $# -lt 2 ]
 #     then
 #         echo "Usage:  sh run_train_distribute_quant.sh  [GPU] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n "
 #         exit 1
 #     fi

 #     PATH1=$(get_real_path $2)
    
 #     if [ $# == 3 ]
 #     then 
 #         PATH2=$(get_real_path $3)
 #     fi

 #     if [ ! -d $PATH1 ]
 #     then 
 #         echo "error: DATASET_PATH=$PATH1 is not a directory"
 #         exit 1
 #     fi 

 #     if [ $# == 3 ] && [ ! -f $PATH2 ]
 #     then
 #         echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
 #         exit 1
 #     fi

 #     ulimit -u unlimited
 #     export RANK_SIZE=2
 #     #export CUDA_VISIBLE_DEVICES=1,2

 #     rm -rf ./train_parallel
 #     mkdir ./train_parallel
 #     cp ../*.py ./train_parallel
 #     cp *.sh ./train_parallel
 #     cp -r ../src ./train_parallel
 #     cp -r ../models ./train_parallel
 #     cd ./train_parallel || exit
 #     echo "start training"
 #     env > env.log
 #     if [ $# == 2 ]
 #     then	    
 #         mpirun --allow-run-as-root -n $RANK_SIZE
 #         python train.py --device_target=$1  --dataset_path=$PATH1 &> log &
 #     fi
    
 #     if [ $# == 3 ]
 #     then
 #         mpirun --allow-run-as-root -n $RANK_SIZE
 #         python train.py --device_traget=$1  --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
 #     fi
 #     cd ..
 # }


 # Only the Ascend target is supported by this script. "$1" is quoted so that a
 # missing or empty argument reaches the else branch instead of making `[` fail.
 if [ "$1" = "Ascend" ] ; then
    run_ascend "$@"
 else
    echo "not support platform"
 fi;

    echo "Unsupported device target: $1"
 fi;