You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

run.sh 3.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. #!/usr/bin/env bash
  2. # Copyright 2020 Huawei Technologies Co., Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # ============================================================================
  16. export DEVICE_ID=0
  17. export RANK_ID=0
  18. export RANK_SIZE=1
  19. options=`getopt -u -o ht:n:i:j:c:o:v:m: -l help,task:,device_num:,device_id:,hccl_json:,config:,output:,vocab:,metric: -- "$@"`
  20. eval set -- "$options"
  21. echo $options
  22. echo_help()
  23. {
  24. echo "Usage:"
  25. echo "bash train.sh [-h] [-t t|i] [-n N] [-i N] [-j FILE] [-c FILE] [-o FILE] [-v FILE]"
  26. echo "options:"
  27. echo " -h --help show usage"
  28. echo " -t --task select task, 't' for training and 'i' for inference"
  29. echo " -n --device_num training with N devices"
  30. echo " -i --device_id training with device i"
  31. echo " -j --hccl_json set the rank table file"
  32. echo " -c --config set the configuration file"
  33. echo " -o --output set the output file of inference"
  34. echo " -v --vocab set the vocabulary"
  35. echo " -m --metric set the metric"
  36. }
  37. set_hccl_json()
  38. {
  39. while [ -n "$1" ]
  40. do
  41. if [[ "$1" == "-j" || "$1" == "--hccl_json" ]]
  42. then
  43. export MINDSPORE_HCCL_CONFIG_PATH=$2
  44. export RANK_TABLE_FILE=$2
  45. break
  46. fi
  47. shift
  48. done
  49. }
  50. set_device_id()
  51. {
  52. while [ -n "$1" ]
  53. do
  54. if [[ "$1" == "-i" || "$1" == "--device_id" ]]
  55. then
  56. if [[ $2 -ge 0 && $2 -le 7 ]]
  57. then
  58. export DEVICE_ID=$2
  59. fi
  60. break
  61. fi
  62. shift
  63. done
  64. }
  65. while [ -n "$1" ]
  66. do
  67. case "$1" in
  68. -h|--help)
  69. echo_help
  70. shift
  71. ;;
  72. -t|--task)
  73. echo "task:"
  74. if [ "$2" == "t" ]
  75. then
  76. task=train
  77. elif [ "$2" == "i" ]
  78. then
  79. task=infer
  80. fi
  81. shift 2
  82. ;;
  83. -n|--device_num)
  84. echo "device_num"
  85. if [ $2 -eq 1 ]
  86. then
  87. set_device_id $options
  88. elif [ $2 -gt 1 ]
  89. then
  90. export HCCL_FLAG=1
  91. export DEPLOY_MODE=0
  92. export RANK_SIZE=$2
  93. set_hccl_json $options
  94. fi
  95. shift 2
  96. ;;
  97. -i|--device_id)
  98. echo "set device id"
  99. export DEVICE_ID=$2
  100. shift 2
  101. ;;
  102. -c|--config)
  103. echo "config";
  104. configurations=$2
  105. shift 2
  106. ;;
  107. -o|--output)
  108. echo "output";
  109. output=$2
  110. shift 2
  111. ;;
  112. -v|--vocab)
  113. echo "vocab";
  114. vocab=$2
  115. shift 2
  116. ;;
  117. -m|--metric)
  118. echo "metric";
  119. metric=$2
  120. shift 2
  121. ;;
  122. --)
  123. shift
  124. break
  125. ;;
  126. *)
  127. shift
  128. ;;
  129. esac
  130. done
  131. file_path=$(cd "$(dirname $0)" || exit; pwd)
  132. for((i=0; i < $RANK_SIZE; i++))
  133. do
  134. if [ $RANK_SIZE -gt 1 ]
  135. then
  136. echo $RANK_SIZE
  137. export RANK_ID=$i
  138. export DEVICE_ID=$[i]
  139. fi
  140. echo "Working on device $i"
  141. cd $file_path || exit
  142. cd ../ || exit
  143. rm -rf ./run_mass_$DEVICE_ID
  144. mkdir ./run_mass_$DEVICE_ID
  145. cp train.py ./run_mass_$DEVICE_ID
  146. cp eval.py ./run_mass_$DEVICE_ID
  147. cp $configurations ./run_mass_$DEVICE_ID
  148. if [ $vocab ]
  149. then
  150. cp $vocab ./run_mass_$DEVICE_ID
  151. fi
  152. cd ./run_mass_$DEVICE_ID || exit
  153. env > log.log
  154. echo $task
  155. if [ "$task" == "train" ]
  156. then
  157. python train.py --config ${configurations##*/} >>log.log 2>&1 &
  158. elif [ "$task" == "infer" ]
  159. then
  160. python eval.py --config ${configurations##*/} --output ${output} --vocab ${vocab##*/} --metric ${metric} >>log_infer.log 2>&1 &
  161. fi
  162. cd ../
  163. done