You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

run.sh 3.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. #!/usr/bin/env bash
  2. # Copyright 2020 Huawei Technologies Co., Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # ============================================================================
  # Defaults for a single-device run. The option handling further down may
  # override DEVICE_ID and RANK_SIZE, and the launch loop rewrites RANK_ID
  # and DEVICE_ID when more than one rank is requested.
  export DEVICE_ID=0
  export RANK_ID=0
  export RANK_SIZE=1

  # Normalise the command line with getopt. The -u (unquoted) output format is
  # kept on purpose: $options is re-split on whitespace further down when it is
  # handed to set_device_id / set_hccl_json, so it has to stay a flat string
  # without shell quoting.
  options=$(getopt -u -o ht:n:i:j:c:o:v: \
    -l help,task:,device_num:,device_id:,hccl_json:,config:,output:,vocab: \
    -- "$@")

  # Turn the normalised string back into positional parameters by plain
  # whitespace splitting instead of `eval`. Because the getopt output is
  # unquoted, eval would execute any shell metacharacters (";", "$(...)",
  # backticks, newlines) contained in an argument value.
  # read returns non-zero when it reaches end of input with -d '', which is the
  # normal case here, hence the `|| true`.
  opt_args=()
  read -r -d '' -a opt_args <<<"$options" || true
  set -- "${opt_args[@]}"

  # Show the normalised options. printf is used so that a leading "-n" or "-e"
  # is printed literally instead of being consumed as a flag by echo.
  printf '%s\n' "${opt_args[*]}"
  # Print the usage summary (synopsis plus one line per option) to stdout.
  # Takes no arguments.
  echo_help() {
    # One printf call; every argument becomes one output line.
    printf '%s\n' \
      "Usage:" \
      "bash train.sh [-h] [-t t|i] [-n N] [-i N] [-j FILE] [-c FILE] [-o FILE] [-v FILE]" \
      "options:" \
      " -h --help show usage" \
      " -t --task select task, 't' for training and 'i' for inference" \
      " -n --device_num training with N devices" \
      " -i --device_id training with device i" \
      " -j --hccl_json set the rank table file" \
      " -c --config set the configuration file" \
      " -o --output set the output file of inference" \
      " -v --vocab set the vocabulary"
  }
  #######################################
  # Export the rank table file named by -j / --hccl_json.
  #
  # Scans the given words from left to right. At the first "-j" or
  # "--hccl_json" the following word is exported as both
  # MINDSPORE_HCCL_CONFIG_PATH and RANK_TABLE_FILE
  # (for example /data/wsc/hccl_2p_01.json) and the scan stops.
  #
  # Globals:   MINDSPORE_HCCL_CONFIG_PATH, RANK_TABLE_FILE (exported)
  # Arguments: the option words to scan
  # Outputs:   an error message on stderr when the flag has no value
  # Returns:   0 if the variables were exported or the flag is absent,
  #            1 if the flag is present without a value (nothing is exported)
  #######################################
  set_hccl_json() {
    # Loop on the argument count, not on "$1" being non-empty, so an empty word
    # in the middle of the list does not end the scan early.
    while (( $# > 0 )); do
      case "$1" in
        -j | --hccl_json)
          # Do not overwrite an existing setting with an empty path.
          if [[ -z "${2:-}" ]]; then
            echo "set_hccl_json: $1 requires a file path" >&2
            return 1
          fi
          export MINDSPORE_HCCL_CONFIG_PATH="$2"
          export RANK_TABLE_FILE="$2"
          return 0
          ;;
      esac
      shift
    done
    return 0
  }
  #######################################
  # Export DEVICE_ID from -i / --device_id when the value is a device in 0..7.
  #
  # Scans the given words from left to right. At the first "-i" or
  # "--device_id" the following word is checked; if it is a decimal number in
  # the range 0..7 it is exported as DEVICE_ID, otherwise DEVICE_ID is left
  # untouched. The scan stops at that first occurrence either way.
  #
  # Globals:   DEVICE_ID (exported when the value is valid)
  # Arguments: the option words to scan
  # Returns:   0
  #######################################
  set_device_id() {
    # Loop on the argument count, not on "$1" being non-empty, so an empty word
    # in the middle of the list does not end the scan early.
    while (( $# > 0 )); do
      case "$1" in
        -i | --device_id)
          # Validate by pattern rather than with -ge/-le inside [[ ]]: those
          # operators evaluate their operands arithmetically, so an empty or
          # non-numeric value counts as 0 and would be exported verbatim.
          # Leading zeros are tolerated ("07" is device 7).
          if [[ "${2:-}" =~ ^0*[0-7]$ ]]; then
            export DEVICE_ID="$2"
          fi
          return 0
          ;;
      esac
      shift
    done
    return 0
  }
  # Walk the getopt-normalised positional parameters and record the settings.
  # Sets: task (train|infer), configurations, output, vocab, and the exported
  # DEVICE_ID / RANK_SIZE / HCCL_FLAG / DEPLOY_MODE variables.
  # Stops at the "--" separator; anything after it is ignored.
  while (( $# > 0 )); do
    case "$1" in
      -h | --help)
        echo_help
        # Asking for help must not fall through and launch a job.
        exit 0
        ;;
      -t | --task)
        echo "task:"
        case "${2:-}" in
          t) task=train ;;
          i) task=infer ;;
          *)
            # Fail loudly instead of continuing with no task selected.
            echo "unknown task '${2:-}': expected 't' or 'i'" >&2
            exit 1
            ;;
        esac
        shift 2
        ;;
      -n | --device_num)
        echo "device_num"
        if [[ ! "${2:-}" =~ ^[0-9]+$ ]]; then
          echo "device_num must be a non-negative integer, got '${2:-}'" >&2
          exit 1
        fi
        # 10# forces base 10 so a zero-padded count is not read as octal.
        device_num=$(( 10#$2 ))
        if (( device_num == 1 )); then
          # Single device: take the device id from -i/--device_id if valid.
          # $options is a flat getopt -u string and is split on purpose.
          # shellcheck disable=SC2086
          set_device_id $options
        elif (( device_num > 1 )); then
          # Multiple devices: enable HCCL and pick up the rank table file.
          export HCCL_FLAG=1
          export DEPLOY_MODE=0
          export RANK_SIZE="$device_num"
          # shellcheck disable=SC2086
          set_hccl_json $options
        fi
        shift 2
        ;;
      -i | --device_id)
        echo "set device id"
        export DEVICE_ID="$2"
        shift 2
        ;;
      -c | --config)
        echo "config"
        configurations="$2"
        shift 2
        ;;
      -o | --output)
        echo "output"
        output="$2"
        shift 2
        ;;
      -v | --vocab)
        echo "vocab"
        vocab="$2"
        shift 2
        ;;
      --)
        shift
        break
        ;;
      *)
        # Words with no case of their own (for example -j and its value, which
        # set_hccl_json reads from $options) are skipped one at a time.
        shift
        ;;
    esac
  done
  # Launch one background Python job per rank. Each job runs in its own
  # run_mass_<DEVICE_ID> directory, created next to the script's parent
  # directory and populated with train.py, eval.py, the configuration file and
  # (when given) the vocabulary file.

  # Refuse to start without a task or a configuration file. Checking here,
  # before anything is touched, keeps an incomplete command line from deleting
  # and recreating an existing run directory for nothing.
  if [[ -z "${task:-}" ]]; then
    echo "no task selected: pass -t t (train) or -t i (infer)" >&2
    exit 1
  fi
  if [[ -z "${configurations:-}" ]]; then
    echo "no configuration file given: pass -c FILE" >&2
    exit 1
  fi

  # Absolute directory of this script. An empty result means the cd inside the
  # subshell failed; stop rather than run the rm -rf below somewhere else.
  file_path=$(cd "$(dirname -- "$0")" || exit; pwd)
  if [[ -z "$file_path" ]]; then
    echo "cannot determine the script directory" >&2
    exit 1
  fi

  for (( i = 0; i < RANK_SIZE; i++ )); do
    if (( RANK_SIZE > 1 )); then
      # Multi-device run: rank i is bound to device i.
      echo "$RANK_SIZE"
      export RANK_ID="$i"
      export DEVICE_ID="$i"
    fi
    echo "Working on device $i"

    # Work from the parent of the script directory. Relative -c / -v paths are
    # therefore resolved against that directory, not the caller's cwd.
    cd "$file_path" || exit
    cd ../ || exit

    # Start from a fresh run directory for this device.
    run_dir="./run_mass_${DEVICE_ID}"
    rm -rf -- "$run_dir"
    mkdir -- "$run_dir"
    cp -- train.py "$run_dir"
    cp -- eval.py "$run_dir"
    cp -- "$configurations" "$run_dir"
    if [[ -n "${vocab:-}" ]]; then
      cp -- "$vocab" "$run_dir"
    fi

    cd "$run_dir" || exit
    # Record the environment the job starts with at the top of its log.
    env > log.log
    echo "$task"
    if [[ "$task" == "train" ]]; then
      python train.py --config "${configurations##*/}" >>log.log 2>&1 &
    elif [[ "$task" == "infer" ]]; then
      # The files were copied into the run directory, so only their base names
      # are passed. ${var:+"..."} expands to nothing when the value is unset or
      # empty, keeping the argument list identical to an unquoted expansion
      # while staying safe for values containing whitespace.
      python eval.py --config "${configurations##*/}" \
        --output ${output:+"$output"} \
        --vocab ${vocab:+"${vocab##*/}"} \
        >>log_infer.log 2>&1 &
    fi
    cd ../ || exit
  done