You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

run_gpu.sh 4.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. #!/usr/bin/env bash
  2. # Copyright 2020 Huawei Technologies Co., Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # ============================================================================
  # Default single-device settings; later option parsing may override them.
  export DEVICE_ID=0
  export RANK_ID=0
  export RANK_SIZE=1

  #######################################
  # Normalize a command line with util-linux getopt.
  # Every value-taking option requires its argument, so "-i 3" and "-i3" are
  # both accepted, and "-c" is recognised as the short form of --config.
  # Unquoted (-u) output is used on purpose: the resulting string is re-split
  # on whitespace elsewhere in this script, so option values must not contain
  # whitespace.
  # Arguments: the raw command-line words
  # Outputs:   the normalized option string on stdout
  # Returns:   getopt's exit status (non-zero on an unknown option or a
  #            missing argument)
  #######################################
  normalize_options() {
    getopt -u -o ht:n:i:c:o:v:m: \
      -l help,task:,device_num:,device_id:,config:,output:,vocab:,metric: \
      -- "$@"
  }

  if ! options=$(normalize_options "$@"); then
    echo "failed to parse command line options" >&2
    exit 1
  fi

  # Replace the positional parameters by plain word splitting instead of eval,
  # so that option values are never executed as shell code. Globbing is
  # switched off while splitting so values such as "*.yaml" stay literal.
  set -f
  # shellcheck disable=SC2086
  set -- $options
  set +f

  # printf instead of echo: a leading "-n"/"-e" in the option string must not
  # be interpreted as a flag of echo.
  printf '%s\n' "${options# }"
  22. echo_help()
  23. {
  24. echo "Usage:"
  25. echo "bash train.sh [-h] [-t t|i] [-n N] [-i N] [-j FILE] [-c FILE] [-o FILE] [-v FILE]"
  26. echo "options:"
  27. echo " -h --help show usage"
  28. echo " -t --task select task, 't' for training and 'i' for inference"
  29. echo " -n --device_num training with N devices"
  30. echo " -i --device_id training with device i"
  31. echo " -c --config set the configuration file"
  32. echo " -o --output set the output file of inference"
  33. echo " -v --vocab set the vocabulary"
  34. echo " -m --metric set the metric"
  35. }
  #######################################
  # Scan an argument list for the first -i/--device_id option and export its
  # value as DEVICE_ID.
  # The value is accepted only if it is a plain decimal integer in 0..7;
  # anything else (missing, non-numeric, out of range) leaves DEVICE_ID
  # untouched. The check is a pattern match rather than an arithmetic test,
  # because [[ x -ge 0 ]] evaluates x as an arithmetic expression and would
  # treat arbitrary words as 0.
  # Globals:   DEVICE_ID (exported when a valid id is found)
  # Arguments: option words to scan
  # Returns:   0
  #######################################
  set_device_id() {
    # Count-based loop so that an empty word does not end the scan early.
    while (( $# > 0 )); do
      if [[ "$1" == "-i" || "$1" == "--device_id" ]]; then
        if [[ "${2-}" =~ ^0*[0-7]$ ]]; then
          export DEVICE_ID=$2
        fi
        # Only the first occurrence is considered.
        break
      fi
      shift
    done
    return 0
  }
  #######################################
  # Walk the normalized option words and record the selected settings.
  # Globals written:
  #   task               "train" for -t t, "infer" for -t i
  #   RANK_SIZE          exported when -n is greater than 1
  #   DEVICE_ID          exported by -i, or via set_device_id when -n is 1
  #   configurations     value of -c/--config
  #   output             value of -o/--output
  #   vocab              value of -v/--vocab
  #   metric             value of -m/--metric
  #   parsed_arg_count   number of words consumed, including a closing "--"
  # Globals read: options (re-split and handed to set_device_id)
  # Arguments: option words, normally terminated by "--"
  # Exits: 0 after printing help for -h; 1 on an invalid -t, -n or -i value
  #######################################
  parse_options() {
    local total=$#
    local used value
    # Count-based loop: an empty word must not end parsing early.
    while (( $# > 0 )); do
      used=2
      value=${2-}
      case "$1" in
        -h|--help)
          # Asking for help must not go on to launch a job.
          echo_help
          exit 0
          ;;
        -t|--task)
          echo "task:"
          case "$value" in
            t) task=train ;;
            i) task=infer ;;
            *)
              echo "invalid task '$value': expected 't' or 'i'" >&2
              exit 1
              ;;
          esac
          ;;
        -n|--device_num)
          echo "device_num"
          if [[ ! "$value" =~ ^[0-9]+$ ]]; then
            echo "invalid device_num '$value': expected a non-negative integer" >&2
            exit 1
          fi
          # 10# forces base 10 so that values like "08" are not read as octal.
          if (( 10#$value == 1 )); then
            # shellcheck disable=SC2086  # options is deliberately re-split
            set_device_id $options
          elif (( 10#$value > 1 )); then
            export RANK_SIZE=$value
          fi
          ;;
        -i|--device_id)
          echo "set device id"
          if [[ ! "$value" =~ ^[0-9]+$ ]]; then
            echo "invalid device_id '$value': expected a non-negative integer" >&2
            exit 1
          fi
          export DEVICE_ID=$value
          ;;
        -c|--config)
          echo "config"
          configurations=$value
          ;;
        -o|--output)
          echo "output"
          output=$value
          ;;
        -v|--vocab)
          echo "vocab"
          vocab=$value
          ;;
        -m|--metric)
          echo "metric"
          metric=$value
          ;;
        --)
          shift
          break
          ;;
        *)
          # Unknown word: skip it on its own.
          used=1
          ;;
      esac
      # Never shift past the end; a failed "shift 2" would leave the loop
      # spinning on the same word forever.
      if (( used > $# )); then
        used=$#
      fi
      shift "$used"
    done
    parsed_arg_count=$(( total - $# ))
  }

  parse_options "$@"
  # Drop the consumed words so that only the non-option arguments remain.
  shift "$parsed_arg_count"
  # ---------------------------------------------------------------------------
  # Stage a fresh working directory "<task>_prophetnet_<DEVICE_ID>" in the
  # parent of this script's directory, copy the sources and configuration into
  # it, and run training or inference from there.
  # Reads: task, configurations, vocab, output, metric, DEVICE_ID, RANK_SIZE.
  # ---------------------------------------------------------------------------

  # Refuse to touch the filesystem without a task and a configuration file;
  # otherwise the directory name degenerates to "_prophetnet_<id>" and the
  # copy/launch steps below cannot succeed.
  if [[ "${task:-}" != "train" && "${task:-}" != "infer" ]]; then
    echo "no valid task selected; use -t t (train) or -t i (infer)" >&2
    exit 1
  fi
  if [[ -z "${configurations:-}" ]]; then
    echo "no configuration file given; use -c FILE" >&2
    exit 1
  fi

  file_path=$(cd "$(dirname "$0")" || exit; pwd)
  if [ "$RANK_SIZE" -gt 1 ]; then
    echo "Working on $RANK_SIZE device"
  fi
  echo "Working on file ${task}_prophetnet_$DEVICE_ID"

  work_dir="./${task}_prophetnet_${DEVICE_ID}"
  cd "$file_path" || exit
  cd ../ || exit

  # Start from an empty working directory.
  rm -rf -- "$work_dir"
  mkdir -- "$work_dir" || exit

  # Plain files and directories are copied separately so that regular files
  # keep the default (non-recursive) cp semantics.
  staged_files=(train_gradient_accumulation.py train.py eval.py "$configurations")
  if [[ -n "${vocab:-}" ]]; then
    staged_files+=("$vocab")
  fi
  cp -- "${staged_files[@]}" "$work_dir" \
    || echo "warning: some files could not be copied to $work_dir" >&2
  cp -r -- src config "$work_dir" \
    || echo "warning: some directories could not be copied to $work_dir" >&2

  cd "$work_dir" || exit
  # Record the environment at the top of the log.
  env > log.log
  echo "$task"

  # The config was copied into the working directory, so pass its base name.
  config_name=${configurations##*/}
  if [[ "$task" == "train" ]]; then
    if [ "$RANK_SIZE" -gt 1 ]; then
      # Multi-device: one background mpirun job, output appended to log.log.
      mpirun -n "$RANK_SIZE" python train.py --config "$config_name" --platform GPU >>log.log 2>&1 &
    else
      # Single device: run in the foreground. Starting this in addition to
      # mpirun would launch a second, competing training run in the same
      # directory.
      python train.py --config "$config_name" --platform GPU
    fi
  else
    # Build the command as an array and add only the options that were
    # actually given, so an empty value can never swallow the next flag.
    infer_cmd=(python eval.py --config "$config_name")
    if [[ -n "${output:-}" ]]; then
      infer_cmd+=(--output "$output")
    fi
    if [[ -n "${vocab:-}" ]]; then
      infer_cmd+=(--vocab "${vocab##*/}")
    fi
    if [[ -n "${metric:-}" ]]; then
      infer_cmd+=(--metric "$metric")
    fi
    infer_cmd+=(--platform GPU)
    "${infer_cmd[@]}"
  fi
  cd ../