
run_distribute_train.sh

#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "Please run the script as: "
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
echo "It is better to use absolute paths."
echo "=============================================================================================================="
if [ $# != 5 ] && [ $# != 7 ]
then
    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
    exit 1
fi

# Before starting distributed training, create the MindRecord files once.
python train.py --only_create_dataset=1

echo "After running the script, the network runs in the background. The log will be generated in LOGx/log.txt"

export RANK_SIZE=$1
EPOCH_SIZE=$2
LR=$3
DATASET=$4
PRE_TRAINED=$6
PRE_TRAINED_EPOCH_SIZE=$7
export MINDSPORE_HCCL_CONFIG_PATH=$5

# Launch one training process per device; each rank runs from its own LOG$i
# directory with its own copy of the sources and its own log.txt.
for ((i = 0; i < RANK_SIZE; i++))
do
    export DEVICE_ID=$i
    rm -rf LOG$i
    mkdir ./LOG$i
    cp ../*.py ./LOG$i
    cp -r ../src ./LOG$i
    cd ./LOG$i || exit
    export RANK_ID=$i
    echo "start training for rank $i, device $DEVICE_ID"
    env > env.log
    # 5 arguments: train from scratch.
    if [ $# == 5 ]
    then
        python train.py \
            --distribute=1 \
            --lr=$LR \
            --dataset=$DATASET \
            --device_num=$RANK_SIZE \
            --device_id=$DEVICE_ID \
            --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
    fi
    # 7 arguments: resume from a pre-trained checkpoint.
    if [ $# == 7 ]
    then
        python train.py \
            --distribute=1 \
            --lr=$LR \
            --dataset=$DATASET \
            --device_num=$RANK_SIZE \
            --device_id=$DEVICE_ID \
            --pre_trained=$PRE_TRAINED \
            --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
            --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
    fi
    cd ../
done
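
For reference, typical invocations would look like the following, based on the example printed by the script itself. The dataset path, HCCL config file, and checkpoint path are placeholders, not values shipped with this repository; since the script uses bash-specific syntax (the C-style for loop), invoking it with bash is the safest choice even though the usage text says sh.

    # 8 devices, 500 epochs, learning rate 0.2, COCO dataset, from scratch
    bash run_distribute_train.sh 8 500 0.2 coco /data/hccl.json

    # Same, but resuming from a checkpoint that already covers 200 epochs
    bash run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt 200

After launch, per-rank output goes to LOG0/log.txt through LOG7/log.txt; the shell prompt returns immediately because each training process is backgrounded.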