You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

run_distribute_train.sh 2.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. #!/bin/bash
  2. # Copyright 2020 Huawei Technologies Co., Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # ============================================================================
  # Launch distributed training: create the mindrecord dataset once, then start
  # one background `train.py` process per device.
  #
  # Usage:
  #   sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR \
  #       ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH
  #
  # Arguments:
  #   $1  DEVICE_NUM                  number of processes to start; exported as
  #                                   RANK_SIZE and passed as --device_num
  #   $2  EPOCH_SIZE                  passed to train.py as --epoch_size
  #   $3  MINDRECORD_DIR              passed to train.py as --mindrecord_dir
  #   $4  IMAGE_DIR                   passed to train.py as --image_dir
  #   $5  ANNO_PATH                   passed to train.py as --anno_path
  #   $6  MINDSPORE_HCCL_CONFIG_PATH  exported under the same name
  #
  # Exports: MINDSPORE_HCCL_CONFIG_PATH, RANK_SIZE, and per process DEVICE_ID and
  #          RANK_ID (both set to the loop index).
  # Outputs: for every index i in [0, DEVICE_NUM) a fresh directory ./LOG<i>
  #          holding a copy of ./*.py, env.log (environment snapshot) and log.txt
  #          (stdout + stderr of that training process).
  # Exit:    1 on invalid arguments, failed dataset creation, or when a LOG
  #          directory cannot be created/entered; otherwise returns right after
  #          the background processes have been started.
  #
  # The banner tells users to invoke this file with `sh`, so only POSIX shell
  # constructs are used below (no `for ((...))`, no `[[ ]]`).

  echo "=============================================================================================================="
  echo "Please run the scipt as: "
  echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH"
  echo "for example: sh run_distribute_train.sh 8 100 /data/Mindrecord_train /data /data/train.txt /data/hccl.json"
  echo "It is better to use absolute path."
  echo "=============================================================================================================="

  # Print an absolute version of path $1. Absolute and empty values are printed
  # unchanged; relative ones are anchored at the current directory. The path
  # does not have to exist (MINDRECORD_DIR may only be created later).
  to_abs_path() {
    case "$1" in
      '' | /*) printf '%s\n' "$1" ;;
      *) printf '%s\n' "${PWD%/}/$1" ;;
    esac
  }

  if [ "$#" -lt 6 ]; then
    echo "error: expected 6 arguments, got $#" >&2
    exit 1
  fi

  # DEVICE_NUM drives the launch loop, so it must be a plain decimal number.
  case "$1" in
    '' | *[!0-9]*)
      echo "error: DEVICE_NUM must be a non-negative integer, got '$1'" >&2
      exit 1
      ;;
  esac

  DEVICE_NUM="$1"
  EPOCH_SIZE="$2"
  # Resolve paths once, here: the launch loop changes into ./LOG<i>, where a
  # relative path would otherwise point somewhere else than it does for the
  # dataset-creation step.
  MINDRECORD_DIR="$(to_abs_path "$3")"
  IMAGE_DIR="$(to_abs_path "$4")"
  ANNO_PATH="$(to_abs_path "$5")"
  HCCL_CONFIG_PATH="$(to_abs_path "$6")"

  # Before start distribute train, first create mindrecord files.
  if ! python train.py --only_create_dataset=1 \
    --mindrecord_dir="$MINDRECORD_DIR" \
    --image_dir="$IMAGE_DIR" \
    --anno_path="$ANNO_PATH"; then
    echo "error: creating the mindrecord dataset failed; training not started" >&2
    exit 1
  fi

  echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt"

  # Exported only after dataset creation, as before, so that step runs with the
  # caller's original environment.
  export MINDSPORE_HCCL_CONFIG_PATH="$HCCL_CONFIG_PATH"
  export RANK_SIZE="$DEVICE_NUM"

  i=0
  while [ "$i" -lt "$RANK_SIZE" ]; do
    export DEVICE_ID="$i"
    export RANK_ID="$i"

    # Start every run from an empty per-rank working directory.
    rm -rf "./LOG$i"
    if ! mkdir "./LOG$i"; then
      echo "error: cannot create ./LOG$i" >&2
      exit 1
    fi

    # The copies are only a snapshot of the sources (training runs
    # ../train.py), so a failed copy is reported but does not stop the launch.
    cp ./*.py "./LOG$i" || echo "warning: could not copy *.py into ./LOG$i" >&2

    cd "./LOG$i" || exit 1
    echo "start training for rank $i, device $DEVICE_ID"
    env > env.log
    python ../train.py \
      --distribute=1 \
      --device_num="$RANK_SIZE" \
      --device_id="$DEVICE_ID" \
      --mindrecord_dir="$MINDRECORD_DIR" \
      --image_dir="$IMAGE_DIR" \
      --epoch_size="$EPOCH_SIZE" \
      --anno_path="$ANNO_PATH" > log.txt 2>&1 &
    cd .. || exit 1

    i=$((i + 1))
  done