You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

run_distribute_train.sh 2.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. #!/bin/bash
  2. # Copyright 2020 Huawei Technologies Co., Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # ============================================================================
  16. echo "=============================================================================================================="
  17. echo "Please run the scipt as: "
  18. echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH"
  19. echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
  20. echo "It is better to use absolute path."
  21. echo "=============================================================================================================="
  22. rm -rf run_distribute_train
  23. mkdir run_distribute_train
  24. cd run_distribute_train || exit
  25. EPOCH_SIZE=$2
  26. DATA_PATH=$3
  27. export MINDSPORE_HCCL_CONFIG_PATH=$4
  28. export RANK_TABLE_FILE=$4
  29. export RANK_SIZE=$1
  30. export HCCL_FLAG=1
  31. export DEPLOY_MODE=0
  32. for((i=0;i<RANK_SIZE;i++))
  33. do
  34. export DEVICE_ID=$i
  35. export RANK_ID=$i
  36. export GE_USE_STATIC_MEMORY=1
  37. mkdir helper$i
  38. cp -rf ../src/ ../train.py ./helper$i
  39. cd ./helper$i || exit
  40. echo "start training for rank $i, device $DEVICE_ID"
  41. env > env.log
  42. python train.py \
  43. --distribute="true" \
  44. --epoch_size=$EPOCH_SIZE \
  45. --device_id=$DEVICE_ID \
  46. --device_num=$RANK_SIZE \
  47. --enable_save_ckpt="true" \
  48. --enable_lossscale="true" \
  49. --do_shuffle="true" \
  50. --enable_data_sink="false" \
  51. --checkpoint_path="" \
  52. --save_checkpoint_steps=2500 \
  53. --save_checkpoint_num=30 \
  54. --data_path=$DATA_PATH > log.txt 2>&1 &
  55. cd ../
  56. done
  57. cd ..