You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

launch.py 3.7 kB

5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """launch train script"""
  16. import os
  17. import sys
  18. import subprocess
  19. import shutil
  20. from argparse import ArgumentParser
  21. def parse_args():
  22. """
  23. parse args .
  24. Args:
  25. Returns:
  26. args.
  27. Examples:
  28. >>> parse_args()
  29. """
  30. parser = ArgumentParser(description="mindspore distributed training launch "
  31. "helper utilty that will spawn up "
  32. "multiple distributed processes")
  33. parser.add_argument("--nproc_per_node", type=int, default=1,
  34. help="The number of processes to launch on each node, "
  35. "for D training, this is recommended to be set "
  36. "to the number of D in your system so that "
  37. "each process can be bound to a single D.")
  38. parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
  39. help="will use the visible devices sequentially")
  40. parser.add_argument("--training_script", type=str,
  41. help="The full path to the single D training "
  42. "program/script to be launched in parallel, "
  43. "followed by all the arguments for the "
  44. "training script")
  45. # rest from the training program
  46. args, unknown = parser.parse_known_args()
  47. args.training_script_args = unknown
  48. return args
  49. def main():
  50. print("start", __file__)
  51. args = parse_args()
  52. print(args)
  53. visible_devices = args.visible_devices.split(',')
  54. assert os.path.isfile(args.training_script)
  55. assert len(visible_devices) >= args.nproc_per_node
  56. print('visible_devices:{}'.format(visible_devices))
  57. # spawn the processes
  58. processes = []
  59. cmds = []
  60. log_files = []
  61. env = os.environ.copy()
  62. env['RANK_SIZE'] = str(args.nproc_per_node)
  63. cur_path = os.getcwd()
  64. for rank_id in range(0, args.nproc_per_node):
  65. os.chdir(cur_path)
  66. device_id = visible_devices[rank_id]
  67. device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
  68. env['RANK_ID'] = str(rank_id)
  69. env['DEVICE_ID'] = str(device_id)
  70. if os.path.exists(device_dir):
  71. shutil.rmtree(device_dir)
  72. os.mkdir(device_dir)
  73. os.chdir(device_dir)
  74. cmd = [sys.executable, '-u']
  75. cmd.append(args.training_script)
  76. cmd.extend(args.training_script_args)
  77. log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w')
  78. process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
  79. processes.append(process)
  80. cmds.append(cmd)
  81. log_files.append(log_file)
  82. for process, cmd, log_file in zip(processes, cmds, log_files):
  83. process.wait()
  84. if process.returncode != 0:
  85. raise subprocess.CalledProcessError(returncode=process, cmd=cmd)
  86. log_file.close()
  87. if __name__ == "__main__":
  88. main()