
launch.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""launch train script"""
import os
import sys
import json
import subprocess
import shutil
import platform
from argparse import ArgumentParser
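
# This launcher parses its own arguments, builds an HCCL rank table describing
# the Ascend devices visible on this server, writes the table to a JSON file,
# and then spawns one copy of the training script per device, with RANK_SIZE,
# RANK_ID and DEVICE_ID set in each child process environment.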


def parse_args():
    """
    parse args.

    Args:

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for D training, this is recommended to be set "
                             "to the number of D in your system so that "
                             "each process can be bound to a single D.")
    parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                        help="will use the visible devices sequentially")
    parser.add_argument("--server_id", type=str, default="",
                        help="server ip")
    parser.add_argument("--training_script", type=str,
                        help="The full path to the single D training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")
    # any remaining arguments are forwarded to the training script
    args, unknown = parser.parse_known_args()
    args.training_script_args = unknown
    return args
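

# Example invocation (the training script path and trailing options below are
# hypothetical; unrecognised options are forwarded unchanged to the script):
#
#   python launch.py --nproc_per_node=8 --visible_devices=0,1,2,3,4,5,6,7 \
#       --server_id=<server ip> --training_script=train.py --data_dir=/path/to/data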


def main():
    print("start", __file__)
    args = parse_args()
    print(args)
    visible_devices = args.visible_devices.split(',')
    assert os.path.isfile(args.training_script)
    assert len(visible_devices) >= args.nproc_per_node
    print('visible_devices:{}'.format(visible_devices))
    if not args.server_id:
        print('please input server ip!')
        sys.exit(1)  # the server ip is required; exit with a non-zero status
    print('server_id:{}'.format(args.server_id))
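
    # /etc/hccn.conf maps each Ascend device id to the IP of its NIC, in lines
    # of the form 'address_<device_id>=<device_ip>'. Those IPs go into the HCCL
    # rank table built below, which the workers later read via RANK_TABLE_FILE.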
    # construct hccn_table
    hccn_configs = open('/etc/hccn.conf', 'r').readlines()
    device_ips = {}
    for hccn_item in hccn_configs:
        hccn_item = hccn_item.strip()
        if hccn_item.startswith('address_'):
            device_id, device_ip = hccn_item.split('=')
            device_id = device_id.split('_')[1]
            device_ips[device_id] = device_ip
            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
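
    # Fixed metadata for the rank table; board_id depends on the host CPU
    # architecture, and chip_info '910' refers to the Ascend 910 chip.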
    hccn_table = {}
    arch = platform.processor()
    hccn_table['board_id'] = {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch]
    hccn_table['chip_info'] = '910'
    hccn_table['deploy_mode'] = 'lab'
    hccn_table['group_count'] = '1'
    hccn_table['group_list'] = []
    instance_list = []
    usable_dev = ''
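
    # One instance per spawned process: rank i is bound to the i-th entry of
    # --visible_devices and to that device's IP from /etc/hccn.conf.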
    for instance_id in range(args.nproc_per_node):
        instance = {}
        instance['devices'] = []
        device_id = visible_devices[instance_id]
        device_ip = device_ips[device_id]
        usable_dev += str(device_id)
        instance['devices'].append({
            'device_id': device_id,
            'device_ip': device_ip,
        })
        instance['rank_id'] = str(instance_id)
        instance['server_id'] = args.server_id
        instance_list.append(instance)
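
    # A single group covers all ranks on this server (single-node launch).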
    hccn_table['group_list'].append({
        'device_num': str(args.nproc_per_node),
        'server_num': '1',
        'group_name': '',
        'instance_count': str(args.nproc_per_node),
        'instance_list': instance_list,
    })
    hccn_table['para_plane_nic_location'] = 'device'
    hccn_table['para_plane_nic_name'] = []
    for instance_id in range(args.nproc_per_node):
        eth_id = visible_devices[instance_id]
        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
    hccn_table['para_plane_nic_num'] = str(args.nproc_per_node)
    hccn_table['status'] = 'completed'
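
    # The finished table is written into the current working directory as
    # rank_table_<nproc>p_<devices>_<server_id>.json and handed to every worker
    # below through the RANK_TABLE_FILE / MINDSPORE_HCCL_CONFIG_PATH variables.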
    # save hccn_table to file
    table_path = os.getcwd()
    if not os.path.exists(table_path):
        os.mkdir(table_path)
    table_fn = os.path.join(table_path,
                            'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()
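
    # Each rank gets its own working directory (device<rank>), its own RANK_ID
    # and DEVICE_ID, and a dedicated log file capturing stdout and stderr.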
    # spawn the processes
    processes = []
    cmds = []
    log_files = []
    env = os.environ.copy()
    env['RANK_SIZE'] = str(args.nproc_per_node)
    cur_path = os.getcwd()
    for rank_id in range(0, args.nproc_per_node):
        os.chdir(cur_path)
        device_id = visible_devices[rank_id]
        device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
        env['RANK_ID'] = str(rank_id)
        env['DEVICE_ID'] = str(device_id)
        if args.nproc_per_node > 1:
            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
            env['RANK_TABLE_FILE'] = table_fn
        if os.path.exists(device_dir):
            shutil.rmtree(device_dir)
        os.mkdir(device_dir)
        os.chdir(device_dir)
        cmd = [sys.executable, '-u']
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)
        log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w')
        process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
        processes.append(process)
        cmds.append(cmd)
        log_files.append(log_file)
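
    # Wait for every rank and surface the first failure as CalledProcessError,
    # so the launcher itself exits with a non-zero status.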
    for process, cmd, log_file in zip(processes, cmds, log_files):
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
        log_file.close()


if __name__ == "__main__":
    main()