You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

dump_test_utils.py 4.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """
  16. Utils for testing offline debugger.
  17. """
  18. import os
  19. import tempfile
  20. import bisect
  21. import csv
  22. import numpy as np
  23. def write_watchpoint_to_json(watchpoint_hits):
  24. parameter_json = []
  25. for p, _ in enumerate(watchpoint_hits.parameters):
  26. parameter = "parameter" + str(p)
  27. parameter_json.append({
  28. parameter: {
  29. 'name': watchpoint_hits.parameters[p].name,
  30. 'disabled': watchpoint_hits.parameters[p].disabled,
  31. 'value': watchpoint_hits.parameters[p].value,
  32. 'hit': watchpoint_hits.parameters[p].hit,
  33. 'actual_value': watchpoint_hits.parameters[p].actual_value
  34. }
  35. })
  36. wp = {
  37. 'name': watchpoint_hits.name,
  38. 'slot': watchpoint_hits.slot,
  39. 'condition': watchpoint_hits.condition,
  40. 'watchpoint_id': watchpoint_hits.watchpoint_id,
  41. 'parameter': parameter_json,
  42. 'error_code': watchpoint_hits.error_code,
  43. 'rank_id': watchpoint_hits.rank_id,
  44. 'root_graph_id': watchpoint_hits.root_graph_id
  45. }
  46. return wp
  47. def write_tensor_to_json(tensor_info, tensor_data):
  48. data = np.frombuffer(
  49. tensor_data.data_ptr, np.uint8, tensor_data.data_size).tolist()
  50. py_byte_size = len(tensor_data.data_ptr)
  51. c_byte_size = tensor_data.data_size
  52. if c_byte_size != py_byte_size:
  53. print("The python byte size of " + str(py_byte_size) +
  54. " does not match the C++ byte size of " + str(c_byte_size) + "\n")
  55. tensor = {
  56. 'tensor_info': {
  57. 'node_name': tensor_info.node_name,
  58. 'slot': tensor_info.slot,
  59. 'iteration': tensor_info.iteration,
  60. 'rank_id': tensor_info.rank_id,
  61. 'root_graph_id': tensor_info.root_graph_id,
  62. 'is_output': tensor_info.is_output
  63. },
  64. 'tensor_data': {
  65. 'data': data,
  66. 'size_in_bytes': tensor_data.data_size,
  67. 'debugger_dtype': tensor_data.dtype,
  68. 'shape': tensor_data.shape
  69. }
  70. }
  71. return tensor
  72. def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_info_list):
  73. """Build dump file structure from tensor_list."""
  74. ranks_run_history = {}
  75. temp_dir = tempfile.mkdtemp(prefix=net_name, dir=path)
  76. for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list):
  77. slot = str(tensor_info.slot)
  78. iteration = str(tensor_info.iteration)
  79. rank_id = str(tensor_info.rank_id)
  80. root_graph_id = str(tensor_info.root_graph_id)
  81. is_output = str(tensor_info.is_output)
  82. graphs_run_history = ranks_run_history.get(rank_id)
  83. if graphs_run_history is None:
  84. graphs_run_history = {}
  85. ranks_run_history[rank_id] = graphs_run_history
  86. if root_graph_id not in graphs_run_history:
  87. graphs_run_history[root_graph_id] = [iteration]
  88. if iteration not in graphs_run_history[root_graph_id]:
  89. bisect.insort(graphs_run_history[root_graph_id], iteration)
  90. path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
  91. os.makedirs(path, exist_ok=True)
  92. if is_output == "True":
  93. file_name = f'{tensor_name}.output.{slot}.DefaultFormat.npy'
  94. else:
  95. file_name = f'{tensor_name}.input.{slot}.DefaultFormat.npy'
  96. full_path = os.path.join(path, file_name)
  97. np.save(full_path, tensor)
  98. build_global_execution_order(temp_dir, ranks_run_history)
  99. return temp_dir
  100. def build_global_execution_order(path, ranks_run_history):
  101. """Build global execution order."""
  102. for rank_id in ranks_run_history.keys():
  103. exec_order_path = path + "/rank_" + rank_id + "/" + "execution_order"
  104. os.makedirs(exec_order_path, exist_ok=True)
  105. for graph in ranks_run_history[rank_id].keys():
  106. full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
  107. with open(full_path, 'w+', newline='') as csv_file:
  108. write = csv.writer(csv_file)
  109. write.writerows(ranks_run_history[rank_id][graph])