You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_watchpoints.py 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. # Copyright 2021-2022 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """
  16. Watchpoints test script for offline debugger APIs.
  17. """
  18. import os
  19. import json
  20. import shutil
  21. import numpy as np
  22. import mindspore.offline_debug.dbg_services as d
  23. from dump_test_utils import build_dump_structure, write_watchpoint_to_json
  24. from tests.security_utils import security_off_wrap
  25. class TestOfflineWatchpoints:
  26. """Test watchpoint for offline debugger."""
  27. GENERATE_GOLDEN = False
  28. test_name = "watchpoints"
  29. watchpoint_hits_json = []
  30. temp_dir = ''
  31. @classmethod
  32. def setup_class(cls):
  33. """Init setup for offline watchpoints test"""
  34. name1 = "Conv2D.Conv2D-op369.0.0.1"
  35. tensor1 = np.array([[[-1.2808e-03, 7.7629e-03, 1.9241e-02],
  36. [-1.3931e-02, 8.9359e-04, -1.1520e-02],
  37. [-6.3248e-03, 1.8749e-03, 1.0132e-02]],
  38. [[-2.5520e-03, -6.0005e-03, -5.1918e-03],
  39. [-2.7866e-03, 2.5487e-04, 8.4782e-04],
  40. [-4.6310e-03, -8.9111e-03, -8.1778e-05]],
  41. [[1.3914e-03, 6.0844e-04, 1.0643e-03],
  42. [-2.0966e-02, -1.2865e-03, -1.8692e-03],
  43. [-1.6647e-02, 1.0233e-03, -4.1313e-03]]], np.float32)
  44. info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369",
  45. slot=1, iteration=2, rank_id=0, root_graph_id=0, is_output=False)
  46. name2 = "Parameter.fc2.bias.0.0.2"
  47. tensor2 = np.array([-5.0167350e-06, 1.2509107e-05, -4.3148934e-06, 8.1415592e-06,
  48. 2.1177532e-07, 2.9952851e-06], np.float32)
  49. info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
  50. "Parameter[6]_11/fc2.bias",
  51. slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=True)
  52. tensor3 = np.array([2.9060817e-07, -5.1009415e-06, -2.8662325e-06, 2.6036503e-06,
  53. -5.1546101e-07, 6.0798648e-06], np.float32)
  54. info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
  55. "Parameter[6]_11/fc2.bias",
  56. slot=0, iteration=3, rank_id=0, root_graph_id=0, is_output=True)
  57. name3 = "CudnnUniformReal.CudnnUniformReal-op391.0.0.3"
  58. tensor4 = np.array([-32.0, -4096.0], np.float32)
  59. info4 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
  60. slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=False)
  61. name4 = "Cast.Cast-op4.0.0.1"
  62. tensor_all_zero = np.array([[[0, 0, 0],
  63. [0, 0, 0],
  64. [0, 0, 0]]], np.float32)
  65. info5 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/Cast-op4",
  66. slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
  67. name5 = "Cast.Cast-op40.0.0.1"
  68. tensor_all_one = np.array([[[1, 1, 1],
  69. [1, 1, 1],
  70. [1, 1, 1]]], np.float32)
  71. info6 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/Cast-op40",
  72. slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
  73. tensor_info = [info1, info2, info3, info4, info5, info6]
  74. tensor_name = [name1, name2, name2, name3, name4, name5]
  75. tensor_list = [tensor1, tensor2, tensor3, tensor4, tensor_all_zero, tensor_all_one]
  76. cls.temp_dir = build_dump_structure(tensor_name, tensor_list, "Test", tensor_info)
  77. @classmethod
  78. def teardown_class(cls):
  79. shutil.rmtree(cls.temp_dir)
  80. @security_off_wrap
  81. def test_sync_add_remove_watchpoints_hit(self):
  82. # NOTES: watch_condition=6 is MIN_LT
  83. # watchpoint set and hit (watch_condition=6), then remove it
  84. debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
  85. _ = debugger_backend.initialize(net_name="Test", is_sync_mode=True)
  86. param = d.Parameter(name="param", disabled=False, value=0.0)
  87. _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
  88. check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
  89. "/conv1-Conv2d/Conv2D-op369":
  90. {"rank_id": [0], "root_graph_id": [0], "is_output": False
  91. }}, parameter_list=[param])
  92. # add second watchpoint to check the watchpoint hit in correct order
  93. param1 = d.Parameter(name="param", disabled=False, value=10.0)
  94. _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
  95. check_node_list={"Default/CudnnUniformReal-op391":
  96. {"rank_id": [0], "root_graph_id": [0], "is_output": False
  97. }}, parameter_list=[param1])
  98. watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
  99. assert len(watchpoint_hits_test) == 2
  100. if self.GENERATE_GOLDEN:
  101. self.print_watchpoint_hits(watchpoint_hits_test, 0, False)
  102. else:
  103. self.compare_expect_actual_result(watchpoint_hits_test, 0)
  104. _ = debugger_backend.remove_watchpoint(watchpoint_id=1)
  105. watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
  106. assert len(watchpoint_hits_test_1) == 1
  107. @security_off_wrap
  108. def test_sync_add_remove_watchpoints_not_hit(self):
  109. # watchpoint set and not hit(watch_condition=6), then remove
  110. debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
  111. _ = debugger_backend.initialize(net_name="Test", is_sync_mode=True)
  112. param = d.Parameter(name="param", disabled=False, value=-1000.0)
  113. _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
  114. check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
  115. "/conv1-Conv2d/Conv2D-op369":
  116. {"rank_id": [0], "root_graph_id": [0], "is_output": False
  117. }}, parameter_list=[param])
  118. watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
  119. assert not watchpoint_hits_test
  120. _ = debugger_backend.remove_watchpoint(watchpoint_id=2)
  121. @security_off_wrap
  122. def test_sync_weight_change_watchpoints_hit(self):
  123. # NOTES: watch_condition=18 is CHANGE_TOO_LARGE
  124. # weight change watchpoint set and hit(watch_condition=18)
  125. debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
  126. _ = debugger_backend.initialize(net_name="Test", is_sync_mode=True)
  127. param_abs_mean_update_ratio_gt = d.Parameter(
  128. name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
  129. param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
  130. _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18,
  131. check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
  132. "Parameter[6]_11/fc2.bias":
  133. {"rank_id": [0], "root_graph_id": [0], "is_output": True
  134. }}, parameter_list=[param_abs_mean_update_ratio_gt,
  135. param_epsilon])
  136. watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=3)
  137. assert len(watchpoint_hits_test) == 1
  138. if self.GENERATE_GOLDEN:
  139. self.print_watchpoint_hits(watchpoint_hits_test, 2, True)
  140. else:
  141. self.compare_expect_actual_result(watchpoint_hits_test, 2)
  142. @security_off_wrap
  143. def test_async_add_remove_watchpoint_hit(self):
  144. # watchpoint set and hit(watch_condition=6) in async mode, then remove
  145. debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
  146. _ = debugger_backend.initialize(net_name="Test", is_sync_mode=False)
  147. param = d.Parameter(name="param", disabled=False, value=0.0)
  148. _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
  149. check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
  150. "/conv1-Conv2d/Conv2D-op369":
  151. {"rank_id": [0], "root_graph_id": [0], "is_output": False
  152. }}, parameter_list=[param])
  153. watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
  154. assert len(watchpoint_hits_test) == 1
  155. if not self.GENERATE_GOLDEN:
  156. self.compare_expect_actual_result(watchpoint_hits_test, 0)
  157. _ = debugger_backend.remove_watchpoint(watchpoint_id=1)
  158. watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
  159. assert not watchpoint_hits_test_1
  160. @security_off_wrap
  161. def test_async_add_remove_watchpoints_not_hit(self):
  162. # watchpoint set and not hit(watch_condition=6) in async mode, then remove
  163. debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
  164. _ = debugger_backend.initialize(net_name="Test", is_sync_mode=False)
  165. param = d.Parameter(name="param", disabled=False, value=-1000.0)
  166. _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
  167. check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
  168. "/conv1-Conv2d/Conv2D-op369":
  169. {"rank_id": [0], "root_graph_id": [0], "is_output": False
  170. }}, parameter_list=[param])
  171. watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
  172. assert not watchpoint_hits_test
  173. _ = debugger_backend.remove_watchpoint(watchpoint_id=2)
  174. @security_off_wrap
  175. def test_async_watchpoints_no_duplicate_wp_hit(self):
  176. """
  177. Feature: Offline Debugger CheckWatchpoint.
  178. Description: Test check watchpoint hit with similar op name (one is the prefix of the other)
  179. Expectation: Get exactly one watchpoint hit result and no duplicate watchpoints in the hit results.
  180. """
  181. # watchpoint set and hit only one (watch_condition=3) in async mode
  182. debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
  183. _ = debugger_backend.initialize(net_name="Test", is_sync_mode=False)
  184. max_gt = d.Parameter(name="max_gt", disabled=False, value=0.0)
  185. debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=3,
  186. check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/Cast-op4":
  187. {"rank_id": [0], "root_graph_id": [0], "is_output": True
  188. },
  189. "Default/network-WithLossCell/_backbone-AlexNet/Cast-op40":
  190. {"rank_id": [0], "root_graph_id": [0], "is_output": True
  191. }}, parameter_list=[max_gt])
  192. watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=0)
  193. assert len(watchpoint_hits_test) == 1
  194. def compare_expect_actual_result(self, watchpoint_hits_list, test_index):
  195. """Compare actual result with golden file."""
  196. golden_file = os.path.realpath(os.path.join("../data/dump/gpu_dumps/golden/",
  197. self.test_name + "_expected.json"))
  198. with open(golden_file) as f:
  199. expected_list = json.load(f)
  200. for x, watchpoint_hits in enumerate(watchpoint_hits_list):
  201. test_id = "watchpoint_hit" + str(test_index + x + 1)
  202. expect_wp = expected_list[x + test_index][test_id]
  203. actual_wp = write_watchpoint_to_json(watchpoint_hits)
  204. assert actual_wp == expect_wp
  205. def print_watchpoint_hits(self, watchpoint_hits_list, test_index, is_print):
  206. """Print watchpoint hits."""
  207. for x, watchpoint_hits in enumerate(watchpoint_hits_list):
  208. watchpoint_hit = "watchpoint_hit" + str(test_index + x + 1)
  209. wp = write_watchpoint_to_json(watchpoint_hits)
  210. self.watchpoint_hits_json.append({watchpoint_hit: wp})
  211. if is_print:
  212. with open(self.test_name + "_expected.json", "w") as dump_f:
  213. json.dump(self.watchpoint_hits_json, dump_f, indent=4, separators=(',', ': '))