You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_data_dump.py 11 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. import os
  16. import sys
  17. import tempfile
  18. import time
  19. import shutil
  20. import glob
  21. from importlib import import_module
  22. from pathlib import Path
  23. import numpy as np
  24. import pytest
  25. import mindspore.context as context
  26. import mindspore.nn as nn
  27. from mindspore import Tensor
  28. from mindspore.ops import operations as P
  29. from mindspore.nn import Cell
  30. from mindspore.nn import Dense
  31. from mindspore.nn import SoftmaxCrossEntropyWithLogits
  32. from mindspore.nn import Momentum
  33. from mindspore.nn import TrainOneStepCell
  34. from mindspore.nn import WithLossCell
  35. from dump_test_utils import generate_dump_json
  36. from tests.security_utils import security_off_wrap
  37. class Net(nn.Cell):
  38. def __init__(self):
  39. super(Net, self).__init__()
  40. self.add = P.Add()
  41. def construct(self, x_, y_):
  42. return self.add(x_, y_)
  43. x = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
  44. y = np.array([[7, 8, 9], [10, 11, 12]]).astype(np.float32)
  45. @pytest.mark.level1
  46. @pytest.mark.platform_arm_ascend_training
  47. @pytest.mark.platform_x86_ascend_training
  48. @pytest.mark.env_onecard
  49. @security_off_wrap
  50. def test_async_dump():
  51. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  52. pwd = os.getcwd()
  53. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  54. dump_path = os.path.join(tmp_dir, 'async_dump')
  55. dump_config_path = os.path.join(tmp_dir, 'async_dump.json')
  56. generate_dump_json(dump_path, dump_config_path, 'test_async_dump')
  57. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  58. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  59. if os.path.isdir(dump_path):
  60. shutil.rmtree(dump_path)
  61. add = Net()
  62. add(Tensor(x), Tensor(y))
  63. time.sleep(5)
  64. assert len(os.listdir(dump_file_path)) == 1
  65. def run_e2e_dump():
  66. if sys.platform != 'linux':
  67. return
  68. pwd = os.getcwd()
  69. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  70. dump_path = os.path.join(tmp_dir, 'e2e_dump')
  71. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  72. generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
  73. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  74. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  75. if os.path.isdir(dump_path):
  76. shutil.rmtree(dump_path)
  77. add = Net()
  78. add(Tensor(x), Tensor(y))
  79. if context.get_context("device_target") == "Ascend":
  80. assert len(os.listdir(dump_file_path)) == 5
  81. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  82. elif context.get_context("device_target") == "CPU":
  83. assert len(os.listdir(dump_file_path)) == 5
  84. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  85. else:
  86. assert len(os.listdir(dump_file_path)) == 3
  87. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  88. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  89. real_path = os.path.realpath(output_path)
  90. output = np.load(real_path)
  91. expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
  92. assert output.dtype == expect.dtype
  93. assert np.array_equal(output, expect)
  94. @pytest.mark.level0
  95. @pytest.mark.platform_arm_ascend_training
  96. @pytest.mark.platform_x86_ascend_training
  97. @pytest.mark.env_onecard
  98. @security_off_wrap
  99. def test_e2e_dump():
  100. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  101. run_e2e_dump()
  102. @pytest.mark.level0
  103. @pytest.mark.platform_arm_ascend_training
  104. @pytest.mark.platform_x86_ascend_training
  105. @pytest.mark.env_onecard
  106. @security_off_wrap
  107. def test_e2e_dump_with_hccl_env():
  108. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  109. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  110. os.environ["RANK_ID"] = "4"
  111. run_e2e_dump()
  112. @pytest.mark.level0
  113. @pytest.mark.platform_x86_cpu
  114. @pytest.mark.env_onecard
  115. @security_off_wrap
  116. def test_cpu_e2e_dump():
  117. context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
  118. run_e2e_dump()
  119. @pytest.mark.level0
  120. @pytest.mark.platform_x86_cpu
  121. @pytest.mark.env_onecard
  122. @security_off_wrap
  123. def test_cpu_e2e_dump_with_hccl_set():
  124. context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
  125. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  126. os.environ["RANK_ID"] = "4"
  127. run_e2e_dump()
  128. @pytest.mark.level0
  129. @pytest.mark.platform_x86_gpu_training
  130. @pytest.mark.env_onecard
  131. @security_off_wrap
  132. def test_gpu_e2e_dump():
  133. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  134. run_e2e_dump()
  135. @pytest.mark.level0
  136. @pytest.mark.platform_x86_gpu_training
  137. @pytest.mark.env_onecard
  138. @security_off_wrap
  139. def test_gpu_e2e_dump_with_hccl_set():
  140. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  141. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  142. os.environ["RANK_ID"] = "4"
  143. run_e2e_dump()
  144. class ReluReduceMeanDenseRelu(Cell):
  145. def __init__(self, kernel, bias, in_channel, num_class):
  146. super().__init__()
  147. self.relu = P.ReLU()
  148. self.mean = P.ReduceMean(keep_dims=False)
  149. self.dense = Dense(in_channel, num_class, kernel, bias)
  150. def construct(self, x_):
  151. x_ = self.relu(x_)
  152. x_ = self.mean(x_, (2, 3))
  153. x_ = self.dense(x_)
  154. x_ = self.relu(x_)
  155. return x_
  156. @pytest.mark.level0
  157. @pytest.mark.platform_arm_ascend_training
  158. @pytest.mark.platform_x86_ascend_training
  159. @pytest.mark.env_onecard
  160. @security_off_wrap
  161. def test_async_dump_net_multi_layer_mode1():
  162. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  163. pwd = os.getcwd()
  164. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  165. dump_path = os.path.join(tmp_dir, 'async_dump_net_multi_layer_mode1')
  166. json_file_path = os.path.join(tmp_dir, "test_async_dump_net_multi_layer_mode1.json")
  167. generate_dump_json(dump_path, json_file_path, 'test_async_dump_net_multi_layer_mode1')
  168. os.environ['MINDSPORE_DUMP_CONFIG'] = json_file_path
  169. weight = Tensor(np.ones((1000, 2048)).astype(np.float32))
  170. bias = Tensor(np.ones((1000,)).astype(np.float32))
  171. net = ReluReduceMeanDenseRelu(weight, bias, 2048, 1000)
  172. criterion = SoftmaxCrossEntropyWithLogits(sparse=False)
  173. optimizer = Momentum(learning_rate=0.1, momentum=0.1,
  174. params=filter(lambda x: x.requires_grad, net.get_parameters()))
  175. net_with_criterion = WithLossCell(net, criterion)
  176. train_network = TrainOneStepCell(net_with_criterion, optimizer)
  177. train_network.set_train()
  178. inputs = Tensor(np.random.randn(32, 2048, 7, 7).astype(np.float32))
  179. label = Tensor(np.zeros(shape=(32, 1000)).astype(np.float32))
  180. net_dict = train_network(inputs, label)
  181. dump_file_path = os.path.join(dump_path, 'rank_0', 'test', '0', '0')
  182. dump_file_name = list(Path(dump_file_path).rglob("*SoftmaxCrossEntropyWithLogits*"))[0]
  183. dump_file_full_path = os.path.join(dump_file_path, dump_file_name)
  184. npy_path = os.path.join(dump_path, "npy_files")
  185. if os.path.exists(npy_path):
  186. shutil.rmtree(npy_path)
  187. os.mkdir(npy_path)
  188. tool_path_search_list = list(Path('/usr/local/Ascend').rglob('msaccucmp.py*'))
  189. if tool_path_search_list:
  190. converter = import_module("mindspore.offline_debug.convert_async")
  191. converter.AsyncDumpConverter([dump_file_full_path], npy_path).convert_files()
  192. npy_result_file = list(Path(npy_path).rglob("*output.0.*.npy"))[0]
  193. dump_result = np.load(os.path.join(npy_path, npy_result_file))
  194. for index, value in enumerate(net_dict):
  195. assert value.asnumpy() == dump_result[index]
  196. else:
  197. print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.')
  198. @pytest.mark.level0
  199. @pytest.mark.platform_arm_ascend_training
  200. @pytest.mark.platform_x86_ascend_training
  201. @pytest.mark.env_onecard
  202. @security_off_wrap
  203. def test_dump_with_diagnostic_path():
  204. """
  205. Test e2e dump when path is not set (set to empty) in dump json file and MS_DIAGNOSTIC_DATA_PATH is set.
  206. Data is expected to be dumped into MS_DIAGNOSTIC_DATA_PATH/debug_dump.
  207. """
  208. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  209. pwd = os.getcwd()
  210. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  211. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  212. generate_dump_json('', dump_config_path, 'test_e2e_dump')
  213. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  214. diagnose_path = os.path.join(tmp_dir, 'e2e_dump')
  215. os.environ['MS_DIAGNOSTIC_DATA_PATH'] = diagnose_path
  216. dump_file_path = os.path.join(diagnose_path, 'debug_dump', 'rank_0', 'Net', '0', '0')
  217. if os.path.isdir(diagnose_path):
  218. shutil.rmtree(diagnose_path)
  219. add = Net()
  220. add(Tensor(x), Tensor(y))
  221. assert len(os.listdir(dump_file_path)) == 5
  222. def run_e2e_dump_execution_graph():
  223. """Run e2e dump and check execution order."""
  224. if sys.platform != 'linux':
  225. return
  226. pwd = os.getcwd()
  227. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  228. dump_path = os.path.join(tmp_dir, 'e2e_dump_exe_graph')
  229. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  230. generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
  231. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  232. if os.path.isdir(dump_path):
  233. shutil.rmtree(dump_path)
  234. add = Net()
  235. add(Tensor(x), Tensor(y))
  236. exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
  237. assert len(os.listdir(exe_graph_path)) == 1
  238. @pytest.mark.level0
  239. @pytest.mark.platform_x86_gpu_training
  240. @pytest.mark.env_onecard
  241. @security_off_wrap
  242. def test_dump_with_execution_graph():
  243. """Test dump with execution graph on GPU."""
  244. context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
  245. run_e2e_dump_execution_graph()