You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

test_data_dump.py 28 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. import os
  16. import sys
  17. import tempfile
  18. import time
  19. import shutil
  20. import glob
  21. import csv
  22. from importlib import import_module
  23. from pathlib import Path
  24. import numpy as np
  25. import pytest
  26. import mindspore.context as context
  27. import mindspore.nn as nn
  28. import mindspore.ops as ops
  29. from mindspore import Tensor
  30. from mindspore.ops import operations as P, constexpr
  31. from mindspore.nn import Cell
  32. from mindspore.nn import Dense
  33. from mindspore.nn import SoftmaxCrossEntropyWithLogits
  34. from mindspore.nn import Momentum
  35. from mindspore.nn import TrainOneStepCell
  36. from mindspore.nn import WithLossCell
  37. from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \
  38. generate_statistic_dump_json, check_dump_structure, find_nth_pos
  39. from tests.security_utils import security_off_wrap
class Net(nn.Cell):
    """Minimal two-input network (element-wise Add) used as the dump target in most tests."""

    def __init__(self):
        super(Net, self).__init__()
        self.add = P.Add()

    def construct(self, x_, y_):
        # Element-wise sum of the two inputs.
        return self.add(x_, y_)
# Module-level operands shared by the Add-net dump tests.
# Their element-wise sum is [[8, 10, 12], [14, 16, 18]], asserted by the checks below.
x = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
y = np.array([[7, 8, 9], [10, 11, 12]]).astype(np.float32)
def run_async_dump(test_name):
    """Run the Add net with async dump enabled and check the dump directory layout.

    Args:
        test_name (str): scenario name passed to generate_dump_json.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'async_dump')
        dump_config_path = os.path.join(tmp_dir, 'async_dump.json')
        generate_dump_json(dump_path, dump_config_path, test_name)
        # The dump config is picked up via this env var when the graph executes.
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        add = Net()
        add(Tensor(x), Tensor(y))
        # Dump files are written asynchronously; poll up to ~6s for the directory.
        for _ in range(3):
            if not os.path.exists(dump_file_path):
                time.sleep(2)
        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
        # Exactly one dump file is expected for the single Add op.
        assert len(os.listdir(dump_file_path)) == 1
        del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level1
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_dump():
    """
    Feature: async dump on Ascend
    Description: test async dump with default file_format value ("bin")
    Expectation: dump data are generated as protobuf file format (suffix with timestamp)
    """
    run_async_dump("test_async_dump")
  78. def run_e2e_dump():
  79. if sys.platform != 'linux':
  80. return
  81. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  82. dump_path = os.path.join(tmp_dir, 'e2e_dump')
  83. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  84. generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
  85. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  86. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  87. if os.path.isdir(dump_path):
  88. shutil.rmtree(dump_path)
  89. add = Net()
  90. add(Tensor(x), Tensor(y))
  91. if context.get_context("device_target") == "Ascend":
  92. assert len(os.listdir(dump_file_path)) == 3
  93. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  94. elif context.get_context("device_target") == "CPU":
  95. assert len(os.listdir(dump_file_path)) == 5
  96. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  97. else:
  98. assert len(os.listdir(dump_file_path)) == 3
  99. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  100. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  101. real_path = os.path.realpath(output_path)
  102. output = np.load(real_path)
  103. expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
  104. assert output.dtype == expect.dtype
  105. assert np.array_equal(output, expect)
  106. for _ in range(3):
  107. if not os.path.exists(dump_file_path):
  108. time.sleep(2)
  109. check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
  110. del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_e2e_dump():
    """Run the shared e2e dump scenario with the Ascend device target."""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_e2e_dump()
  119. @pytest.mark.level0
  120. @pytest.mark.platform_arm_ascend_training
  121. @pytest.mark.platform_x86_ascend_training
  122. @pytest.mark.env_onecard
  123. @security_off_wrap
  124. def test_e2e_dump_with_hccl_env():
  125. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  126. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  127. os.environ["RANK_ID"] = "4"
  128. run_e2e_dump()
  129. del os.environ['RANK_TABLE_FILE']
  130. del os.environ['RANK_ID']
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
@security_off_wrap
def test_cpu_e2e_dump():
    """Run the shared e2e dump scenario with the CPU device target."""
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    run_e2e_dump()
  138. @pytest.mark.level0
  139. @pytest.mark.platform_x86_cpu
  140. @pytest.mark.env_onecard
  141. @security_off_wrap
  142. def test_cpu_e2e_dump_with_hccl_set():
  143. context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
  144. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  145. os.environ["RANK_ID"] = "4"
  146. run_e2e_dump()
  147. del os.environ['RANK_TABLE_FILE']
  148. del os.environ['RANK_ID']
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_gpu_e2e_dump():
    """Run the shared e2e dump scenario with the GPU device target."""
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    run_e2e_dump()
  156. @pytest.mark.level0
  157. @pytest.mark.platform_x86_gpu_training
  158. @pytest.mark.env_onecard
  159. @security_off_wrap
  160. def test_gpu_e2e_dump_with_hccl_set():
  161. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  162. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  163. os.environ["RANK_ID"] = "4"
  164. run_e2e_dump()
  165. del os.environ['RANK_TABLE_FILE']
  166. del os.environ['RANK_ID']
class ReluReduceMeanDenseRelu(Cell):
    """ReLU -> ReduceMean -> Dense -> ReLU pipeline used by the multi-layer async dump test."""

    def __init__(self, kernel, bias, in_channel, num_class):
        super().__init__()
        self.relu = P.ReLU()
        self.mean = P.ReduceMean(keep_dims=False)
        self.dense = Dense(in_channel, num_class, kernel, bias)

    def construct(self, x_):
        x_ = self.relu(x_)
        # Reduce over the trailing axes (2, 3) — collapses the spatial dims
        # so the Dense layer sees a 2-D input.
        x_ = self.mean(x_, (2, 3))
        x_ = self.dense(x_)
        x_ = self.relu(x_)
        return x_
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_async_dump_net_multi_layer_mode1():
    """Async-dump a multi-layer training step, then convert the raw dump with the
    Ascend msaccucmp tool (if installed) and compare against the live loss value.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'async_dump_net_multi_layer_mode1')
        json_file_path = os.path.join(tmp_dir, "test_async_dump_net_multi_layer_mode1.json")
        generate_dump_json(dump_path, json_file_path, 'test_async_dump_net_multi_layer_mode1')
        os.environ['MINDSPORE_DUMP_CONFIG'] = json_file_path
        weight = Tensor(np.ones((1000, 2048)).astype(np.float32))
        bias = Tensor(np.ones((1000,)).astype(np.float32))
        net = ReluReduceMeanDenseRelu(weight, bias, 2048, 1000)
        criterion = SoftmaxCrossEntropyWithLogits(sparse=False)
        optimizer = Momentum(learning_rate=0.1, momentum=0.1,
                             params=filter(lambda x: x.requires_grad, net.get_parameters()))
        net_with_criterion = WithLossCell(net, criterion)
        train_network = TrainOneStepCell(net_with_criterion, optimizer)
        train_network.set_train()
        inputs = Tensor(np.random.randn(32, 2048, 7, 7).astype(np.float32))
        label = Tensor(np.zeros(shape=(32, 1000)).astype(np.float32))
        net_dict = train_network(inputs, label)
        # Locate the raw dump file produced for the loss op.
        dump_file_path = os.path.join(dump_path, 'rank_0', 'test', '0', '0')
        dump_file_name = list(Path(dump_file_path).rglob("*SoftmaxCrossEntropyWithLogits*"))[0]
        dump_file_full_path = os.path.join(dump_file_path, dump_file_name)
        npy_path = os.path.join(dump_path, "npy_files")
        if os.path.exists(npy_path):
            shutil.rmtree(npy_path)
        os.mkdir(npy_path)
        # The converter only exists on machines with the Ascend toolkit installed.
        tool_path_search_list = list(Path('/usr/local/Ascend').rglob('msaccucmp.py*'))
        if tool_path_search_list:
            converter = import_module("mindspore.offline_debug.convert_async")
            converter.AsyncDumpConverter([dump_file_full_path], npy_path).convert_files()
            npy_result_file = list(Path(npy_path).rglob("*output.0.*.npy"))[0]
            dump_result = np.load(os.path.join(npy_path, npy_result_file))
            # Converted dump values must match the values computed in-process.
            for index, value in enumerate(net_dict):
                assert value.asnumpy() == dump_result[index]
        else:
            # Best-effort: without the conversion tool the comparison is skipped.
            print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.')
        del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_dump_with_diagnostic_path():
    """
    Test e2e dump when path is not set (set to empty) in dump json file and MS_DIAGNOSTIC_DATA_PATH is set.
    Data is expected to be dumped into MS_DIAGNOSTIC_DATA_PATH/debug_dump.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
        # Empty dump path in the config forces the fallback to MS_DIAGNOSTIC_DATA_PATH.
        generate_dump_json('', dump_config_path, 'test_e2e_dump')
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        diagnose_path = os.path.join(tmp_dir, 'e2e_dump')
        os.environ['MS_DIAGNOSTIC_DATA_PATH'] = diagnose_path
        dump_file_path = os.path.join(diagnose_path, 'debug_dump', 'rank_0', 'Net', '0', '0')
        if os.path.isdir(diagnose_path):
            shutil.rmtree(diagnose_path)
        add = Net()
        add(Tensor(x), Tensor(y))
        # Expects exactly 3 dumped files (same count the Ascend branch of run_e2e_dump checks).
        assert len(os.listdir(dump_file_path)) == 3
        del os.environ['MINDSPORE_DUMP_CONFIG']
        del os.environ['MS_DIAGNOSTIC_DATA_PATH']
def run_e2e_dump_execution_graph():
    """Run e2e dump and check execution order."""
    if sys.platform != 'linux':
        return
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'e2e_dump_exe_graph')
        dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
        generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        add = Net()
        add(Tensor(x), Tensor(y))
        # The execution-order directory is dumped alongside the tensor data;
        # exactly two files are expected in it for this one-op graph.
        exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
        assert len(os.listdir(exe_graph_path)) == 2
        del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_dump_with_execution_graph():
    """Test dump with execution graph on GPU."""
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    run_e2e_dump_execution_graph()
def run_overflow_dump():
    """Run async dump with overflow detection and verify the overflow artifacts.

    Adds two float16 values of 60000 (sum exceeds float16 max) to force an
    overflow, then checks the dumped file pair: the op's data file and the
    Opdebug overflow record, including their task/stream ids.
    """
    if sys.platform != 'linux':
        return
    # 60000 + 60000 overflows float16 (max ~65504).
    overflow_x = np.array([60000, 60000]).astype(np.float16)
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'overflow_dump')
        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        add = Net()
        add(Tensor(overflow_x), Tensor(overflow_x))
        exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
        # Dump files are written asynchronously; poll up to ~10s.
        for _ in range(5):
            if not os.path.exists(exe_graph_path):
                time.sleep(2)
        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
        # check if overflow dump generate exact two files, and the naming format
        assert len(os.listdir(exe_graph_path)) == 2
        output_path = glob.glob(os.path.join(exe_graph_path, "Add.Default_Add-op0.*.*.*"))[0]
        overflow_path = glob.glob(os.path.join(exe_graph_path, "Opdebug.Node_OpDebug.*.*.*"))[0]
        assert output_path
        assert overflow_path
        # check if generated files have matching task and stream id
        # File-name layout: <op>.<name>.<task_id>.<stream_id>.<timestamp>, so the
        # task id sits between the 2nd and 3rd dots, the stream id between the 3rd and 4th.
        output_file_name = os.path.split(output_path)
        overflow_file_name = os.path.split(overflow_path)
        output_second_dot_pos = find_nth_pos(output_file_name[1], ".", 2)
        output_third_dot_pos = find_nth_pos(output_file_name[1], ".", 3)
        output_fourth_dot_pos = find_nth_pos(output_file_name[1], ".", 4)
        output_task_id = output_file_name[1][output_second_dot_pos+1:output_third_dot_pos]
        output_stream_id = output_file_name[1][output_third_dot_pos+1:output_fourth_dot_pos]
        overflow_second_dot_pos = find_nth_pos(overflow_file_name[1], ".", 2)
        overflow_third_dot_pos = find_nth_pos(overflow_file_name[1], ".", 3)
        overflow_fourth_dot_pos = find_nth_pos(overflow_file_name[1], ".", 4)
        overflow_task_id = overflow_file_name[1][overflow_second_dot_pos+1:overflow_third_dot_pos]
        overflow_stream_id = overflow_file_name[1][overflow_third_dot_pos+1:overflow_fourth_dot_pos]
        assert output_task_id == overflow_task_id
        assert output_stream_id == overflow_stream_id
        # check if overflow dump file contains same task and stream id as file name
        # NOTE(review): the byte offsets below (seek 321, bytes 24:25 / 16:17) encode
        # the Opdebug record layout — assumed fixed by the Ascend dump format; confirm
        # against the toolkit version before changing.
        with open(overflow_path, 'rb') as f:
            f.seek(321, 0)
            raw_data = f.read()
            task_id_infile = int.from_bytes(raw_data[24:25], 'little')
            stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
            assert output_task_id == str(task_id_infile)
            assert output_stream_id == str(stream_id_infile)
        del os.environ['MINDSPORE_DUMP_CONFIG']
def run_not_overflow_dump():
    """Run async dump and not generate overflow"""
    if sys.platform != 'linux':
        return
    # 60000 + 2 stays within float16 range, so no overflow record should appear.
    overflow_x = np.array([60000, 60000]).astype(np.float16)
    overflow_y = np.array([2, 2]).astype(np.float16)
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'overflow_dump')
        dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
        generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        add = Net()
        add(Tensor(overflow_x), Tensor(overflow_y))
        exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
        # check no overflow is happening, and path should not be generated
        assert not os.path.exists(exe_graph_path)
        del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_overflow_dump():
    """
    Feature: Overflow Dump
    Description: Test overflow dump
    Expectation: Overflow is occurred, and overflow dump file is in correct format
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
    run_overflow_dump()
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_not_overflow_dump():
    """
    Feature: Overflow Dump
    Description: Test overflow dump
    Expectation: Overflow is not occurred, and overflow dump file is not generated
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
    run_not_overflow_dump()
  364. def check_statistic_dump(dump_file_path):
  365. output_name = "statistic.csv"
  366. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  367. real_path = os.path.realpath(output_path)
  368. with open(real_path) as f:
  369. reader = csv.DictReader(f)
  370. stats = list(reader)
  371. num_tensors = len(stats)
  372. assert num_tensors == 3
  373. for tensor in stats:
  374. if (tensor['IO'] == 'input' and tensor['Slot'] == 0):
  375. assert tensor['Min Value'] == '1'
  376. assert tensor['Max Value'] == '6'
  377. elif (tensor['IO'] == 'input' and tensor['Slot'] == 1):
  378. assert tensor['Min Value'] == '7'
  379. assert tensor['Max Value'] == '12'
  380. elif (tensor['IO'] == 'output' and tensor['Slot'] == 0):
  381. assert tensor['Min Value'] == '8'
  382. assert tensor['Max Value'] == '18'
  383. def check_data_dump(dump_file_path):
  384. output_name = "Add.Add-op*.output.0.*.npy"
  385. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  386. real_path = os.path.realpath(output_path)
  387. output = np.load(real_path)
  388. expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
  389. assert np.array_equal(output, expect)
  390. def run_train():
  391. add = Net()
  392. add(Tensor(x), Tensor(y))
def run_saved_data_dump_test(scenario, saved_data):
    """Run e2e dump on scenario, testing statistic dump.

    Args:
        scenario (str): dump scenario name for the generated json config.
        saved_data (str): one of 'statistic', 'tensor' or 'full'.
    """
    if sys.platform != 'linux':
        return
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'test_saved_data')
        dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json')
        generate_statistic_dump_json(dump_path, dump_config_path, scenario, saved_data)
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        # Run the net in a fresh python process so the dump config env var is
        # picked up at process start (the current process may already be configured).
        exec_network_cmd = 'cd {0}; python -c "from test_data_dump import run_train; run_train()"'.format(os.getcwd())
        _ = os.system(exec_network_cmd)
        # Dump is written by the child process; poll up to ~6s.
        for _ in range(3):
            if not os.path.exists(dump_file_path):
                time.sleep(2)
        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
        if saved_data in ('statistic', 'full'):
            check_statistic_dump(dump_file_path)
        if saved_data in ('tensor', 'full'):
            check_data_dump(dump_file_path)
        if saved_data == 'statistic':
            # assert only file is statistic.csv, tensor data is not saved
            assert len(os.listdir(dump_file_path)) == 1
        elif saved_data == 'tensor':
            # assert only tensor data is saved, not statistics
            stat_path = os.path.join(dump_file_path, 'statistic.csv')
            assert not os.path.isfile(stat_path)
        del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_gpu_e2e_statistic_dump():
    """
    Feature: GPU Statistics Dump
    Description: Test GPU statistics dump
    Expectation: Statistics are stored in statistic.csv files
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    run_saved_data_dump_test('test_gpu_e2e_dump', 'statistic')
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_gpu_e2e_tensor_dump():
    """
    Feature: GPU Tensor Dump
    Description: Test GPU tensor dump
    Expectation: Tensor data are stored in npy files
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    run_saved_data_dump_test('test_gpu_e2e_dump', 'tensor')
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_gpu_e2e_full_dump():
    """
    Feature: GPU Full Dump
    Description: Test GPU full dump
    Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    run_saved_data_dump_test('test_gpu_e2e_dump', 'full')
  459. @pytest.mark.level0
  460. @pytest.mark.platform_x86_gpu_training
  461. @pytest.mark.env_onecard
  462. @security_off_wrap
  463. def test_stat_dump_nulls():
  464. """
  465. Feature: GPU Statistics Dump
  466. Description: Test GPU statistics dump when printing tensors full with NaNs and Infs
  467. Expectation: Min, Max, Avg Values stored in statistic.csv show null for such tensors
  468. """
  469. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  470. if sys.platform != 'linux':
  471. return
  472. empty_x = np.array([]).astype(np.float16)
  473. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  474. dump_path = os.path.join(tmp_dir, 'test_saved_data')
  475. dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json')
  476. generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', 'statistic')
  477. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  478. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  479. if os.path.isdir(dump_path):
  480. shutil.rmtree(dump_path)
  481. add = Net()
  482. add(Tensor(empty_x), Tensor(empty_x))
  483. for _ in range(3):
  484. if not os.path.exists(dump_file_path):
  485. time.sleep(2)
  486. # check dumped data
  487. output_path = glob.glob(os.path.join(dump_file_path, 'statistic.csv'))[0]
  488. real_path = os.path.realpath(output_path)
  489. with open(real_path) as f:
  490. reader = csv.DictReader(f)
  491. [output] = list(reader)
  492. assert output['IO'] == 'output'
  493. assert output['Min Value'] == 'null'
  494. assert output['Max Value'] == 'null'
  495. assert output['Avg Value'] == 'null'
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_statistic_dump():
    """
    Feature: Ascend Statistics Dump
    Description: Test Ascend statistics dump
    Expectation: Statistics are stored in statistic.csv files
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_saved_data_dump_test('test_async_dump', 'statistic')
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_statistic_dump_kernel_by_kernel():
    """
    Feature: Ascend Statistics Dump in kernel by kernel (mindRT) mode
    Description: Test Ascend statistics dump
    Expectation: Statistics are stored in statistic.csv files
    """
    # set env `GRAPH_OP_RUN`` to enable kernel-by-kernel mode.
    os.environ['GRAPH_OP_RUN'] = "1"
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_saved_data_dump_test('test_async_dump', 'statistic')
    # NOTE(review): GRAPH_OP_RUN is not removed if the dump test raises —
    # consider try/finally as done for the HCCL env tests.
    del os.environ['GRAPH_OP_RUN']
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_tensor_dump():
    """
    Feature: Ascend Tensor Dump
    Description: Test Ascend tensor dump
    Expectation: Tensors are stored in npy files
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_saved_data_dump_test('test_async_dump', 'tensor')
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_ascend_full_dump():
    """
    Feature: Ascend Full Dump
    Description: Test Ascend full dump
    Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_saved_data_dump_test('test_async_dump', 'full')
@constexpr
def construct_tensor(cst):
    # @constexpr evaluates this at graph-compile time, so the returned
    # tensor becomes a constant node in the compiled graph.
    return Tensor(np.array(cst))
class ConstantNet(nn.Cell):
    """Net whose ReLU input is a constant tensor built from the input's shape,
    used to exercise the 'constants' dump directory."""

    def __init__(self):
        super(ConstantNet, self).__init__()
        self.relu = ops.ReLU()

    def construct(self, x_):
        # ops.shape(x_) is static in graph mode, so construct_tensor (constexpr)
        # produces a graph constant that the dump should record.
        return self.relu(construct_tensor(ops.shape(x_)))
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_constant_async_ascend_dump():
    """
    Feature: Constant async dump
    Description: Test async constant dump in Ascend
    Expectation: constant dump folder is created, dump file has expected tensor info
    """
    # NOTE(review): unlike the sibling dump tests this one has no
    # @security_off_wrap decorator — confirm that is intentional.
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'constant_dump')
        dump_config_path = os.path.join(tmp_dir, 'constant_dump.json')
        generate_dump_json(dump_path, dump_config_path, 'test_async_dump')
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        net = ConstantNet()
        tensor = Tensor(np.random.random([1, 2, 3]))
        expect = net(tensor)
        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
        # Constants are dumped into a dedicated 'constants' folder next to the
        # per-iteration tensor dumps.
        constant_path = os.path.join(dump_path, 'rank_0', 'Net', '0', 'constants')
        assert os.path.exists(constant_path)
        assert len(os.listdir(constant_path)) == 1
        output_name = "Parameter.data-*.0.0.*.DefaultFormat.npy"
        output_path = glob.glob(os.path.join(constant_path, output_name))[0]
        real_path = os.path.realpath(output_path)
        output = np.load(real_path)
        # The dumped constant must equal the net's output (ReLU of the shape tensor).
        assert np.array_equal(output, expect)
        del os.environ['MINDSPORE_DUMP_CONFIG']
def run_constant_e2e_dump():
    """Run sync (e2e) dump for ConstantNet and verify the dumped constant tensor.

    Same checks as test_constant_async_ascend_dump, but driven by the
    'test_e2e_dump' (sync) config so it can run on GPU and Ascend.
    """
    if sys.platform != 'linux':
        return
    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
        dump_path = os.path.join(tmp_dir, 'constant_dump')
        dump_config_path = os.path.join(tmp_dir, 'constant_dump.json')
        generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
        if os.path.isdir(dump_path):
            shutil.rmtree(dump_path)
        net = ConstantNet()
        tensor = Tensor(np.random.random([1, 2, 3]))
        expect = net(tensor)
        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
        # Constants land in a dedicated 'constants' folder.
        constant_path = os.path.join(dump_path, 'rank_0', 'Net', '0', 'constants')
        assert os.path.exists(constant_path)
        assert len(os.listdir(constant_path)) == 1
        output_name = "Parameter.data-*.0.0.*.DefaultFormat.npy"
        output_path = glob.glob(os.path.join(constant_path, output_name))[0]
        real_path = os.path.realpath(output_path)
        output = np.load(real_path)
        assert np.array_equal(output, expect)
        del os.environ['MINDSPORE_DUMP_CONFIG']
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_constant_gpu_e2e_dump():
    """
    Feature: Constant sync dump
    Description: Test constant sync dump in GPU
    Expectation: constant dump folder is created, dump file has expected tensor info
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    run_constant_e2e_dump()
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
@security_off_wrap
def test_constant_ascend_e2e_dump():
    """
    Feature: Constant sync dump
    Description: Test constant sync dump in Ascend
    Expectation: constant dump folder is created, dump file has expected tensor info
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_constant_e2e_dump()