You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

test_data_dump.py 28 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. import os
  16. import sys
  17. import tempfile
  18. import time
  19. import shutil
  20. import glob
  21. import csv
  22. from importlib import import_module
  23. from pathlib import Path
  24. import numpy as np
  25. import pytest
  26. import mindspore.context as context
  27. import mindspore.nn as nn
  28. import mindspore.ops as ops
  29. from mindspore import Tensor
  30. from mindspore.ops import operations as P, constexpr
  31. from mindspore.nn import Cell
  32. from mindspore.nn import Dense
  33. from mindspore.nn import SoftmaxCrossEntropyWithLogits
  34. from mindspore.nn import Momentum
  35. from mindspore.nn import TrainOneStepCell
  36. from mindspore.nn import WithLossCell
  37. from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \
  38. generate_statistic_dump_json, check_dump_structure, find_nth_pos
  39. from tests.security_utils import security_off_wrap
  40. class Net(nn.Cell):
  41. def __init__(self):
  42. super(Net, self).__init__()
  43. self.add = P.Add()
  44. def construct(self, x_, y_):
  45. return self.add(x_, y_)
  46. x = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
  47. y = np.array([[7, 8, 9], [10, 11, 12]]).astype(np.float32)
  48. def run_async_dump(test_name):
  49. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  50. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  51. dump_path = os.path.join(tmp_dir, 'async_dump')
  52. dump_config_path = os.path.join(tmp_dir, 'async_dump.json')
  53. generate_dump_json(dump_path, dump_config_path, test_name)
  54. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  55. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  56. if os.path.isdir(dump_path):
  57. shutil.rmtree(dump_path)
  58. add = Net()
  59. add(Tensor(x), Tensor(y))
  60. for _ in range(3):
  61. if not os.path.exists(dump_file_path):
  62. time.sleep(2)
  63. check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
  64. assert len(os.listdir(dump_file_path)) == 1
  65. del os.environ['MINDSPORE_DUMP_CONFIG']
  66. @pytest.mark.level1
  67. @pytest.mark.platform_arm_ascend_training
  68. @pytest.mark.platform_x86_ascend_training
  69. @pytest.mark.env_onecard
  70. @security_off_wrap
  71. def test_async_dump():
  72. """
  73. Feature: async dump on Ascend
  74. Description: test async dump with default file_format value ("bin")
  75. Expectation: dump data are generated as protobuf file format (suffix with timestamp)
  76. """
  77. run_async_dump("test_async_dump")
  78. def run_e2e_dump():
  79. if sys.platform != 'linux':
  80. return
  81. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  82. dump_path = os.path.join(tmp_dir, 'e2e_dump')
  83. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  84. generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
  85. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  86. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  87. if os.path.isdir(dump_path):
  88. shutil.rmtree(dump_path)
  89. add = Net()
  90. add(Tensor(x), Tensor(y))
  91. if context.get_context("device_target") == "Ascend":
  92. assert len(os.listdir(dump_file_path)) == 3
  93. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  94. elif context.get_context("device_target") == "CPU":
  95. assert len(os.listdir(dump_file_path)) == 5
  96. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  97. else:
  98. assert len(os.listdir(dump_file_path)) == 3
  99. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  100. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  101. real_path = os.path.realpath(output_path)
  102. output = np.load(real_path)
  103. expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
  104. assert output.dtype == expect.dtype
  105. assert np.array_equal(output, expect)
  106. for _ in range(3):
  107. if not os.path.exists(dump_file_path):
  108. time.sleep(2)
  109. check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
  110. del os.environ['MINDSPORE_DUMP_CONFIG']
  111. @pytest.mark.level0
  112. @pytest.mark.platform_arm_ascend_training
  113. @pytest.mark.platform_x86_ascend_training
  114. @pytest.mark.env_onecard
  115. @security_off_wrap
  116. def test_e2e_dump():
  117. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  118. run_e2e_dump()
  119. @pytest.mark.level0
  120. @pytest.mark.platform_arm_ascend_training
  121. @pytest.mark.platform_x86_ascend_training
  122. @pytest.mark.env_onecard
  123. @security_off_wrap
  124. def test_e2e_dump_with_hccl_env():
  125. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  126. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  127. os.environ["RANK_ID"] = "4"
  128. run_e2e_dump()
  129. del os.environ['RANK_TABLE_FILE']
  130. del os.environ['RANK_ID']
  131. @pytest.mark.level0
  132. @pytest.mark.platform_x86_cpu
  133. @pytest.mark.env_onecard
  134. @security_off_wrap
  135. def test_cpu_e2e_dump():
  136. context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
  137. run_e2e_dump()
  138. @pytest.mark.level0
  139. @pytest.mark.platform_x86_cpu
  140. @pytest.mark.env_onecard
  141. @security_off_wrap
  142. def test_cpu_e2e_dump_with_hccl_set():
  143. context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
  144. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  145. os.environ["RANK_ID"] = "4"
  146. run_e2e_dump()
  147. del os.environ['RANK_TABLE_FILE']
  148. del os.environ['RANK_ID']
  149. @pytest.mark.level0
  150. @pytest.mark.platform_x86_gpu_training
  151. @pytest.mark.env_onecard
  152. @security_off_wrap
  153. def test_gpu_e2e_dump():
  154. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  155. run_e2e_dump()
  156. @pytest.mark.level0
  157. @pytest.mark.platform_x86_gpu_training
  158. @pytest.mark.env_onecard
  159. @security_off_wrap
  160. def test_gpu_e2e_dump_with_hccl_set():
  161. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  162. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  163. os.environ["RANK_ID"] = "4"
  164. run_e2e_dump()
  165. del os.environ['RANK_TABLE_FILE']
  166. del os.environ['RANK_ID']
  167. class ReluReduceMeanDenseRelu(Cell):
  168. def __init__(self, kernel, bias, in_channel, num_class):
  169. super().__init__()
  170. self.relu = P.ReLU()
  171. self.mean = P.ReduceMean(keep_dims=False)
  172. self.dense = Dense(in_channel, num_class, kernel, bias)
  173. def construct(self, x_):
  174. x_ = self.relu(x_)
  175. x_ = self.mean(x_, (2, 3))
  176. x_ = self.dense(x_)
  177. x_ = self.relu(x_)
  178. return x_
  179. @pytest.mark.level0
  180. @pytest.mark.platform_arm_ascend_training
  181. @pytest.mark.platform_x86_ascend_training
  182. @pytest.mark.env_onecard
  183. @security_off_wrap
  184. def test_async_dump_net_multi_layer_mode1():
  185. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  186. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  187. dump_path = os.path.join(tmp_dir, 'async_dump_net_multi_layer_mode1')
  188. json_file_path = os.path.join(tmp_dir, "test_async_dump_net_multi_layer_mode1.json")
  189. generate_dump_json(dump_path, json_file_path, 'test_async_dump_net_multi_layer_mode1')
  190. os.environ['MINDSPORE_DUMP_CONFIG'] = json_file_path
  191. weight = Tensor(np.ones((1000, 2048)).astype(np.float32))
  192. bias = Tensor(np.ones((1000,)).astype(np.float32))
  193. net = ReluReduceMeanDenseRelu(weight, bias, 2048, 1000)
  194. criterion = SoftmaxCrossEntropyWithLogits(sparse=False)
  195. optimizer = Momentum(learning_rate=0.1, momentum=0.1,
  196. params=filter(lambda x: x.requires_grad, net.get_parameters()))
  197. net_with_criterion = WithLossCell(net, criterion)
  198. train_network = TrainOneStepCell(net_with_criterion, optimizer)
  199. train_network.set_train()
  200. inputs = Tensor(np.random.randn(32, 2048, 7, 7).astype(np.float32))
  201. label = Tensor(np.zeros(shape=(32, 1000)).astype(np.float32))
  202. net_dict = train_network(inputs, label)
  203. dump_file_path = os.path.join(dump_path, 'rank_0', 'test', '0', '0')
  204. dump_file_name = list(Path(dump_file_path).rglob("*SoftmaxCrossEntropyWithLogits*"))[0]
  205. dump_file_full_path = os.path.join(dump_file_path, dump_file_name)
  206. npy_path = os.path.join(dump_path, "npy_files")
  207. if os.path.exists(npy_path):
  208. shutil.rmtree(npy_path)
  209. os.mkdir(npy_path)
  210. tool_path_search_list = list(Path('/usr/local/Ascend').rglob('msaccucmp.py*'))
  211. if tool_path_search_list:
  212. converter = import_module("mindspore.offline_debug.convert_async")
  213. converter.AsyncDumpConverter([dump_file_full_path], npy_path).convert_files()
  214. npy_result_file = list(Path(npy_path).rglob("*output.0.*.npy"))[0]
  215. dump_result = np.load(os.path.join(npy_path, npy_result_file))
  216. for index, value in enumerate(net_dict):
  217. assert value.asnumpy() == dump_result[index]
  218. else:
  219. print('Failed to find hisi convert tools: msaccucmp.py or msaccucmp.pyc.')
  220. del os.environ['MINDSPORE_DUMP_CONFIG']
  221. @pytest.mark.level0
  222. @pytest.mark.platform_arm_ascend_training
  223. @pytest.mark.platform_x86_ascend_training
  224. @pytest.mark.env_onecard
  225. @security_off_wrap
  226. def test_dump_with_diagnostic_path():
  227. """
  228. Test e2e dump when path is not set (set to empty) in dump json file and MS_DIAGNOSTIC_DATA_PATH is set.
  229. Data is expected to be dumped into MS_DIAGNOSTIC_DATA_PATH/debug_dump.
  230. """
  231. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  232. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  233. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  234. generate_dump_json('', dump_config_path, 'test_e2e_dump')
  235. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  236. diagnose_path = os.path.join(tmp_dir, 'e2e_dump')
  237. os.environ['MS_DIAGNOSTIC_DATA_PATH'] = diagnose_path
  238. dump_file_path = os.path.join(diagnose_path, 'debug_dump', 'rank_0', 'Net', '0', '0')
  239. if os.path.isdir(diagnose_path):
  240. shutil.rmtree(diagnose_path)
  241. add = Net()
  242. add(Tensor(x), Tensor(y))
  243. assert len(os.listdir(dump_file_path)) == 3
  244. del os.environ['MINDSPORE_DUMP_CONFIG']
  245. del os.environ['MS_DIAGNOSTIC_DATA_PATH']
  246. def run_e2e_dump_execution_graph():
  247. """Run e2e dump and check execution order."""
  248. if sys.platform != 'linux':
  249. return
  250. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  251. dump_path = os.path.join(tmp_dir, 'e2e_dump_exe_graph')
  252. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  253. generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
  254. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  255. if os.path.isdir(dump_path):
  256. shutil.rmtree(dump_path)
  257. add = Net()
  258. add(Tensor(x), Tensor(y))
  259. exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
  260. assert len(os.listdir(exe_graph_path)) == 2
  261. del os.environ['MINDSPORE_DUMP_CONFIG']
  262. @pytest.mark.level0
  263. @pytest.mark.platform_x86_gpu_training
  264. @pytest.mark.env_onecard
  265. @security_off_wrap
  266. def test_dump_with_execution_graph():
  267. """Test dump with execution graph on GPU."""
  268. context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
  269. run_e2e_dump_execution_graph()
  270. def run_overflow_dump():
  271. """Run async dump and generate overflow"""
  272. if sys.platform != 'linux':
  273. return
  274. overflow_x = np.array([60000, 60000]).astype(np.float16)
  275. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  276. dump_path = os.path.join(tmp_dir, 'overflow_dump')
  277. dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
  278. generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
  279. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  280. if os.path.isdir(dump_path):
  281. shutil.rmtree(dump_path)
  282. add = Net()
  283. add(Tensor(overflow_x), Tensor(overflow_x))
  284. exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  285. for _ in range(5):
  286. if not os.path.exists(exe_graph_path):
  287. time.sleep(2)
  288. check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
  289. # check if overflow dump generate exact two files, and the naming format
  290. assert len(os.listdir(exe_graph_path)) == 2
  291. output_path = glob.glob(os.path.join(exe_graph_path, "Add.Default_Add-op0.*.*.*"))[0]
  292. overflow_path = glob.glob(os.path.join(exe_graph_path, "Opdebug.Node_OpDebug.*.*.*"))[0]
  293. assert output_path
  294. assert overflow_path
  295. # check if generated files have matching task and stream id
  296. output_file_name = os.path.split(output_path)
  297. overflow_file_name = os.path.split(overflow_path)
  298. output_second_dot_pos = find_nth_pos(output_file_name[1], ".", 2)
  299. output_third_dot_pos = find_nth_pos(output_file_name[1], ".", 3)
  300. output_fourth_dot_pos = find_nth_pos(output_file_name[1], ".", 4)
  301. output_task_id = output_file_name[1][output_second_dot_pos+1:output_third_dot_pos]
  302. output_stream_id = output_file_name[1][output_third_dot_pos+1:output_fourth_dot_pos]
  303. overflow_second_dot_pos = find_nth_pos(overflow_file_name[1], ".", 2)
  304. overflow_third_dot_pos = find_nth_pos(overflow_file_name[1], ".", 3)
  305. overflow_fourth_dot_pos = find_nth_pos(overflow_file_name[1], ".", 4)
  306. overflow_task_id = overflow_file_name[1][overflow_second_dot_pos+1:overflow_third_dot_pos]
  307. overflow_stream_id = overflow_file_name[1][overflow_third_dot_pos+1:overflow_fourth_dot_pos]
  308. assert output_task_id == overflow_task_id
  309. assert output_stream_id == overflow_stream_id
  310. # check if overflow dump file contains same task and stream id as file name
  311. with open(overflow_path, 'rb') as f:
  312. f.seek(321, 0)
  313. raw_data = f.read()
  314. task_id_infile = int.from_bytes(raw_data[24:25], 'little')
  315. stream_id_infile = int.from_bytes(raw_data[16:17], 'little')
  316. assert output_task_id == str(task_id_infile)
  317. assert output_stream_id == str(stream_id_infile)
  318. del os.environ['MINDSPORE_DUMP_CONFIG']
  319. def run_not_overflow_dump():
  320. """Run async dump and not generate overflow"""
  321. if sys.platform != 'linux':
  322. return
  323. overflow_x = np.array([60000, 60000]).astype(np.float16)
  324. overflow_y = np.array([2, 2]).astype(np.float16)
  325. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  326. dump_path = os.path.join(tmp_dir, 'overflow_dump')
  327. dump_config_path = os.path.join(tmp_dir, 'overflow_dump.json')
  328. generate_dump_json_with_overflow(dump_path, dump_config_path, 'test_async_dump', 3)
  329. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  330. if os.path.isdir(dump_path):
  331. shutil.rmtree(dump_path)
  332. add = Net()
  333. add(Tensor(overflow_x), Tensor(overflow_y))
  334. exe_graph_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  335. # check no overflow is happening, and path should not be generated
  336. assert not os.path.exists(exe_graph_path)
  337. del os.environ['MINDSPORE_DUMP_CONFIG']
  338. @pytest.mark.level0
  339. @pytest.mark.platform_arm_ascend_training
  340. @pytest.mark.platform_x86_ascend_training
  341. @pytest.mark.env_onecard
  342. @security_off_wrap
  343. def test_ascend_overflow_dump():
  344. """
  345. Feature: Overflow Dump
  346. Description: Test overflow dump
  347. Expectation: Overflow is occurred, and overflow dump file is in correct format
  348. """
  349. context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
  350. run_overflow_dump()
  351. @pytest.mark.level0
  352. @pytest.mark.platform_arm_ascend_training
  353. @pytest.mark.platform_x86_ascend_training
  354. @pytest.mark.env_onecard
  355. @security_off_wrap
  356. def test_ascend_not_overflow_dump():
  357. """
  358. Feature: Overflow Dump
  359. Description: Test overflow dump
  360. Expectation: Overflow is not occurred, and overflow dump file is not generated
  361. """
  362. context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
  363. run_not_overflow_dump()
  364. def check_statistic_dump(dump_file_path):
  365. output_name = "statistic.csv"
  366. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  367. real_path = os.path.realpath(output_path)
  368. with open(real_path) as f:
  369. reader = csv.DictReader(f)
  370. stats = list(reader)
  371. num_tensors = len(stats)
  372. assert num_tensors == 3
  373. for tensor in stats:
  374. if (tensor['IO'] == 'input' and tensor['Slot'] == 0):
  375. assert tensor['Min Value'] == '1'
  376. assert tensor['Max Value'] == '6'
  377. elif (tensor['IO'] == 'input' and tensor['Slot'] == 1):
  378. assert tensor['Min Value'] == '7'
  379. assert tensor['Max Value'] == '12'
  380. elif (tensor['IO'] == 'output' and tensor['Slot'] == 0):
  381. assert tensor['Min Value'] == '8'
  382. assert tensor['Max Value'] == '18'
  383. def check_data_dump(dump_file_path):
  384. output_name = "Add.Add-op*.output.0.*.npy"
  385. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  386. real_path = os.path.realpath(output_path)
  387. output = np.load(real_path)
  388. expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
  389. assert np.array_equal(output, expect)
  390. def run_saved_data_dump_test(scenario, saved_data):
  391. """Run e2e dump on scenario, testing statistic dump"""
  392. if sys.platform != 'linux':
  393. return
  394. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  395. dump_path = os.path.join(tmp_dir, 'test_saved_data')
  396. dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json')
  397. generate_statistic_dump_json(dump_path, dump_config_path, scenario, saved_data)
  398. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  399. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  400. if os.path.isdir(dump_path):
  401. shutil.rmtree(dump_path)
  402. add = Net()
  403. add(Tensor(x), Tensor(y))
  404. for _ in range(3):
  405. if not os.path.exists(dump_file_path):
  406. time.sleep(2)
  407. check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
  408. if saved_data in ('statistic', 'full'):
  409. check_statistic_dump(dump_file_path)
  410. if saved_data in ('tensor', 'full'):
  411. check_data_dump(dump_file_path)
  412. if saved_data == 'statistic':
  413. # assert only file is statistic.csv, tensor data is not saved
  414. assert len(os.listdir(dump_file_path)) == 1
  415. elif saved_data == 'tensor':
  416. # assert only tensor data is saved, not statistics
  417. stat_path = os.path.join(dump_file_path, 'statistic.csv')
  418. assert not os.path.isfile(stat_path)
  419. del os.environ['MINDSPORE_DUMP_CONFIG']
  420. @pytest.mark.level0
  421. @pytest.mark.platform_x86_gpu_training
  422. @pytest.mark.env_onecard
  423. @security_off_wrap
  424. def test_gpu_e2e_statistic_dump():
  425. """
  426. Feature: GPU Statistics Dump
  427. Description: Test GPU statistics dump
  428. Expectation: Statistics are stored in statistic.csv files
  429. """
  430. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  431. run_saved_data_dump_test('test_gpu_e2e_dump', 'statistic')
  432. @pytest.mark.level0
  433. @pytest.mark.platform_x86_gpu_training
  434. @pytest.mark.env_onecard
  435. @security_off_wrap
  436. def test_gpu_e2e_tensor_dump():
  437. """
  438. Feature: GPU Tensor Dump
  439. Description: Test GPU tensor dump
  440. Expectation: Tensor data are stored in npy files
  441. """
  442. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  443. run_saved_data_dump_test('test_gpu_e2e_dump', 'tensor')
  444. @pytest.mark.level0
  445. @pytest.mark.platform_x86_gpu_training
  446. @pytest.mark.env_onecard
  447. @security_off_wrap
  448. def test_gpu_e2e_full_dump():
  449. """
  450. Feature: GPU Full Dump
  451. Description: Test GPU full dump
  452. Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv
  453. """
  454. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  455. run_saved_data_dump_test('test_gpu_e2e_dump', 'full')
  456. @pytest.mark.level0
  457. @pytest.mark.platform_x86_gpu_training
  458. @pytest.mark.env_onecard
  459. @security_off_wrap
  460. def test_stat_dump_nulls():
  461. """
  462. Feature: GPU Statistics Dump
  463. Description: Test GPU statistics dump when printing tensors full with NaNs and Infs
  464. Expectation: Min, Max, Avg Values stored in statistic.csv show null for such tensors
  465. """
  466. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  467. if sys.platform != 'linux':
  468. return
  469. empty_x = np.array([]).astype(np.float16)
  470. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  471. dump_path = os.path.join(tmp_dir, 'test_saved_data')
  472. dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json')
  473. generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', 'statistic')
  474. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  475. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  476. if os.path.isdir(dump_path):
  477. shutil.rmtree(dump_path)
  478. add = Net()
  479. add(Tensor(empty_x), Tensor(empty_x))
  480. for _ in range(3):
  481. if not os.path.exists(dump_file_path):
  482. time.sleep(2)
  483. # check dumped data
  484. output_path = glob.glob(os.path.join(dump_file_path, 'statistic.csv'))[0]
  485. real_path = os.path.realpath(output_path)
  486. with open(real_path) as f:
  487. reader = csv.DictReader(f)
  488. [output] = list(reader)
  489. assert output['IO'] == 'output'
  490. assert output['Min Value'] == 'null'
  491. assert output['Max Value'] == 'null'
  492. assert output['Avg Value'] == 'null'
  493. @pytest.mark.level0
  494. @pytest.mark.platform_arm_ascend_training
  495. @pytest.mark.platform_x86_ascend_training
  496. @pytest.mark.env_onecard
  497. @security_off_wrap
  498. def test_ascend_statistic_dump():
  499. """
  500. Feature: Ascend Statistics Dump
  501. Description: Test Ascend statistics dump
  502. Expectation: Statistics are stored in statistic.csv files
  503. """
  504. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  505. run_saved_data_dump_test('test_async_dump', 'statistic')
  506. @pytest.mark.level0
  507. @pytest.mark.platform_arm_ascend_training
  508. @pytest.mark.platform_x86_ascend_training
  509. @pytest.mark.env_onecard
  510. @security_off_wrap
  511. def test_ascend_statistic_dump_kernel_by_kernel():
  512. """
  513. Feature: Ascend Statistics Dump in kernel by kernel (mindRT) mode
  514. Description: Test Ascend statistics dump
  515. Expectation: Statistics are stored in statistic.csv files
  516. """
  517. # set env `GRAPH_OP_RUN`` to enable kernel-by-kernel mode.
  518. os.environ['GRAPH_OP_RUN'] = "1"
  519. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  520. run_saved_data_dump_test('test_async_dump', 'statistic')
  521. del os.environ['GRAPH_OP_RUN']
  522. @pytest.mark.level0
  523. @pytest.mark.platform_arm_ascend_training
  524. @pytest.mark.platform_x86_ascend_training
  525. @pytest.mark.env_onecard
  526. @security_off_wrap
  527. def test_ascend_tensor_dump():
  528. """
  529. Feature: Ascend Tensor Dump
  530. Description: Test Ascend tensor dump
  531. Expectation: Tensors are stored in npy files
  532. """
  533. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  534. run_saved_data_dump_test('test_async_dump', 'tensor')
  535. @pytest.mark.level0
  536. @pytest.mark.platform_arm_ascend_training
  537. @pytest.mark.platform_x86_ascend_training
  538. @pytest.mark.env_onecard
  539. @security_off_wrap
  540. def test_ascend_full_dump():
  541. """
  542. Feature: Ascend Full Dump
  543. Description: Test Ascend full dump
  544. Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv
  545. """
  546. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  547. run_saved_data_dump_test('test_async_dump', 'full')
  548. @constexpr
  549. def construct_tensor(cst):
  550. return Tensor(np.array(cst))
  551. class ConstantNet(nn.Cell):
  552. def __init__(self):
  553. super(ConstantNet, self).__init__()
  554. self.relu = ops.ReLU()
  555. def construct(self, x_):
  556. return self.relu(construct_tensor(ops.shape(x_)))
  557. @pytest.mark.level0
  558. @pytest.mark.platform_arm_ascend_training
  559. @pytest.mark.platform_x86_ascend_training
  560. @pytest.mark.env_onecard
  561. def test_constant_async_ascend_dump():
  562. """
  563. Feature: Constant async dump
  564. Description: Test async constant dump in Ascend
  565. Expectation: constant dump folder is created, dump file has expected tensor info
  566. """
  567. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  568. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  569. dump_path = os.path.join(tmp_dir, 'constant_dump')
  570. dump_config_path = os.path.join(tmp_dir, 'constant_dump.json')
  571. generate_dump_json(dump_path, dump_config_path, 'test_async_dump')
  572. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  573. if os.path.isdir(dump_path):
  574. shutil.rmtree(dump_path)
  575. net = ConstantNet()
  576. tensor = Tensor(np.random.random([1, 2, 3]))
  577. expect = net(tensor)
  578. check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
  579. constant_path = os.path.join(dump_path, 'rank_0', 'Net', '0', 'constants')
  580. assert os.path.exists(constant_path)
  581. assert len(os.listdir(constant_path)) == 1
  582. output_name = "Parameter.data-*.0.0.*.DefaultFormat.npy"
  583. output_path = glob.glob(os.path.join(constant_path, output_name))[0]
  584. real_path = os.path.realpath(output_path)
  585. output = np.load(real_path)
  586. assert np.array_equal(output, expect)
  587. del os.environ['MINDSPORE_DUMP_CONFIG']
  588. def run_constant_e2e_dump():
  589. if sys.platform != 'linux':
  590. return
  591. with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
  592. dump_path = os.path.join(tmp_dir, 'constant_dump')
  593. dump_config_path = os.path.join(tmp_dir, 'constant_dump.json')
  594. generate_dump_json(dump_path, dump_config_path, 'test_e2e_dump')
  595. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  596. if os.path.isdir(dump_path):
  597. shutil.rmtree(dump_path)
  598. net = ConstantNet()
  599. tensor = Tensor(np.random.random([1, 2, 3]))
  600. expect = net(tensor)
  601. check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
  602. constant_path = os.path.join(dump_path, 'rank_0', 'Net', '0', 'constants')
  603. assert os.path.exists(constant_path)
  604. assert len(os.listdir(constant_path)) == 1
  605. output_name = "Parameter.data-*.0.0.*.DefaultFormat.npy"
  606. output_path = glob.glob(os.path.join(constant_path, output_name))[0]
  607. real_path = os.path.realpath(output_path)
  608. output = np.load(real_path)
  609. assert np.array_equal(output, expect)
  610. del os.environ['MINDSPORE_DUMP_CONFIG']
  611. @pytest.mark.level0
  612. @pytest.mark.platform_x86_gpu_training
  613. @pytest.mark.env_onecard
  614. @security_off_wrap
  615. def test_constant_gpu_e2e_dump():
  616. """
  617. Feature: Constant sync dump
  618. Description: Test constant sync dump in GPU
  619. Expectation: constant dump folder is created, dump file has expected tensor info
  620. """
  621. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  622. run_constant_e2e_dump()
  623. @pytest.mark.level0
  624. @pytest.mark.platform_arm_ascend_training
  625. @pytest.mark.platform_x86_ascend_training
  626. @pytest.mark.env_onecard
  627. @security_off_wrap
  628. def test_constant_ascend_e2e_dump():
  629. """
  630. Feature: Constant sync dump
  631. Description: Test constant sync dump in Ascend
  632. Expectation: constant dump folder is created, dump file has expected tensor info
  633. """
  634. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  635. run_constant_e2e_dump()