You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_data_dump.py 10 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. import os
  16. import json
  17. import sys
  18. import tempfile
  19. import time
  20. import shutil
  21. import glob
  22. import numpy as np
  23. import pytest
  24. import mindspore.context as context
  25. import mindspore.nn as nn
  26. from mindspore import Tensor
  27. from mindspore.ops import operations as P
  28. from mindspore.nn import Cell
  29. from mindspore.nn import Dense
  30. from mindspore.nn import SoftmaxCrossEntropyWithLogits
  31. from mindspore.nn import Momentum
  32. from mindspore.nn import TrainOneStepCell
  33. from mindspore.nn import WithLossCell
  34. class Net(nn.Cell):
  35. def __init__(self):
  36. super(Net, self).__init__()
  37. self.add = P.Add()
  38. def construct(self, x_, y_):
  39. return self.add(x_, y_)
  40. x = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
  41. y = np.array([[7, 8, 9], [10, 11, 12]]).astype(np.float32)
  42. def change_current_dump_json(file_name, dump_path, dump_config_path):
  43. with open(file_name, 'r+') as f:
  44. data = json.load(f)
  45. data["common_dump_settings"]["path"] = dump_path
  46. with open(dump_config_path, 'w') as f:
  47. json.dump(data, f)
  48. @pytest.mark.level1
  49. @pytest.mark.platform_arm_ascend_training
  50. @pytest.mark.platform_x86_ascend_training
  51. @pytest.mark.env_onecard
  52. def test_async_dump():
  53. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  54. pwd = os.getcwd()
  55. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  56. dump_path = os.path.join(tmp_dir, 'async_dump')
  57. dump_config_path = os.path.join(tmp_dir, 'async_dump.json')
  58. change_current_dump_json('async_dump.json', dump_path, dump_config_path)
  59. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  60. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  61. if os.path.isdir(dump_path):
  62. shutil.rmtree(dump_path)
  63. add = Net()
  64. add(Tensor(x), Tensor(y))
  65. time.sleep(5)
  66. assert len(os.listdir(dump_file_path)) == 1
  67. def run_e2e_dump():
  68. if sys.platform != 'linux':
  69. return
  70. pwd = os.getcwd()
  71. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  72. dump_path = os.path.join(tmp_dir, 'e2e_dump')
  73. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  74. change_current_dump_json('e2e_dump.json', dump_path, dump_config_path)
  75. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  76. dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
  77. if os.path.isdir(dump_path):
  78. shutil.rmtree(dump_path)
  79. add = Net()
  80. add(Tensor(x), Tensor(y))
  81. if context.get_context("device_target") == "Ascend":
  82. assert len(os.listdir(dump_file_path)) == 5
  83. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  84. elif context.get_context("device_target") == "CPU":
  85. assert len(os.listdir(dump_file_path)) == 5
  86. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  87. else:
  88. assert len(os.listdir(dump_file_path)) == 3
  89. output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
  90. output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
  91. real_path = os.path.realpath(output_path)
  92. output = np.load(real_path)
  93. expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
  94. assert output.dtype == expect.dtype
  95. assert np.array_equal(output, expect)
  96. @pytest.mark.level0
  97. @pytest.mark.platform_arm_ascend_training
  98. @pytest.mark.platform_x86_ascend_training
  99. @pytest.mark.env_onecard
  100. def test_e2e_dump():
  101. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  102. run_e2e_dump()
  103. @pytest.mark.level0
  104. @pytest.mark.platform_arm_ascend_training
  105. @pytest.mark.platform_x86_ascend_training
  106. @pytest.mark.env_onecard
  107. def test_e2e_dump_with_hccl_env():
  108. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  109. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  110. os.environ["RANK_ID"] = "4"
  111. run_e2e_dump()
  112. @pytest.mark.level0
  113. @pytest.mark.platform_x86_cpu
  114. @pytest.mark.env_onecard
  115. def test_cpu_e2e_dump():
  116. context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
  117. run_e2e_dump()
  118. @pytest.mark.level0
  119. @pytest.mark.platform_x86_cpu
  120. @pytest.mark.env_onecard
  121. def test_cpu_e2e_dump_with_hccl_set():
  122. context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
  123. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  124. os.environ["RANK_ID"] = "4"
  125. run_e2e_dump()
  126. @pytest.mark.level0
  127. @pytest.mark.platform_x86_gpu_training
  128. @pytest.mark.env_onecard
  129. def test_gpu_e2e_dump():
  130. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  131. run_e2e_dump()
  132. @pytest.mark.level0
  133. @pytest.mark.platform_x86_gpu_training
  134. @pytest.mark.env_onecard
  135. def test_gpu_e2e_dump_with_hccl_set():
  136. context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
  137. os.environ["RANK_TABLE_FILE"] = "invalid_file.json"
  138. os.environ["RANK_ID"] = "4"
  139. run_e2e_dump()
  140. class ReluReduceMeanDenseRelu(Cell):
  141. def __init__(self, kernel, bias, in_channel, num_class):
  142. super().__init__()
  143. self.relu = P.ReLU()
  144. self.mean = P.ReduceMean(keep_dims=False)
  145. self.dense = Dense(in_channel, num_class, kernel, bias)
  146. def construct(self, x_):
  147. x_ = self.relu(x_)
  148. x_ = self.mean(x_, (2, 3))
  149. x_ = self.dense(x_)
  150. x_ = self.relu(x_)
  151. return x_
  152. def search_path(path, keyword):
  153. content = os.listdir(path)
  154. for each in content:
  155. each_path = path + os.sep + each
  156. if keyword in each:
  157. return each_path
  158. read_write = os.access(each_path, os.W_OK) and os.access(each_path, os.R_OK)
  159. if not read_write:
  160. continue
  161. if os.path.isdir(each_path):
  162. search_path(each_path, keyword)
  163. return None
  164. @pytest.mark.level0
  165. @pytest.mark.platform_arm_ascend_training
  166. @pytest.mark.platform_x86_ascend_training
  167. @pytest.mark.env_onecard
  168. def test_async_dump_net_multi_layer_mode1():
  169. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  170. test_name = "test_async_dump_net_multi_layer_mode1"
  171. json_file = os.path.join(os.getcwd(), "{}.json".format(test_name))
  172. rank_id = 0
  173. dump_full_path = os.path.join("/tmp/async_dump/", "{}_{}".format(test_name, rank_id))
  174. os.system("rm -rf {}/*".format(dump_full_path))
  175. os.environ["MINDSPORE_DUMP_CONFIG"] = json_file
  176. weight = Tensor(np.ones((1000, 2048)).astype(np.float32))
  177. bias = Tensor(np.ones((1000,)).astype(np.float32))
  178. net = ReluReduceMeanDenseRelu(weight, bias, 2048, 1000)
  179. criterion = SoftmaxCrossEntropyWithLogits(sparse=False)
  180. optimizer = Momentum(learning_rate=0.1, momentum=0.1,
  181. params=filter(lambda x: x.requires_grad, net.get_parameters()))
  182. net_with_criterion = WithLossCell(net, criterion)
  183. train_network = TrainOneStepCell(net_with_criterion, optimizer)
  184. train_network.set_train()
  185. inputs = Tensor(np.random.randn(32, 2048, 7, 7).astype(np.float32))
  186. label = Tensor(np.zeros(shape=(32, 1000)).astype(np.float32))
  187. net_dict = train_network(inputs, label)
  188. dump_path = "/tmp/async_dump/{}/rank_{}/test/0/0/".format(test_name, rank_id)
  189. dump_file = os.listdir(dump_path)
  190. dump_file_name = ""
  191. for file in dump_file:
  192. if "SoftmaxCrossEntropyWithLogits" in file:
  193. dump_file_name = file
  194. dump_file_full_path = os.path.join(dump_path, dump_file_name)
  195. npy_path = os.path.join(os.getcwd(), "./{}".format(test_name))
  196. if os.path.exists(npy_path):
  197. shutil.rmtree(npy_path)
  198. os.mkdir(npy_path)
  199. tool_path = search_path('/usr/local/Ascend', 'msaccucmp.pyc')
  200. if tool_path:
  201. cmd = "python {0} convert -d {1} -out {2}".format(tool_path, dump_file_full_path, npy_path)
  202. os.system(cmd)
  203. npy_file_list = os.listdir(npy_path)
  204. dump_result = {}
  205. for file in npy_file_list:
  206. if "output.0.npy" in file:
  207. dump_result["output0"] = np.load(os.path.join(npy_path, file))
  208. for index, value in enumerate(net_dict):
  209. assert value.asnumpy() == dump_result["output0"][index]
  210. else:
  211. print('not find convert tools msaccucmp.pyc')
  212. @pytest.mark.level0
  213. @pytest.mark.platform_arm_ascend_training
  214. @pytest.mark.platform_x86_ascend_training
  215. @pytest.mark.env_onecard
  216. def test_dump_with_diagnostic_path():
  217. """
  218. Test e2e dump when path is not set (set to empty) in dump json file and MS_DIAGNOSTIC_DATA_PATH is set.
  219. Data is expected to be dumped into MS_DIAGNOSTIC_DATA_PATH/debug_dump.
  220. """
  221. context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
  222. pwd = os.getcwd()
  223. with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
  224. dump_config_path = os.path.join(tmp_dir, 'e2e_dump.json')
  225. change_current_dump_json('e2e_dump.json', '', dump_config_path)
  226. os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
  227. diagnose_path = os.path.join(tmp_dir, 'e2e_dump')
  228. os.environ['MS_DIAGNOSTIC_DATA_PATH'] = diagnose_path
  229. dump_file_path = os.path.join(diagnose_path, 'debug_dump', 'rank_0', 'Net', '0', '0')
  230. if os.path.isdir(diagnose_path):
  231. shutil.rmtree(diagnose_path)
  232. add = Net()
  233. add(Tensor(x), Tensor(y))
  234. assert len(os.listdir(dump_file_path)) == 5