You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

debugger_utils.cc 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /**
  2. * Copyright 2021-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debugger/debugger_utils.h"
  17. #include <iostream>
  18. #include <vector>
  19. #include <memory>
  20. #include <string>
  21. #include "include/common/debug/anf_dump_utils.h"
  22. #include "debug/debugger/debugger.h"
  23. #include "plugin/device/gpu/hal/device/gpu_device_address.h"
  24. #include "debug/data_dump/dump_json_parser.h"
  25. #ifdef ENABLE_D
  26. #include "debug/dump_data_builder.h"
  27. #endif
  28. #include "backend/common/session/anf_runtime_algorithm.h"
  29. #include "include/common/utils/anfalgo.h"
  30. #include "kernel/kernel.h"
  31. #include "debug/data_dump/e2e_dump.h"
  32. #include "include/common/utils/config_manager.h"
  33. #include "backend/common/session/session_basic.h"
  34. constexpr int kFailure = 1;
  35. using mindspore::kernel::AddressPtr;
  36. using mindspore::kernel::KernelLaunchInfo;
  37. using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
  38. using KernelGraph = mindspore::session::KernelGraph;
  39. using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
  40. namespace mindspore {
  41. /*
  42. * Feature group: Online debugger.
  43. * Target device group: GPU.
  44. * Runtime category: MindRT.
  45. * Description: Returns a vector containing real output number.
  46. */
  47. std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  48. std::vector<size_t> real_outputs;
  49. // P.BatchNorm is used for training and inference
  50. // can add the filter list for more operators here....
  51. if (node_name == "BatchNorm") {
  52. MS_LOG(INFO) << "loading node named " << node_name;
  53. (void)real_outputs.insert(real_outputs.end(), {0, 3, 4});
  54. } else {
  55. // by default, TensorLoader will load all outputs
  56. for (size_t j = 0; j < output_size; ++j) {
  57. real_outputs.push_back(j);
  58. }
  59. }
  60. return real_outputs;
  61. }
  62. /*
  63. * Feature group: Dump, Online debugger.
  64. * Target device group: GPU.
  65. * Runtime category: MindRT.
  66. * Description: Get kernel inputs from launch_info and load the inputs from device to host.
  67. */
  68. void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, uint32_t root_graph_id,
  69. const DeviceContext *device_context) {
  70. // get inputs
  71. auto kernel_inputs = launch_info->inputs_;
  72. auto input_size = common::AnfAlgo::GetInputTensorNum(cnode);
  73. for (size_t j = 0; j < input_size; ++j) {
  74. auto input_kernel = cnode->input(j + 1);
  75. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  76. auto addr = kernel_inputs[j];
  77. auto type = common::AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
  78. // For example, this happens with the Depend op
  79. if (type == kMetaTypeNone) {
  80. continue;
  81. }
  82. auto format = kOpFormat_DEFAULT;
  83. auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector());
  84. string input_tensor_name = input_kernel_name + ':' + "0";
  85. ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
  86. auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), format, int_shapes, type, 0, true,
  87. root_graph_id, false);
  88. if (!ret) {
  89. MS_LOG(ERROR) << "LoadMemToHost:"
  90. << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
  91. }
  92. }
  93. }
  94. /*
  95. * Feature group: Dump, Online debugger.
  96. * Target device group: GPU.
  97. * Runtime category: MindRT.
  98. * Description: Get kernel outputs from launch_info and load the inputs from device to host.
  99. */
  100. void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
  101. uint32_t root_graph_id, const DeviceContext *device_context) {
  102. // get outputs
  103. auto kernel_outputs = launch_info->outputs_;
  104. auto output_size = common::AnfAlgo::GetOutputTensorNum(cnode);
  105. auto node_name = common::AnfAlgo::GetCNodeName(cnode);
  106. std::string kernel_name = GetKernelNodeName(cnode);
  107. std::vector<size_t> real_outputs = CheckRealOutput(node_name, output_size);
  108. for (size_t j : real_outputs) {
  109. auto addr = kernel_outputs[j];
  110. auto type = common::AnfAlgo::GetOutputInferDataType(cnode, j);
  111. // For example, this happens with the Depend op
  112. if (type == kMetaTypeNone) {
  113. continue;
  114. }
  115. auto format = kOpFormat_DEFAULT;
  116. auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector());
  117. string tensor_name = kernel_name + ':' + std::to_string(j);
  118. ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
  119. auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), format, int_shapes, type, j, false,
  120. root_graph_id, false);
  121. if (!ret) {
  122. MS_LOG(ERROR) << "LoadMemToHost:"
  123. << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
  124. }
  125. }
  126. }
  127. /*
  128. * Feature group: Dump, Online debugger.
  129. * Target device group: Ascend, GPU.
  130. * Runtime category: MindRT.
  131. * Description: Returns true if the node needs to be read for Dump or online debugger. This function is used by GPU
  132. * and Ascend kernel-by-kernel mindRT.
  133. */
  134. bool CheckReadData(const CNodePtr &cnode) {
  135. auto debugger = Debugger::GetInstance();
  136. if (!debugger) {
  137. return false;
  138. }
  139. bool read_data = false;
  140. auto &dump_json_parser = DumpJsonParser::GetInstance();
  141. bool dump_enabled = dump_json_parser.DumpEnabledForIter();
  142. MS_LOG(DEBUG) << "dump_enabled: " << dump_enabled;
  143. std::string kernel_name = GetKernelNodeName(cnode);
  144. if (dump_enabled) {
  145. if (dump_json_parser.NeedDump(kernel_name)) {
  146. read_data = true;
  147. }
  148. }
  149. if (debugger->debugger_enabled()) {
  150. read_data = debugger->ReadNodeDataRequired(cnode);
  151. }
  152. return read_data;
  153. }
  154. bool IsDeviceTargetGPU() {
  155. auto context = MsContext::GetInstance();
  156. MS_EXCEPTION_IF_NULL(context);
  157. return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
  158. }
  159. /*
  160. * Feature group: Dump, Online debugger.
  161. * Target device group: Ascend, GPU.
  162. * Runtime category: MindRT.
  163. * Description: Load inputs and outputs of the given node if needed and dump them if dump is enabled, then it performs
  164. * PostExecuteNode function on the given node for GPU.
  165. */
  166. void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
  167. const DeviceContext *device_context) {
  168. auto debugger = Debugger::GetInstance();
  169. if (!debugger) {
  170. return;
  171. }
  172. auto &dump_json_parser = DumpJsonParser::GetInstance();
  173. bool dump_enabled = dump_json_parser.DumpEnabledForIter();
  174. MS_LOG(DEBUG) << "dump_enabled: " << dump_enabled;
  175. auto kernel_graph = std::dynamic_pointer_cast<KernelGraph>(cnode->func_graph());
  176. MS_EXCEPTION_IF_NULL(kernel_graph);
  177. auto root_graph_id = kernel_graph->root_graph_id();
  178. if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
  179. LoadInputs(cnode, launch_info, exec_order, root_graph_id, device_context);
  180. }
  181. if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
  182. LoadOutputs(cnode, launch_info, exec_order, root_graph_id, device_context);
  183. }
  184. // Dump kernel
  185. if (dump_enabled) {
  186. MS_EXCEPTION_IF_NULL(kernel_graph);
  187. auto graph_id = kernel_graph->graph_id();
  188. // for GPU, nodes are dumped in graph_id directory.
  189. if (IsDeviceTargetGPU()) {
  190. debugger->DumpSingleNode(cnode, graph_id);
  191. } else {
  192. // for Ascend, node are dumped in root_graph_id directory.
  193. debugger->DumpSingleNode(cnode, root_graph_id, launch_info);
  194. }
  195. // Clear Dumped data when online debugger is not enabled
  196. if (!debugger->debugger_enabled()) {
  197. debugger->ClearCurrentData();
  198. }
  199. }
  200. if (IsDeviceTargetGPU()) {
  201. // check if the node is last kernel
  202. bool last_kernel = !common::AnfAlgo::IsInplaceNode(cnode, "skip");
  203. debugger->PostExecuteNode(cnode, last_kernel);
  204. }
  205. }
  206. /*
  207. * Feature group: Dump, Online Debugger.
  208. * Target device group: Ascend, GPU.
  209. * Runtime category: MindRT.
  210. * Description: Returns the error_info when sink_mode is true and we are in online debugger mode or dump mode for
  211. * GPU, if everything is normal the error_info string will be empty.
  212. */
  213. std::string CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
  214. std::string error_info = "";
  215. bool sink_mode = ConfigManager::GetInstance().dataset_mode() || graph_ptr->IsDatasetGraph();
  216. auto debugger = Debugger::GetInstance();
  217. if (debugger->CheckDebuggerDumpEnabled() && sink_mode && IsDeviceTargetGPU()) {
  218. error_info = "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  219. }
  220. if (debugger->CheckDebuggerEnabled() && sink_mode) {
  221. error_info = "Debugger is not supported with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  222. }
  223. return error_info;
  224. }
  225. /*
  226. * Feature group: Online Debugger.
  227. * Target device group: Ascend.
  228. * Runtime category: MindRT.
  229. * Description: Loads graph's outputs and parameters for Ascend super kernel mode.
  230. */
  231. void LoadDataForDebugger(const KernelGraphPtr &graph_ptr) {
  232. auto context = MsContext::GetInstance();
  233. MS_EXCEPTION_IF_NULL(context);
  234. if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
  235. return;
  236. }
  237. #ifdef ENABLE_DEBUGGER
  238. auto debugger = Debugger::GetInstance();
  239. MS_EXCEPTION_IF_NULL(debugger);
  240. if (!debugger->CheckDebuggerEnabled()) {
  241. return;
  242. }
  243. MS_LOG(INFO) << "Start load step";
  244. debugger->SetGraphPtr(graph_ptr);
  245. // load output
  246. debugger->LoadGraphOutputs();
  247. // load parameters
  248. debugger->LoadParametersAndConst();
  249. #endif
  250. }
// Triggers an end-to-end dump of the given kernel graph for the given rank (device) id
// by delegating to E2eDump::DumpData.
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(graph);
  E2eDump::DumpData(graph.get(), rank_id);
  MS_LOG(DEBUG) << "Finish!";
}
  257. uint32_t GetRankID() {
  258. uint32_t rank_id = 0;
  259. auto ms_context = MsContext::GetInstance();
  260. MS_EXCEPTION_IF_NULL(ms_context);
  261. auto env_rank_id = common::GetEnv("RANK_ID");
  262. if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
  263. // get actual rank id if it's distribution training case.
  264. rank_id = GetRankId();
  265. }
  266. return rank_id;
  267. }
// Dumps all data for the given graph in super-kernel (graph sink) mode.
// Compiled out entirely when the build defines ENABLE_SECURITY.
void SuperKernelE2eDump(const KernelGraphPtr &graph) {
#ifndef ENABLE_SECURITY
  Dump(graph, GetRankID());
#endif
}
  273. #ifdef ENABLE_D
  274. /*
  275. * Feature group: Dump.
  276. * Target device group: Ascend.
  277. * Runtime category: Old runtime, MindRT.
  278. * Description: It is a function to be registered to Adx server for a + m dump feature with the following steps:
  279. * 1) Merge chunks into one memory segment after receiving all the data for one node.
  280. * 2) Parse dump data object.
  281. * 3) Convert data from device to host format.
  282. * 4) Dump to disk based on configuration.
  283. */
  284. int32_t DumpDataCallBack(const DumpChunk *dump_chunk, int32_t size) {
  285. MS_LOG(DEBUG) << "ADX DumpDataCallBack is called";
  286. MS_LOG(DEBUG) << "The dump_chunk size is: " << size;
  287. string file_name = dump_chunk->fileName;
  288. uint32_t isLastChunk = dump_chunk->isLastChunk;
  289. // parse chunk header
  290. auto debugger = Debugger::GetInstance();
  291. MS_EXCEPTION_IF_NULL(debugger);
  292. auto dump_data_build = debugger->LoadDumpDataBuilder(file_name);
  293. if (dump_data_build == nullptr) {
  294. MS_LOG(ERROR) << "Failed to load dump data builder for node " << file_name;
  295. return 0;
  296. }
  297. if (!dump_data_build->CopyDumpChunk(dump_chunk)) {
  298. return 1;
  299. }
  300. if (isLastChunk == 1) {
  301. // construct dump data object
  302. debugger::dump::DumpData dump_data;
  303. std::vector<char> data_buf;
  304. if (!dump_data_build->ConstructDumpData(&dump_data, &data_buf)) {
  305. MS_LOG(ERROR) << "Failed to parse data for node " << file_name;
  306. return 0;
  307. }
  308. // convert and save to files
  309. auto separator = file_name.rfind("/");
  310. auto path_name = file_name.substr(0, separator);
  311. auto file_base_name = file_name.substr(separator + 1);
  312. if (file_base_name.rfind("Opdebug.Node_OpDebug.") == 0) {
  313. // save overflow data
  314. E2eDump::DumpOpDebugToFile(file_name, dump_data, data_buf.data());
  315. } else {
  316. // save tensor data
  317. // generate fully qualified file name
  318. // before: op_type.op_name.task_id.stream_id.timestamp
  319. // after: op_type.op_name_no_scope.task_id.stream_id.timestamp
  320. size_t first_dot = file_base_name.find(".");
  321. size_t second_dot = file_base_name.size();
  322. const int kNumDots = 3;
  323. int nth_dot_from_back = 0;
  324. while (nth_dot_from_back != kNumDots && second_dot != std::string::npos) {
  325. second_dot = file_base_name.rfind(".", second_dot - 1);
  326. nth_dot_from_back++;
  327. }
  328. if (first_dot == std::string::npos || second_dot == std::string::npos) {
  329. MS_LOG(ERROR) << "Failed to generate fully qualified file name for " << file_name;
  330. return 0;
  331. }
  332. auto op_type = file_base_name.substr(0, first_dot);
  333. auto task_stream_timestamp = file_base_name.substr(second_dot);
  334. std::string op_name = dump_data.op_name();
  335. auto op_name_no_scope = GetOpNameWithoutScope(op_name, "/");
  336. E2eDump::DumpTensorToFile(path_name + "/" + op_type + "." + op_name_no_scope + task_stream_timestamp, dump_data,
  337. data_buf.data());
  338. }
  339. debugger->ClearDumpDataBuilder(file_name);
  340. }
  341. return 0;
  342. }
  343. #endif
  344. } // namespace mindspore