From 56f1bb1b66ecbc5ee7f3decf9222497fa75dabad Mon Sep 17 00:00:00 2001
From: jjfeing
Date: Sat, 31 Oct 2020 10:44:21 +0800
Subject: [PATCH] add dump data when task error

---
 .../ccsrc/debug/data_dump/e2e_dump_util.cc    | 94 +++++++++++--------
 .../ccsrc/debug/data_dump/e2e_dump_util.h     |  5 +
 .../device/ascend/ascend_kernel_runtime.cc    | 51 ++++++++--
 .../device/ascend/ascend_kernel_runtime.h     |  4 +-
 tests/ut/cpp/stub/runtime/runtime_stub.cc     |  2 +
 5 files changed, 107 insertions(+), 49 deletions(-)

diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc
index 09f305b431..72b060e3ef 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc
@@ -112,20 +112,26 @@ void E2eDumpUtil::DumpOutput(const session::KernelGraph *graph, const std::strin
       continue;
     }
     DumpJsonParser::GetInstance().MatchKernel(kernel_name);
-    GetFileKernelName(NOT_NULL(&kernel_name));
-    auto output_size = AnfAlgo::GetOutputTensorNum(node);
-    for (size_t j = 0; j < output_size; ++j) {
-      auto addr = AnfAlgo::GetOutputAddr(node, j);
-      ShapeVector int_shapes;
-      GetDumpIntShape(node, j, trans_flag, NOT_NULL(&int_shapes));
-      auto type = AnfAlgo::GetOutputInferDataType(node, j);
-      std::string file_path = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
-      if (IsDeviceTargetGPU()) {
-        DumpGPUMemToFile(file_path, node->fullname_with_scope(), NOT_NULL(addr), trans_flag, int_shapes, type, j,
-                         debugger);
-      } else {
-        DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
-      }
+    DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
+  }
+}
+
+void E2eDumpUtil::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                                 std::string *kernel_name, Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(node);
+  GetFileKernelName(NOT_NULL(kernel_name));
+  auto output_size = AnfAlgo::GetOutputTensorNum(node);
+  for (size_t j = 0; j < output_size; ++j) {
+    auto addr = AnfAlgo::GetOutputAddr(node, j);
+    ShapeVector int_shapes;
+    GetDumpIntShape(node, j, trans_flag, NOT_NULL(&int_shapes));
+    auto type = AnfAlgo::GetOutputInferDataType(node, j);
+    std::string file_path = dump_path + '/' + *kernel_name + '_' + "output_" + std::to_string(j);
+    if (IsDeviceTargetGPU()) {
+      DumpGPUMemToFile(file_path, node->fullname_with_scope(), NOT_NULL(addr), trans_flag, int_shapes, type, j,
+                       debugger);
+    } else {
+      DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
     }
   }
 }
@@ -147,35 +153,41 @@ void E2eDumpUtil::DumpInput(const session::KernelGraph *graph, const std::string
       continue;
     }
     DumpJsonParser::GetInstance().MatchKernel(kernel_name);
-    GetFileKernelName(NOT_NULL(&kernel_name));
-    auto input_size = AnfAlgo::GetInputTensorNum(node);
-    for (size_t j = 0; j < input_size; ++j) {
-      auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
-      auto input = kernel_with_index.first;
-      auto index = kernel_with_index.second;
-      auto addr = AnfAlgo::GetOutputAddr(input, index);
+    DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
+  }
+}
 
-      std::string tensor_name;
-      size_t slot;
-      if (IsDeviceTargetGPU()) {
-        auto input_kernel = node->input(j + 1);
-        std::string input_kernel_name = input_kernel->fullname_with_scope();
-        tensor_name = input_kernel_name;
-        slot = 0;
-      } else {
-        tensor_name = node->fullname_with_scope();
-        slot = j;
-      }
+void E2eDumpUtil::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                                std::string *kernel_name, Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(node);
+  GetFileKernelName(NOT_NULL(kernel_name));
+  auto input_size = AnfAlgo::GetInputTensorNum(node);
+  for (size_t j = 0; j < input_size; ++j) {
+    auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
+    auto input = kernel_with_index.first;
+    auto index = kernel_with_index.second;
+    auto addr = AnfAlgo::GetOutputAddr(input, index);
 
-      ShapeVector int_shapes;
-      GetDumpIntShape(input, index, trans_flag, NOT_NULL(&int_shapes));
-      auto type = AnfAlgo::GetOutputInferDataType(input, index);
-      std::string file_path = dump_path + '/' + kernel_name + '_' + "input_" + std::to_string(j);
-      if (IsDeviceTargetGPU()) {
-        DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), trans_flag, int_shapes, type, slot, debugger);
-      } else {
-        DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
-      }
+    std::string tensor_name;
+    size_t slot;
+    if (IsDeviceTargetGPU()) {
+      auto input_kernel = node->input(j + 1);
+      std::string input_kernel_name = input_kernel->fullname_with_scope();
+      tensor_name = input_kernel_name;
+      slot = 0;
+    } else {
+      tensor_name = node->fullname_with_scope();
+      slot = j;
+    }
+
+    ShapeVector int_shapes;
+    GetDumpIntShape(input, index, trans_flag, NOT_NULL(&int_shapes));
+    auto type = AnfAlgo::GetOutputInferDataType(input, index);
+    std::string file_path = dump_path + '/' + *kernel_name + '_' + "input_" + std::to_string(j);
+    if (IsDeviceTargetGPU()) {
+      DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), trans_flag, int_shapes, type, slot, debugger);
+    } else {
+      DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
     }
   }
 }
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump_util.h b/mindspore/ccsrc/debug/data_dump/e2e_dump_util.h
index 5af506be28..26f8cc9714 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump_util.h
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump_util.h
@@ -34,6 +34,11 @@ class E2eDumpUtil {
   ~E2eDumpUtil() = default;
   static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, Debugger *debugger = nullptr);
   static void GetFileKernelName(NotNull<std::string *> kernel_name);
+  // Dump data when a task error occurs.
+  static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                            std::string *kernel_name, Debugger *debugger);
+  static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
+                             std::string *kernel_name, Debugger *debugger);
 
  private:
   static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, Debugger *debugger);
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index bfc4592232..a5639e3e3b 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include "debug/data_dump/e2e_dump_util.h"
 #include "runtime/device/ascend/ascend_device_address.h"
 #include "runtime/device/cpu/mpi/mpi_interface.h"
 #include "utils/ms_context.h"
@@ -106,6 +107,7 @@ std::string GetRankId() {
 }
 }  // namespace
 
+std::vector<rtExceptionInfo> AscendKernelRuntime::exception_infos_;
 AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); }
 
 void AscendKernelRuntime::SetContext() {
@@ -268,6 +270,12 @@ bool AscendKernelRuntime::Init() {
   MS_EXCEPTION_IF_NULL(mem_manager_);
   mem_manager_->MallocDeviceMemory();
 
+  // Set the callback invoked by the runtime when a task exception occurs.
+  auto rt_ret = rtSetTaskFailCallback(ExceptionCallback);
+  if (rt_ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "SetTaskFailCallback failed, error: " << rt_ret;
+  }
+
   initialized_ = true;
   return ret;
 }
@@ -507,11 +515,41 @@ void AscendKernelRuntime::LaunchDataDump(GraphId graph_id) {
   }
 }
 
-void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
-  auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph_id);
-  for (auto iter : runtime_info_map) {
-    MS_LOG(WARNING) << "Task name:" << iter.first << " task_id:" << std::get<kTupleTaskId>(*iter.second)
-                    << " stream_id:" << std::get<kTupleStreamId>(*iter.second);
+void AscendKernelRuntime::ExceptionCallback(rtExceptionInfo *exception_info) {
+  static std::mutex exception_mutex;
+  std::lock_guard<std::mutex> lock(exception_mutex);
+  exception_infos_.push_back(*exception_info);
+}
+
+void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  std::vector<std::string> full_scope_name{};
+  // Find the node names (full scope names) of the failed tasks.
+  auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph->graph_id());
+  MS_LOG(ERROR) << "Exception_infos_ size: " << exception_infos_.size() << ". First exception info:"
+                << " task_id: " << exception_infos_.at(0).taskid << ", stream_id: " << exception_infos_.at(0).streamid
+                << ", tid: " << exception_infos_.at(0).tid << ", device_id: " << exception_infos_.at(0).deviceid;
+
+  for (const auto &exception_info : exception_infos_) {
+    for (const auto &iter : runtime_info_map) {
+      auto task_id = std::get<kTupleTaskId>(*iter.second);
+      auto stream_id = std::get<kTupleStreamId>(*iter.second);
+      if (task_id == exception_info.taskid && stream_id == exception_info.streamid) {
+        full_scope_name.push_back(iter.first);
+        MS_LOG(ERROR) << "Node: " << iter.first << ", run task error.";
+      }
+    }
+  }
+  // Dump error data to a local path.
+  const std::string local_path = std::string("./task_error_dump/") + std::to_string(exception_infos_.at(0).deviceid);
+  for (const auto &node : graph->execution_order()) {
+    for (auto &name : full_scope_name) {
+      if (node->fullname_with_scope() == name) {
+        MS_LOG(ERROR) << "Begin to dump node (" << name << ") task error input/output data in local path.";
+        E2eDumpUtil::DumpInputImpl(node, false, local_path, &name, nullptr);
+        E2eDumpUtil::DumpOutputImpl(node, false, local_path, &name, nullptr);
+      }
+    }
   }
 }
 
@@ -621,8 +659,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) {
 
   bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors);
   if (!status) {
-    MS_LOG(ERROR) << "Run task failed";
-    DebugTaskIdName(graph->graph_id());
+    DumpTaskExceptionInfo(graph);
     return false;
   }
   return true;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
index c91990eb62..259fb08c98 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@@ -71,9 +71,10 @@ class AscendKernelRuntime : public KernelRuntime {
   void ReleaseDeviceRes() override;
   bool GraphWithEmptyTaskList(const session::KernelGraph *graph) const;
   bool CheckGraphIdValid(GraphId graph_id) const;
-  static void DebugTaskIdName(GraphId graph_id);
   void DistributeDebugTask(NotNull<const session::KernelGraph *> graph, NotNull<std::function<void *()>> model_handle);
   void LaunchDataDump(GraphId graph_id);
+  static void DumpTaskExceptionInfo(const session::KernelGraph *graph);
+  static void ExceptionCallback(rtExceptionInfo *exception_info);
 
   rtContext_t rt_context_{nullptr};
   rtContext_t rt_context_hccl_{nullptr};
@@ -81,6 +82,7 @@ class AscendKernelRuntime : public KernelRuntime {
   unordered_map<GraphId, vector<std::shared_ptr<TaskInfo>>> task_map_;
   unordered_map<GraphId, std::shared_ptr<ge::model_runner::DavinciModel>> graph_model_map_;
   unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_;
+  static std::vector<rtExceptionInfo> exception_infos_;
 };
 
 MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime);
diff --git a/tests/ut/cpp/stub/runtime/runtime_stub.cc b/tests/ut/cpp/stub/runtime/runtime_stub.cc
index 255a200ad6..232a34aad6 100644
--- a/tests/ut/cpp/stub/runtime/runtime_stub.cc
+++ b/tests/ut/cpp/stub/runtime/runtime_stub.cc
@@ -146,3 +146,5 @@ RTS_API rtError_t rtProfilerStop(uint64_t profConfig, int32_t numsDev, uint32_t*
 
 int AdxDataDumpServerInit() { return 0; }
 int AdxDataDumpServerUnInit() { return 0; }
+
+RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) { return RT_ERROR_NONE; }
\ No newline at end of file
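Note on the mechanism introduced above: a callback is registered with rtSetTaskFailCallback at runtime
init; every rtExceptionInfo reported by the runtime is appended to a static, mutex-guarded vector, and
after RunModel() fails the recorded (task_id, stream_id) pairs are matched against ModelRunner's
runtime-info map to recover the full scope names of the failing nodes, whose inputs and outputs are
then dumped through E2eDumpUtil::DumpInputImpl/DumpOutputImpl. The standalone program below is only a
sketch of that flow: MockExceptionInfo, ExceptionCollector, and the plain std::map standing in for the
runtime-info map are invented for illustration and are not the real runtime API; the field names
taskid/streamid/deviceid simply mirror the ones used in this patch.

    // Minimal sketch of the "collect task exceptions, then map them back to nodes"
    // pattern used by this patch. All types here are mocks for illustration only.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <mutex>
    #include <string>
    #include <utility>
    #include <vector>

    struct MockExceptionInfo {  // stand-in for rtExceptionInfo
      uint32_t taskid;
      uint32_t streamid;
      uint32_t deviceid;
    };

    class ExceptionCollector {
     public:
      // Registered once at init (the patch uses rtSetTaskFailCallback); it may be
      // invoked from a runtime-owned thread, hence the mutex around the vector.
      static void Callback(const MockExceptionInfo *info) {
        static std::mutex mtx;
        std::lock_guard<std::mutex> lock(mtx);
        infos_.push_back(*info);
      }

      // After a failed run, map each recorded (task_id, stream_id) pair back to a
      // node full scope name so the failing kernel's data can be dumped.
      static std::vector<std::string> MatchFailedNodes(
          const std::map<std::string, std::pair<uint32_t, uint32_t>> &runtime_info) {
        std::vector<std::string> names;
        for (const auto &info : infos_) {
          for (const auto &kv : runtime_info) {
            if (kv.second.first == info.taskid && kv.second.second == info.streamid) {
              names.push_back(kv.first);
            }
          }
        }
        return names;
      }

     private:
      static std::vector<MockExceptionInfo> infos_;
    };

    std::vector<MockExceptionInfo> ExceptionCollector::infos_;

    int main() {
      // Pretend the runtime reported a failure on task 7, stream 2 of device 0.
      MockExceptionInfo info{7, 2, 0};
      ExceptionCollector::Callback(&info);

      // Runtime-info map: node full scope name -> (task_id, stream_id).
      std::map<std::string, std::pair<uint32_t, uint32_t>> runtime_info = {
          {"Default/Conv2D-op1", {7, 2}}, {"Default/ReLU-op2", {8, 2}}};

      for (const auto &name : ExceptionCollector::MatchFailedNodes(runtime_info)) {
        std::cout << "Node " << name << " hit a task error; dump its inputs/outputs here." << std::endl;
      }
      return 0;
    }

The static vector guarded by a mutex mirrors the patch's choice of a class-level exception_infos_
member: the fail callback as registered in this patch carries no user context argument, so the
collected exception records have to be reachable from static storage.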