Merge pull request !8030 from jjfeing/mastertags/v1.1.0
| @@ -112,20 +112,26 @@ void E2eDumpUtil::DumpOutput(const session::KernelGraph *graph, const std::strin | |||||
| continue; | continue; | ||||
| } | } | ||||
| DumpJsonParser::GetInstance().MatchKernel(kernel_name); | DumpJsonParser::GetInstance().MatchKernel(kernel_name); | ||||
| GetFileKernelName(NOT_NULL(&kernel_name)); | |||||
| auto output_size = AnfAlgo::GetOutputTensorNum(node); | |||||
| for (size_t j = 0; j < output_size; ++j) { | |||||
| auto addr = AnfAlgo::GetOutputAddr(node, j); | |||||
| ShapeVector int_shapes; | |||||
| GetDumpIntShape(node, j, trans_flag, NOT_NULL(&int_shapes)); | |||||
| auto type = AnfAlgo::GetOutputInferDataType(node, j); | |||||
| std::string file_path = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j); | |||||
| if (IsDeviceTargetGPU()) { | |||||
| DumpGPUMemToFile(file_path, node->fullname_with_scope(), NOT_NULL(addr), trans_flag, int_shapes, type, j, | |||||
| debugger); | |||||
| } else { | |||||
| DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type); | |||||
| } | |||||
| DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger); | |||||
| } | |||||
| } | |||||
| void E2eDumpUtil::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, | |||||
| std::string *kernel_name, Debugger *debugger) { | |||||
| MS_EXCEPTION_IF_NULL(node); | |||||
| GetFileKernelName(NOT_NULL(kernel_name)); | |||||
| auto output_size = AnfAlgo::GetOutputTensorNum(node); | |||||
| for (size_t j = 0; j < output_size; ++j) { | |||||
| auto addr = AnfAlgo::GetOutputAddr(node, j); | |||||
| ShapeVector int_shapes; | |||||
| GetDumpIntShape(node, j, trans_flag, NOT_NULL(&int_shapes)); | |||||
| auto type = AnfAlgo::GetOutputInferDataType(node, j); | |||||
| std::string file_path = dump_path + '/' + *kernel_name + '_' + "output_" + std::to_string(j); | |||||
| if (IsDeviceTargetGPU()) { | |||||
| DumpGPUMemToFile(file_path, node->fullname_with_scope(), NOT_NULL(addr), trans_flag, int_shapes, type, j, | |||||
| debugger); | |||||
| } else { | |||||
| DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -147,35 +153,41 @@ void E2eDumpUtil::DumpInput(const session::KernelGraph *graph, const std::string | |||||
| continue; | continue; | ||||
| } | } | ||||
| DumpJsonParser::GetInstance().MatchKernel(kernel_name); | DumpJsonParser::GetInstance().MatchKernel(kernel_name); | ||||
| GetFileKernelName(NOT_NULL(&kernel_name)); | |||||
| auto input_size = AnfAlgo::GetInputTensorNum(node); | |||||
| for (size_t j = 0; j < input_size; ++j) { | |||||
| auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j); | |||||
| auto input = kernel_with_index.first; | |||||
| auto index = kernel_with_index.second; | |||||
| auto addr = AnfAlgo::GetOutputAddr(input, index); | |||||
| DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger); | |||||
| } | |||||
| } | |||||
| std::string tensor_name; | |||||
| size_t slot; | |||||
| if (IsDeviceTargetGPU()) { | |||||
| auto input_kernel = node->input(j + 1); | |||||
| std::string input_kernel_name = input_kernel->fullname_with_scope(); | |||||
| tensor_name = input_kernel_name; | |||||
| slot = 0; | |||||
| } else { | |||||
| tensor_name = node->fullname_with_scope(); | |||||
| slot = j; | |||||
| } | |||||
| void E2eDumpUtil::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, | |||||
| std::string *kernel_name, Debugger *debugger) { | |||||
| MS_EXCEPTION_IF_NULL(node); | |||||
| GetFileKernelName(NOT_NULL(kernel_name)); | |||||
| auto input_size = AnfAlgo::GetInputTensorNum(node); | |||||
| for (size_t j = 0; j < input_size; ++j) { | |||||
| auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j); | |||||
| auto input = kernel_with_index.first; | |||||
| auto index = kernel_with_index.second; | |||||
| auto addr = AnfAlgo::GetOutputAddr(input, index); | |||||
| ShapeVector int_shapes; | |||||
| GetDumpIntShape(input, index, trans_flag, NOT_NULL(&int_shapes)); | |||||
| auto type = AnfAlgo::GetOutputInferDataType(input, index); | |||||
| std::string file_path = dump_path + '/' + kernel_name + '_' + "input_" + std::to_string(j); | |||||
| if (IsDeviceTargetGPU()) { | |||||
| DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), trans_flag, int_shapes, type, slot, debugger); | |||||
| } else { | |||||
| DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type); | |||||
| } | |||||
| std::string tensor_name; | |||||
| size_t slot; | |||||
| if (IsDeviceTargetGPU()) { | |||||
| auto input_kernel = node->input(j + 1); | |||||
| std::string input_kernel_name = input_kernel->fullname_with_scope(); | |||||
| tensor_name = input_kernel_name; | |||||
| slot = 0; | |||||
| } else { | |||||
| tensor_name = node->fullname_with_scope(); | |||||
| slot = j; | |||||
| } | |||||
| ShapeVector int_shapes; | |||||
| GetDumpIntShape(input, index, trans_flag, NOT_NULL(&int_shapes)); | |||||
| auto type = AnfAlgo::GetOutputInferDataType(input, index); | |||||
| std::string file_path = dump_path + '/' + *kernel_name + '_' + "input_" + std::to_string(j); | |||||
| if (IsDeviceTargetGPU()) { | |||||
| DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), trans_flag, int_shapes, type, slot, debugger); | |||||
| } else { | |||||
| DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -34,6 +34,11 @@ class E2eDumpUtil { | |||||
| ~E2eDumpUtil() = default; | ~E2eDumpUtil() = default; | ||||
| static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, Debugger *debugger = nullptr); | static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, Debugger *debugger = nullptr); | ||||
| static void GetFileKernelName(NotNull<std::string *> kernel_name); | static void GetFileKernelName(NotNull<std::string *> kernel_name); | ||||
| // Dump the input/output data of a node when a task error occurs. | |||||
| static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, | |||||
| std::string *kernel_name, Debugger *debugger); | |||||
| static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, | |||||
| std::string *kernel_name, Debugger *debugger); | |||||
| private: | private: | ||||
| static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, Debugger *debugger); | static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, Debugger *debugger); | ||||
| @@ -22,6 +22,7 @@ | |||||
| #include <exception> | #include <exception> | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <thread> | #include <thread> | ||||
| #include "debug/data_dump/e2e_dump_util.h" | |||||
| #include "runtime/device/ascend/ascend_device_address.h" | #include "runtime/device/ascend/ascend_device_address.h" | ||||
| #include "runtime/device/cpu/mpi/mpi_interface.h" | #include "runtime/device/cpu/mpi/mpi_interface.h" | ||||
| #include "utils/ms_context.h" | #include "utils/ms_context.h" | ||||
| @@ -106,6 +107,7 @@ std::string GetRankId() { | |||||
| } | } | ||||
| } // namespace | } // namespace | ||||
| std::vector<rtExceptionInfo> AscendKernelRuntime::exception_infos_; | |||||
| AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } | AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } | ||||
| void AscendKernelRuntime::SetContext() { | void AscendKernelRuntime::SetContext() { | ||||
| @@ -268,6 +270,12 @@ bool AscendKernelRuntime::Init() { | |||||
| MS_EXCEPTION_IF_NULL(mem_manager_); | MS_EXCEPTION_IF_NULL(mem_manager_); | ||||
| mem_manager_->MallocDeviceMemory(); | mem_manager_->MallocDeviceMemory(); | ||||
| // Set the callback function for task failures. | |||||
| auto rt_ret = rtSetTaskFailCallback(ExceptionCallback); | |||||
| if (rt_ret != RT_ERROR_NONE) { | |||||
| MS_LOG(EXCEPTION) << "SetTaskFailCallback failed, error: " << rt_ret; | |||||
| } | |||||
| initialized_ = true; | initialized_ = true; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -507,11 +515,41 @@ void AscendKernelRuntime::LaunchDataDump(GraphId graph_id) { | |||||
| } | } | ||||
| } | } | ||||
| void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) { | |||||
| auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph_id); | |||||
| for (auto iter : runtime_info_map) { | |||||
| MS_LOG(WARNING) << "Task name:" << iter.first << " task_id:" << std::get<kTupleTaskId>(*iter.second) | |||||
| << " stream_id:" << std::get<kTupleStreamId>(*iter.second); | |||||
| void AscendKernelRuntime::ExceptionCallback(rtExceptionInfo *exception_info) { | |||||
| static std::mutex exception_mutex; | |||||
| std::lock_guard<std::mutex> lock(exception_mutex); | |||||
| exception_infos_.push_back(*exception_info); | |||||
| } | |||||
| void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) { | |||||
| MS_EXCEPTION_IF_NULL(graph); | |||||
| std::vector<std::string> full_scope_name{}; | |||||
| // Find node name(full scope name) | |||||
| auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph->graph_id()); | |||||
| MS_LOG(ERROR) << "Exception_infos_ size: " << exception_infos_.size() << ". first example: " | |||||
| << ", task_id: " << exception_infos_.at(0).taskid << ", stream_id: " << exception_infos_.at(0).streamid | |||||
| << ", tid: " << exception_infos_.at(0).tid << ", device_id: " << exception_infos_.at(0).deviceid; | |||||
| for (const auto &exception_info : exception_infos_) { | |||||
| for (const auto &iter : runtime_info_map) { | |||||
| auto task_id = std::get<kTupleTaskId>(*iter.second); | |||||
| auto stream_id = std::get<kTupleStreamId>(*iter.second); | |||||
| if (task_id == exception_info.taskid && stream_id == exception_info.streamid) { | |||||
| full_scope_name.push_back(iter.first); | |||||
| MS_LOG(ERROR) << "Node: " << iter.first << ", run task error."; | |||||
| } | |||||
| } | |||||
| } | |||||
| // Dump error data in local path | |||||
| const std::string local_path = std::string("./task_error_dump/") + std::to_string(exception_infos_.at(0).deviceid); | |||||
| for (const auto &node : graph->execution_order()) { | |||||
| for (auto &name : full_scope_name) { | |||||
| if (node->fullname_with_scope() == name) { | |||||
| MS_LOG(ERROR) << "Begin to dump node (" << name << ") task error input/output data in local path."; | |||||
| E2eDumpUtil::DumpInputImpl(node, false, local_path, &name, nullptr); | |||||
| E2eDumpUtil::DumpOutputImpl(node, false, local_path, &name, nullptr); | |||||
| } | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -621,8 +659,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { | |||||
| bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); | bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); | ||||
| if (!status) { | if (!status) { | ||||
| MS_LOG(ERROR) << "Run task failed"; | |||||
| DebugTaskIdName(graph->graph_id()); | |||||
| DumpTaskExceptionInfo(graph); | |||||
| return false; | return false; | ||||
| } | } | ||||
| return true; | return true; | ||||
| @@ -71,9 +71,10 @@ class AscendKernelRuntime : public KernelRuntime { | |||||
| void ReleaseDeviceRes() override; | void ReleaseDeviceRes() override; | ||||
| bool GraphWithEmptyTaskList(const session::KernelGraph *graph) const; | bool GraphWithEmptyTaskList(const session::KernelGraph *graph) const; | ||||
| bool CheckGraphIdValid(GraphId graph_id) const; | bool CheckGraphIdValid(GraphId graph_id) const; | ||||
| static void DebugTaskIdName(GraphId graph_id); | |||||
| void DistributeDebugTask(NotNull<const session::KernelGraph *> graph, NotNull<std::function<void *()>> model_handle); | void DistributeDebugTask(NotNull<const session::KernelGraph *> graph, NotNull<std::function<void *()>> model_handle); | ||||
| void LaunchDataDump(GraphId graph_id); | void LaunchDataDump(GraphId graph_id); | ||||
| static void DumpTaskExceptionInfo(const session::KernelGraph *graph); | |||||
| static void ExceptionCallback(rtExceptionInfo *exception_info); | |||||
| rtContext_t rt_context_{nullptr}; | rtContext_t rt_context_{nullptr}; | ||||
| rtContext_t rt_context_hccl_{nullptr}; | rtContext_t rt_context_hccl_{nullptr}; | ||||
| @@ -81,6 +82,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||||
| unordered_map<GraphId, vector<std::shared_ptr<TaskInfo>>> task_map_; | unordered_map<GraphId, vector<std::shared_ptr<TaskInfo>>> task_map_; | ||||
| unordered_map<GraphId, std::shared_ptr<ge::model_runner::DavinciModel>> graph_model_map_; | unordered_map<GraphId, std::shared_ptr<ge::model_runner::DavinciModel>> graph_model_map_; | ||||
| unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_; | unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_; | ||||
| static std::vector<rtExceptionInfo> exception_infos_; | |||||
| }; | }; | ||||
| MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); | MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); | ||||
| @@ -146,3 +146,5 @@ RTS_API rtError_t rtProfilerStop(uint64_t profConfig, int32_t numsDev, uint32_t* | |||||
// Stub implementation: performs no work and always returns 0.
int AdxDataDumpServerInit() {
  return 0;
}

// Stub implementation: performs no work and always returns 0.
int AdxDataDumpServerUnInit() {
  return 0;
}
| RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) {return RT_ERROR_NONE; } | |||||