Browse Source

!8030 dump node input/output when a task error occurs

Merge pull request !8030 from jjfeing/master
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
17bc7ffca6
5 changed files with 107 additions and 49 deletions
  1. +53
    -41
      mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc
  2. +5
    -0
      mindspore/ccsrc/debug/data_dump/e2e_dump_util.h
  3. +44
    -7
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
  4. +3
    -1
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
  5. +2
    -0
      tests/ut/cpp/stub/runtime/runtime_stub.cc

+ 53
- 41
mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc View File

@@ -112,20 +112,26 @@ void E2eDumpUtil::DumpOutput(const session::KernelGraph *graph, const std::strin
continue;
}
DumpJsonParser::GetInstance().MatchKernel(kernel_name);
GetFileKernelName(NOT_NULL(&kernel_name));
auto output_size = AnfAlgo::GetOutputTensorNum(node);
for (size_t j = 0; j < output_size; ++j) {
auto addr = AnfAlgo::GetOutputAddr(node, j);
ShapeVector int_shapes;
GetDumpIntShape(node, j, trans_flag, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(node, j);
std::string file_path = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(file_path, node->fullname_with_scope(), NOT_NULL(addr), trans_flag, int_shapes, type, j,
debugger);
} else {
DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
}
DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
}
}

void E2eDumpUtil::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(node);
GetFileKernelName(NOT_NULL(kernel_name));
auto output_size = AnfAlgo::GetOutputTensorNum(node);
for (size_t j = 0; j < output_size; ++j) {
auto addr = AnfAlgo::GetOutputAddr(node, j);
ShapeVector int_shapes;
GetDumpIntShape(node, j, trans_flag, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(node, j);
std::string file_path = dump_path + '/' + *kernel_name + '_' + "output_" + std::to_string(j);
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(file_path, node->fullname_with_scope(), NOT_NULL(addr), trans_flag, int_shapes, type, j,
debugger);
} else {
DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
}
}
}
@@ -147,35 +153,41 @@ void E2eDumpUtil::DumpInput(const session::KernelGraph *graph, const std::string
continue;
}
DumpJsonParser::GetInstance().MatchKernel(kernel_name);
GetFileKernelName(NOT_NULL(&kernel_name));
auto input_size = AnfAlgo::GetInputTensorNum(node);
for (size_t j = 0; j < input_size; ++j) {
auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
auto input = kernel_with_index.first;
auto index = kernel_with_index.second;
auto addr = AnfAlgo::GetOutputAddr(input, index);
DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
}
}

std::string tensor_name;
size_t slot;
if (IsDeviceTargetGPU()) {
auto input_kernel = node->input(j + 1);
std::string input_kernel_name = input_kernel->fullname_with_scope();
tensor_name = input_kernel_name;
slot = 0;
} else {
tensor_name = node->fullname_with_scope();
slot = j;
}
void E2eDumpUtil::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(node);
GetFileKernelName(NOT_NULL(kernel_name));
auto input_size = AnfAlgo::GetInputTensorNum(node);
for (size_t j = 0; j < input_size; ++j) {
auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
auto input = kernel_with_index.first;
auto index = kernel_with_index.second;
auto addr = AnfAlgo::GetOutputAddr(input, index);

ShapeVector int_shapes;
GetDumpIntShape(input, index, trans_flag, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(input, index);
std::string file_path = dump_path + '/' + kernel_name + '_' + "input_" + std::to_string(j);
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), trans_flag, int_shapes, type, slot, debugger);
} else {
DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
}
std::string tensor_name;
size_t slot;
if (IsDeviceTargetGPU()) {
auto input_kernel = node->input(j + 1);
std::string input_kernel_name = input_kernel->fullname_with_scope();
tensor_name = input_kernel_name;
slot = 0;
} else {
tensor_name = node->fullname_with_scope();
slot = j;
}

ShapeVector int_shapes;
GetDumpIntShape(input, index, trans_flag, NOT_NULL(&int_shapes));
auto type = AnfAlgo::GetOutputInferDataType(input, index);
std::string file_path = dump_path + '/' + *kernel_name + '_' + "input_" + std::to_string(j);
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(file_path, tensor_name, NOT_NULL(addr), trans_flag, int_shapes, type, slot, debugger);
} else {
DumpMemToFile(file_path, NOT_NULL(addr), trans_flag, int_shapes, type);
}
}
}


+ 5
- 0
mindspore/ccsrc/debug/data_dump/e2e_dump_util.h View File

@@ -34,6 +34,11 @@ class E2eDumpUtil {
~E2eDumpUtil() = default;
static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, Debugger *debugger = nullptr);
static void GetFileKernelName(NotNull<std::string *> kernel_name);
// Dump data when task error.
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, Debugger *debugger);
static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, Debugger *debugger);

private:
static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, Debugger *debugger);


+ 44
- 7
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc View File

@@ -22,6 +22,7 @@
#include <exception>
#include <algorithm>
#include <thread>
#include "debug/data_dump/e2e_dump_util.h"
#include "runtime/device/ascend/ascend_device_address.h"
#include "runtime/device/cpu/mpi/mpi_interface.h"
#include "utils/ms_context.h"
@@ -106,6 +107,7 @@ std::string GetRankId() {
}
} // namespace

std::vector<rtExceptionInfo> AscendKernelRuntime::exception_infos_;
AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); }

void AscendKernelRuntime::SetContext() {
@@ -268,6 +270,12 @@ bool AscendKernelRuntime::Init() {
MS_EXCEPTION_IF_NULL(mem_manager_);
mem_manager_->MallocDeviceMemory();

// Set callback func when exception error
auto rt_ret = rtSetTaskFailCallback(ExceptionCallback);
if (rt_ret != RT_ERROR_NONE) {
MS_LOG(EXCEPTION) << "SetTaskFailCallback failed, error: " << rt_ret;
}

initialized_ = true;
return ret;
}
@@ -507,11 +515,41 @@ void AscendKernelRuntime::LaunchDataDump(GraphId graph_id) {
}
}

void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph_id);
for (auto iter : runtime_info_map) {
MS_LOG(WARNING) << "Task name:" << iter.first << " task_id:" << std::get<kTupleTaskId>(*iter.second)
<< " stream_id:" << std::get<kTupleStreamId>(*iter.second);
// Callback registered with the Ascend runtime (rtSetTaskFailCallback); invoked
// when a task fails, possibly on a runtime-owned thread, so access to the
// shared exception_infos_ vector is serialized with a mutex.
void AscendKernelRuntime::ExceptionCallback(rtExceptionInfo *exception_info) {
  static std::mutex exception_mutex;
  // The runtime hands us a raw pointer; guard against a null argument instead
  // of dereferencing it unconditionally.
  if (exception_info == nullptr) {
    MS_LOG(ERROR) << "Task fail callback received a null exception info.";
    return;
  }
  std::lock_guard<std::mutex> lock(exception_mutex);
  exception_infos_.push_back(*exception_info);
}

// After RunModel fails, match the recorded task exceptions against the graph's
// runtime info to find the failing node(s), then dump their input/output data
// to ./task_error_dump/<device_id>.
void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  // The fail callback may never have fired (e.g. a non-task failure); without
  // this guard, exception_infos_.at(0) below would throw std::out_of_range.
  if (exception_infos_.empty()) {
    MS_LOG(ERROR) << "Run task error, but no task exception info was reported by the runtime.";
    return;
  }
  std::vector<std::string> full_scope_name{};
  // Find node name(full scope name)
  auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph->graph_id());
  const auto &first_info = exception_infos_.at(0);
  MS_LOG(ERROR) << "Exception_infos_ size: " << exception_infos_.size() << ". first example: "
                << ", task_id: " << first_info.taskid << ", stream_id: " << first_info.streamid
                << ", tid: " << first_info.tid << ", device_id: " << first_info.deviceid;

  for (const auto &exception_info : exception_infos_) {
    for (const auto &iter : runtime_info_map) {
      auto task_id = std::get<kTupleTaskId>(*iter.second);
      auto stream_id = std::get<kTupleStreamId>(*iter.second);
      if (task_id == exception_info.taskid && stream_id == exception_info.streamid) {
        full_scope_name.push_back(iter.first);
        MS_LOG(ERROR) << "Node: " << iter.first << ", run task error.";
      }
    }
  }
  // Dump error data in local path
  const std::string local_path = std::string("./task_error_dump/") + std::to_string(first_info.deviceid);
  for (const auto &node : graph->execution_order()) {
    for (const auto &name : full_scope_name) {
      if (node->fullname_with_scope() == name) {
        MS_LOG(ERROR) << "Begin to dump node (" << name << ") task error input/output data in local path.";
        // Dump*Impl normalizes the kernel name in place (GetFileKernelName);
        // pass a fresh copy each time so the stored full-scope name stays
        // intact for matching the remaining nodes.
        std::string file_kernel_name = name;
        E2eDumpUtil::DumpInputImpl(node, false, local_path, &file_kernel_name, nullptr);
        file_kernel_name = name;
        E2eDumpUtil::DumpOutputImpl(node, false, local_path, &file_kernel_name, nullptr);
      }
    }
  }
}

@@ -621,8 +659,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) {

bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors);
if (!status) {
MS_LOG(ERROR) << "Run task failed";
DebugTaskIdName(graph->graph_id());
DumpTaskExceptionInfo(graph);
return false;
}
return true;


+ 3
- 1
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h View File

@@ -71,9 +71,10 @@ class AscendKernelRuntime : public KernelRuntime {
void ReleaseDeviceRes() override;
bool GraphWithEmptyTaskList(const session::KernelGraph *graph) const;
bool CheckGraphIdValid(GraphId graph_id) const;
static void DebugTaskIdName(GraphId graph_id);
void DistributeDebugTask(NotNull<const session::KernelGraph *> graph, NotNull<std::function<void *()>> model_handle);
void LaunchDataDump(GraphId graph_id);
static void DumpTaskExceptionInfo(const session::KernelGraph *graph);
static void ExceptionCallback(rtExceptionInfo *exception_info);

rtContext_t rt_context_{nullptr};
rtContext_t rt_context_hccl_{nullptr};
@@ -81,6 +82,7 @@ class AscendKernelRuntime : public KernelRuntime {
unordered_map<GraphId, vector<std::shared_ptr<TaskInfo>>> task_map_;
unordered_map<GraphId, std::shared_ptr<ge::model_runner::DavinciModel>> graph_model_map_;
unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_;
static std::vector<rtExceptionInfo> exception_infos_;
};

MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime);


+ 2
- 0
tests/ut/cpp/stub/runtime/runtime_stub.cc View File

@@ -146,3 +146,5 @@ RTS_API rtError_t rtProfilerStop(uint64_t profConfig, int32_t numsDev, uint32_t*
int AdxDataDumpServerInit() { return 0; }

int AdxDataDumpServerUnInit() { return 0; }

// Unit-test stub: accept the callback registration and report success.
RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) { return RT_ERROR_NONE; }

Loading…
Cancel
Save