|
|
|
@@ -19,18 +19,14 @@ |
|
|
|
#include <vector> |
|
|
|
#include <memory> |
|
|
|
#include <utility> |
|
|
|
#include <exception> |
|
|
|
#include <algorithm> |
|
|
|
#include <thread> |
|
|
|
#include "runtime/device/ascend/signal_util.h" |
|
|
|
#include "debug/data_dump/e2e_dump_util.h" |
|
|
|
#include "runtime/device/ascend/ascend_device_address.h" |
|
|
|
#include "runtime/device/cpu/mpi/mpi_interface.h" |
|
|
|
#include "utils/ms_context.h" |
|
|
|
#include "utils/context/context_extends.h" |
|
|
|
#include "utils/mpi/mpi_config.h" |
|
|
|
#include "runtime/device/ascend/profiling/profiling_manager.h" |
|
|
|
#include "hccl/hcom.h" |
|
|
|
#include "common/trans.h" |
|
|
|
#include "runtime/context.h" |
|
|
|
#include "runtime/device/ascend/ascend_label_assign.h" |
|
|
|
@@ -39,12 +35,9 @@ |
|
|
|
#include "runtime/device/ascend/tasksink/task_generator.h" |
|
|
|
#include "backend/session/anf_runtime_algorithm.h" |
|
|
|
#include "runtime/device/ascend/profiling/profiling_utils.h" |
|
|
|
#include "backend/kernel_compiler/tbe/tbe_utils.h" |
|
|
|
#include "runtime/device/ascend/ascend_memory_manager.h" |
|
|
|
#include "debug/tensor_load.h" |
|
|
|
#include "debug/data_dump/dump_json_parser.h" |
|
|
|
#include "toolchain/adx_datadump_server.h" |
|
|
|
#include "utils/shape_utils.h" |
|
|
|
#include "utils/trace_base.h" |
|
|
|
#include "graphengine/inc/external/acl/error_codes/rt_error_codes.h" |
|
|
|
#include "debug/anf_ir_dump.h" |
|
|
|
@@ -113,9 +106,12 @@ std::string GetRankId() { |
|
|
|
} // namespace |
|
|
|
|
|
|
|
// File-level state used by the task-failure handling code in this file. |
// NOTE(review): these rows come from a diff whose +/- markers are not visible; |
// current_graph_id_ and current_graph_ are presumably the removed and the added |
// line of the same change -- confirm against the real patch. |
// Failure records: appended by TaskFailCallback, iterated by DumpTaskExceptionInfo. |
std::vector<rtTaskFailInfo> AscendKernelRuntime::task_fail_infoes_ = {}; |
|
|
|
// Graph id recorded by RunTask before the model is run. |
uint32_t AscendKernelRuntime::current_graph_id_ = 0; |
|
|
|
// Graph pointer recorded by RunTask; reset to nullptr by the destructor and by |
// ReleaseDeviceRes. Unlike its neighbours it is defined without the |
// AscendKernelRuntime:: qualifier. |
const session::KernelGraph *current_graph_ = nullptr; |
|
|
|
// Per-task overflow counters, keyed by a string built from stream id and task id |
// (see TaskFailCallback). |
std::map<std::string, uint32_t> AscendKernelRuntime::overflow_tasks_; |
|
|
|
// Destructor. Two forms appear back to back in this diff hunk: |
//  - a one-line form that only clears graph_model_map_; |
//  - a multi-line form that also resets current_graph_ to nullptr. |
// NOTE(review): the hunk header (-113,9 +106,12) is consistent with the one-line |
// form being removed and the multi-line form being added; the +/- markers are |
// not visible here, so confirm against the real patch. |
AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } |
|
|
|
AscendKernelRuntime::~AscendKernelRuntime() { |
|
|
|
graph_model_map_.clear(); |
|
|
|
// Drop the graph pointer that RunTask stored. |
current_graph_ = nullptr; |
|
|
|
} |
|
|
|
|
|
|
|
void AscendKernelRuntime::SetContext() { |
|
|
|
if (rt_context_ == nullptr) { |
|
|
|
@@ -268,6 +264,7 @@ void AscendKernelRuntime::ReleaseDeviceRes() { |
|
|
|
(void)DestroyHccl(); |
|
|
|
(void)ResetDevice(device_id); |
|
|
|
(void)ProfilingManager::GetInstance().StopProfiling(); |
|
|
|
current_graph_ = nullptr; |
|
|
|
MS_LOG(INFO) << "Ascend finalize end"; |
|
|
|
} |
|
|
|
|
|
|
|
@@ -389,6 +386,7 @@ bool AscendKernelRuntime::GenDynamicKernel(const session::KernelGraph *graph) { |
|
|
|
} |
|
|
|
|
|
|
|
bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) { |
|
|
|
MS_EXCEPTION_IF_NULL(graph); |
|
|
|
InnerSetContext(); |
|
|
|
if (graph->is_dynamic_shape()) { |
|
|
|
if (ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE && (ConfigManager::GetInstance().iter_num() > 1)) { |
|
|
|
@@ -400,9 +398,6 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) { |
|
|
|
MS_LOG(INFO) << "Dynamic Shape Graph Generate Dynamic kernel"; |
|
|
|
return GenDynamicKernel(graph); |
|
|
|
} |
|
|
|
if (graph == nullptr) { |
|
|
|
MS_EXCEPTION(NotExistsError) << "session::KernelGraph is NULL!"; |
|
|
|
} |
|
|
|
MS_LOG(INFO) << "GenTask start. GraphId:" << graph->graph_id(); |
|
|
|
DumpJsonParser::GetInstance().UpdateNeedDumpKernels(NOT_NULL(graph)); |
|
|
|
#ifdef MEM_REUSE_DEBUG |
|
|
|
@@ -454,15 +449,13 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) { |
|
|
|
} |
|
|
|
|
|
|
|
bool AscendKernelRuntime::LoadTask(const session::KernelGraph *graph) { |
|
|
|
MS_EXCEPTION_IF_NULL(graph); |
|
|
|
InnerSetContext(); |
|
|
|
if (graph->is_dynamic_shape()) { |
|
|
|
MS_LOG(INFO) << "Dynamic Shape Graph Skip Load Task Step"; |
|
|
|
return true; |
|
|
|
} |
|
|
|
|
|
|
|
if (graph == nullptr) { |
|
|
|
MS_EXCEPTION(NotExistsError) << "Null pointer graph, LoadTask failed. "; |
|
|
|
} |
|
|
|
MS_LOG(INFO) << "LoadTask start. GraphId:" << graph->graph_id(); |
|
|
|
if (GraphWithEmptyTaskList(graph)) { |
|
|
|
MS_LOG(WARNING) << "LoadTask end, task list is empty"; |
|
|
|
@@ -508,7 +501,7 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph *graph) { |
|
|
|
} |
|
|
|
|
|
|
|
void AscendKernelRuntime::DistributeDebugTask(NotNull<const session::KernelGraph *> graph, |
|
|
|
NotNull<std::function<void *()>> model_handle) { |
|
|
|
const NotNull<std::function<void *()>> &model_handle) { |
|
|
|
if (!DumpJsonParser::GetInstance().async_dump_enabled()) { |
|
|
|
return; |
|
|
|
} |
|
|
|
@@ -543,55 +536,64 @@ void AscendKernelRuntime::TaskFailCallback(rtTaskFailInfo *task_fail_info) { |
|
|
|
static std::mutex exception_mutex; |
|
|
|
std::lock_guard<std::mutex> lock(exception_mutex); |
|
|
|
if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) { |
|
|
|
auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid); |
|
|
|
if (overflow_tasks_.find(key) == overflow_tasks_.end()) { |
|
|
|
overflow_tasks_[key] = 1; |
|
|
|
} |
|
|
|
if (overflow_tasks_[key] == 5) { |
|
|
|
auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); |
|
|
|
MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name; |
|
|
|
overflow_tasks_[key] = 0; |
|
|
|
auto node = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); |
|
|
|
|
|
|
|
if (!node) { |
|
|
|
MS_LOG(WARNING) << "Node run task overflow, node name is unknown."; |
|
|
|
} else { |
|
|
|
overflow_tasks_[key]++; |
|
|
|
auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid) + |
|
|
|
std::to_string(current_graph_->graph_id()); |
|
|
|
if (overflow_tasks_.find(key) == overflow_tasks_.end() || overflow_tasks_[key] == 5) { |
|
|
|
// print overflow info |
|
|
|
MS_LOG(WARNING) << "Node run task overflow, node name: " << node->fullname_with_scope() |
|
|
|
<< "Task overflow infos task_id: " << task_fail_info->taskid |
|
|
|
<< ", stream_id: " << task_fail_info->streamid << ", tid: " << task_fail_info->tid |
|
|
|
<< ", device_id: " << task_fail_info->deviceid << ", retcode: " << task_fail_info->retcode |
|
|
|
<< ", trace: " << trace::DumpSourceLines(node); |
|
|
|
overflow_tasks_[key] = 1; |
|
|
|
} else { |
|
|
|
overflow_tasks_[key]++; |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid |
|
|
|
<< ", stream_id: " << task_fail_info->streamid << ", tid: " << task_fail_info->tid |
|
|
|
<< ", device_id: " << task_fail_info->deviceid << ", retcode: " << task_fail_info->retcode; |
|
|
|
task_fail_infoes_.push_back(*task_fail_info); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// Maps a (stream id, task id) pair back to the kernel that issued the task. |
// |
// Two variants are interleaved in this diff hunk (the +/- markers are not visible): |
//  - a `string` variant that reads the runtime info map for current_graph_id_, |
//    logs the matching entry and returns its key, or "" when nothing matches; |
//  - a `CNodePtr` variant that reads the map for current_graph_->graph_id(), |
//    then looks the matching key up in current_graph_->execution_order() and |
//    returns that node, or nullptr when nothing matches. |
// NOTE(review): the string variant is presumably the removed one -- confirm. |
// |
// @param streamid stream id to match against each map entry. |
// @param taskid   task id to match against each map entry. |
string AscendKernelRuntime::GetErrorNodeName(uint32_t streamid, uint32_t taskid) { |
|
|
|
auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(AscendKernelRuntime::current_graph_id_); |
|
|
|
CNodePtr AscendKernelRuntime::GetErrorNodeName(uint32_t streamid, uint32_t taskid) { |
|
|
|
// NOTE(review): current_graph_ is dereferenced without a null check in this block; |
// it is assigned in RunTask -- confirm every caller runs after that. |
auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(current_graph_->graph_id()); |
|
|
|
for (const auto &iter : runtime_info_map) { |
|
|
|
auto task_id = std::get<kTupleTaskId>(*iter.second); |
|
|
|
auto stream_id = std::get<kTupleStreamId>(*iter.second); |
|
|
|
if (task_id == taskid && stream_id == streamid) { |
|
|
|
// string variant: log the entry and return the map key. |
MS_LOG(ERROR) << "Node: " << iter.first << ", run task error."; |
|
|
|
return iter.first; |
|
|
|
// CNodePtr variant: find the node in the execution order whose |
// fullname_with_scope() equals the map key. |
auto &execute_node = current_graph_->execution_order(); |
|
|
|
auto node = std::find_if(execute_node.begin(), execute_node.end(), |
|
|
|
[&iter](const auto &node) { return node->fullname_with_scope() == iter.first; }); |
|
|
|
// If no node carries that name, the loop moves on to the next map entry. |
if (node != execute_node.end()) { |
|
|
|
return *node; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
// Nothing matched: "" in the string variant, nullptr in the CNodePtr variant. |
return ""; |
|
|
|
return nullptr; |
|
|
|
} |
|
|
|
|
|
|
|
// Logs every recorded task failure and dumps the input/output data of the |
// kernel behind each one under ./task_error_dump/<device id>. |
// |
// Two variants of the loop body are interleaved in this diff hunk (the +/- |
// markers are not visible): |
//  - one takes a name from GetErrorNodeName and scans graph->execution_order() |
//    for the node carrying that name; |
//  - one takes the node itself from GetErrorNodeName and dumps it directly. |
// NOTE(review): the name-based variant is presumably the removed one -- confirm. |
// |
// @param graph kernel graph; checked with MS_EXCEPTION_IF_NULL on entry. |
void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) { |
|
|
|
MS_EXCEPTION_IF_NULL(graph); |
|
|
|
// The dump directory is named after the device id of the first failure record. |
// std::vector::at(0) throws std::out_of_range when task_fail_infoes_ is empty. |
// NOTE(review): presumably only reached after a failure was recorded -- confirm. |
const std::string local_path = std::string("./task_error_dump/") + std::to_string(task_fail_infoes_.at(0).deviceid); |
|
|
|
for (const auto &task_fail_info : task_fail_infoes_) { |
|
|
|
// Name-based variant: resolve the failing kernel's name. |
auto full_scope_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info.streamid, task_fail_info.taskid); |
|
|
|
MS_LOG(ERROR) << "Task fail infos task_id: " << task_fail_info.taskid << ", stream_id: " << task_fail_info.streamid |
|
|
|
<< ", tid: " << task_fail_info.tid << ", device_id: " << task_fail_info.deviceid |
|
|
|
<< ", retcode: " << task_fail_info.retcode; |
|
|
|
// Node-based variant: resolve the failing kernel's node. |
auto node = AscendKernelRuntime::GetErrorNodeName(task_fail_info.streamid, task_fail_info.taskid); |
|
|
|
// Dump error data in local path |
|
|
|
// Skip failures that could not be mapped to a kernel (empty name / null node). |
if (full_scope_name.empty()) { |
|
|
|
if (node == nullptr) { |
|
|
|
continue; |
|
|
|
} |
|
|
|
// Name-based variant: scan the execution order and dump every node with that name. |
for (const auto &node : graph->execution_order()) { |
|
|
|
if (node->fullname_with_scope() == full_scope_name) { |
|
|
|
MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << local_path |
|
|
|
<< " trace: " << trace::DumpSourceLines(node); |
|
|
|
E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr); |
|
|
|
E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr); |
|
|
|
} |
|
|
|
} |
|
|
|
// Node-based variant: take the name from the node and dump it directly. |
auto full_scope_name = node->fullname_with_scope(); |
|
|
|
MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << local_path |
|
|
|
<< " trace: " << trace::DumpSourceLines(node); |
|
|
|
E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr); |
|
|
|
E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
@@ -662,7 +664,7 @@ bool AscendKernelRuntime::RunDynamicKernelAsync(const session::KernelGraph *grap |
|
|
|
} |
|
|
|
|
|
|
|
bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { |
|
|
|
current_graph_id_ = graph->graph_id(); |
|
|
|
current_graph_ = graph; |
|
|
|
InnerSetContext(); |
|
|
|
MS_EXCEPTION_IF_NULL(graph); |
|
|
|
if (graph->is_dynamic_shape()) { |
|
|
|
@@ -689,7 +691,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { |
|
|
|
bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); |
|
|
|
if (!status) { |
|
|
|
DumpTaskExceptionInfo(graph); |
|
|
|
std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir"; |
|
|
|
std::string file_name = "task_error_debug" + std::to_string(graph->graph_id()) + ".ir"; |
|
|
|
auto graph_tmp = std::make_shared<session::KernelGraph>(*graph); |
|
|
|
DumpIR(file_name, graph_tmp); |
|
|
|
#ifdef ENABLE_TDTQUE |
|
|
|
|