From 245dca495960b593bcfc648bee80cd430d3371c4 Mon Sep 17 00:00:00 2001 From: jjfeing Date: Wed, 30 Dec 2020 09:03:30 +0800 Subject: [PATCH] add overflow operator err msg --- graphengine | 2 +- .../device/ascend/ascend_kernel_runtime.cc | 29 +++--- .../device/ascend/ascend_kernel_runtime.h | 5 +- mindspore/ccsrc/utils/CMakeLists.txt | 4 + mindspore/ccsrc/utils/runtime_error_code.cc | 98 +++++++++++++++++++ mindspore/ccsrc/utils/runtime_error_codes.h | 24 +++++ tests/ut/cpp/stub/runtime/runtime_stub.cc | 2 +- 7 files changed, 144 insertions(+), 20 deletions(-) create mode 100644 mindspore/ccsrc/utils/runtime_error_code.cc create mode 100644 mindspore/ccsrc/utils/runtime_error_codes.h diff --git a/graphengine b/graphengine index 1b4f857762..79e930c7b8 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit 1b4f85776269f567d11153807ae7badc91803083 +Subproject commit 79e930c7b86f39426ccde9e7941b2bfbcaf2c1f1 diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 890fad9b74..7aabb61bc3 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -47,6 +47,7 @@ #include "utils/shape_utils.h" #include "utils/trace_base.h" #include "graphengine/inc/external/acl/error_codes/rt_error_codes.h" +#include "utils/runtime_error_codes.h" #include "debug/anf_ir_dump.h" #ifdef MEM_REUSE_DEBUG #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" @@ -112,9 +113,8 @@ std::string GetRankId() { } } // namespace -std::vector AscendKernelRuntime::task_fail_infoes_ = {}; +std::vector AscendKernelRuntime::task_fail_infoes_ = {}; uint32_t AscendKernelRuntime::current_graph_id_ = 0; -std::map AscendKernelRuntime::overflow_tasks_; AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } void AscendKernelRuntime::SetContext() { @@ -538,26 +538,24 @@ void 
AscendKernelRuntime::LaunchDataDump(GraphId graph_id) { } } -void AscendKernelRuntime::TaskFailCallback(rtTaskFailInfo *task_fail_info) { +void AscendKernelRuntime::TaskFailCallback(rtExceptionInfo *task_fail_info) { MS_EXCEPTION_IF_NULL(task_fail_info); static std::mutex exception_mutex; std::lock_guard lock(exception_mutex); if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) { - auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid); - if (overflow_tasks_.find(key) == overflow_tasks_.end()) { - overflow_tasks_[key] = 1; - } - if (overflow_tasks_[key] == 5) { - auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); - MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name; - overflow_tasks_[key] = 0; + auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); + if (node_name.empty()) { + MS_LOG(WARNING) << "Can not get node by task id: " << task_fail_info->taskid + << ", stream id:" << task_fail_info->streamid; } else { - overflow_tasks_[key]++; + MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name << "(task id: " << task_fail_info->taskid + << ", stream id:" << task_fail_info->streamid << ")"; } } else { MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid << ", stream_id: " << task_fail_info->streamid << ", tid: " << task_fail_info->tid - << ", device_id: " << task_fail_info->deviceid << ", retcode: " << task_fail_info->retcode; + << ", device_id: " << task_fail_info->deviceid + << ", error msg: " << GetErrorMsg(task_fail_info->retcode); task_fail_infoes_.push_back(*task_fail_info); } } @@ -586,7 +584,8 @@ void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *grap } for (const auto &node : graph->execution_order()) { if (node->fullname_with_scope() == full_scope_name) { - MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error 
input/output data to: " << local_path + MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << local_path + << " Error msg: " << GetErrorMsg(task_fail_info.retcode) << " trace: " << trace::DumpSourceLines(node); E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr); E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr); @@ -689,7 +688,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); if (!status) { DumpTaskExceptionInfo(graph); - std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir"; + std::string file_name = "task_error_debug_" + std::to_string(current_graph_id_) + ".ir"; auto graph_tmp = std::make_shared(*graph); DumpIR(file_name, graph_tmp); #ifdef ENABLE_TDTQUE diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index bb4c2c9206..f2bd898151 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -81,7 +81,7 @@ class AscendKernelRuntime : public KernelRuntime { void LaunchDataDump(GraphId graph_id); static string GetErrorNodeName(uint32_t streamid, uint32_t taskid); static void DumpTaskExceptionInfo(const session::KernelGraph *graph); - static void TaskFailCallback(rtTaskFailInfo *task_fail_info); + static void TaskFailCallback(rtExceptionInfo *task_fail_info); void ReportProfilingData(); rtContext_t rt_context_{nullptr}; @@ -91,8 +91,7 @@ class AscendKernelRuntime : public KernelRuntime { unordered_map> graph_data_dumper_; std::map, std::string> stream_id_task_id_op_name_map_; static uint32_t current_graph_id_; - static std::map overflow_tasks_; - static std::vector task_fail_infoes_; + static std::vector task_fail_infoes_; }; 
MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); diff --git a/mindspore/ccsrc/utils/CMakeLists.txt b/mindspore/ccsrc/utils/CMakeLists.txt index 71d68729b9..421d8f4cb5 100644 --- a/mindspore/ccsrc/utils/CMakeLists.txt +++ b/mindspore/ccsrc/utils/CMakeLists.txt @@ -4,6 +4,10 @@ if (NOT ENABLE_GE) file(GLOB_RECURSE _UTILS_GE_SRC_FILES ./callbacks_ge.cc) list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_GE_SRC_FILES}) endif () +if (NOT ENABLE_D AND NOT ENABLE_TESTCASES) + file(GLOB_RECURSE _UTILS_D_SRC_FILES ./runtime_error_code.cc) + list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_D_SRC_FILES}) +endif() set_property(SOURCE ${_UTILS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_UTILS) add_library(_mindspore_utils_obj OBJECT ${_UTILS_SRC_LIST}) diff --git a/mindspore/ccsrc/utils/runtime_error_code.cc b/mindspore/ccsrc/utils/runtime_error_code.cc new file mode 100644 index 0000000000..8ee2919b95 --- /dev/null +++ b/mindspore/ccsrc/utils/runtime_error_code.cc @@ -0,0 +1,98 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "utils/runtime_error_codes.h" +#include +#include +#include "graphengine/inc/external/runtime/rt_error_codes.h" + +const std::map error_msg = { + {ACL_RT_SUCCESS, "success"}, + {ACL_ERROR_RT_PARAM_INVALID, "param invalid"}, + {ACL_ERROR_RT_INVALID_DEVICEID, "invalid device id"}, + {ACL_ERROR_RT_CONTEXT_NULL, "current context null"}, + {ACL_ERROR_RT_STREAM_CONTEXT, "stream not in current context"}, + {ACL_ERROR_RT_MODEL_CONTEXT, "model not in current context"}, + {ACL_ERROR_RT_STREAM_MODEL, "stream not in model"}, + {ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID, "event timestamp invalid"}, + {ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL, "event timestamp reversal"}, + {ACL_ERROR_RT_ADDR_UNALIGNED, "memory address unaligned"}, + {ACL_ERROR_RT_FILE_OPEN, "open file failed"}, + {ACL_ERROR_RT_FILE_WRITE, "write file failed"}, + {ACL_ERROR_RT_STREAM_SUBSCRIBE, "error subscribe stream"}, + {ACL_ERROR_RT_THREAD_SUBSCRIBE, "error subscribe thread"}, + {ACL_ERROR_RT_GROUP_NOT_SET, "group not set"}, + {ACL_ERROR_RT_GROUP_NOT_CREATE, "group not create"}, + {ACL_ERROR_RT_STREAM_NO_CB_REG, "callback not register to stream"}, + {ACL_ERROR_RT_INVALID_MEMORY_TYPE, "invalid memory type"}, + {ACL_ERROR_RT_INVALID_HANDLE, "invalid handle"}, + {ACL_ERROR_RT_INVALID_MALLOC_TYPE, "invalid malloc type"}, + {ACL_ERROR_RT_FEATURE_NOT_SUPPORT, "feature not support"}, + {ACL_ERROR_RT_MEMORY_ALLOCATION, "memory allocation error"}, + {ACL_ERROR_RT_MEMORY_FREE, "memory free error"}, + {ACL_ERROR_RT_AICORE_OVER_FLOW, "aicore over flow"}, + {ACL_ERROR_RT_NO_DEVICE, "no device"}, + {ACL_ERROR_RT_RESOURCE_ALLOC_FAIL, "resource alloc fail"}, + {ACL_ERROR_RT_NO_PERMISSION, "no permission"}, + {ACL_ERROR_RT_NO_EVENT_RESOURCE, "no event resource"}, + {ACL_ERROR_RT_NO_STREAM_RESOURCE, "no stream resource"}, + {ACL_ERROR_RT_NO_NOTIFY_RESOURCE, "no notify resource"}, + {ACL_ERROR_RT_NO_MODEL_RESOURCE, "no model resource"}, + {ACL_ERROR_RT_INTERNAL_ERROR, "runtime internal error"}, + 
{ACL_ERROR_RT_TS_ERROR, "ts internal error"}, + {ACL_ERROR_RT_STREAM_TASK_FULL, "task full in stream"}, + {ACL_ERROR_RT_STREAM_TASK_EMPTY, "task empty in stream"}, + {ACL_ERROR_RT_STREAM_NOT_COMPLETE, "stream not complete"}, + {ACL_ERROR_RT_END_OF_SEQUENCE, "end of sequence"}, + {ACL_ERROR_RT_EVENT_NOT_COMPLETE, "event not complete"}, + {ACL_ERROR_RT_CONTEXT_RELEASE_ERROR, "context release error"}, + {ACL_ERROR_RT_SOC_VERSION, "soc version error"}, + {ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT, "task type not support"}, + {ACL_ERROR_RT_LOST_HEARTBEAT, "ts lost heartbeat"}, + {ACL_ERROR_RT_MODEL_EXECUTE, "model execute failed"}, + {ACL_ERROR_RT_REPORT_TIMEOUT, "report timeout"}, + {ACL_ERROR_RT_SYS_DMA, "sys dma error"}, + {ACL_ERROR_RT_AICORE_TIMEOUT, "aicore timeout"}, + {ACL_ERROR_RT_AICORE_EXCEPTION, "aicore exception"}, + {ACL_ERROR_RT_AICORE_TRAP_EXCEPTION, "aicore trap exception"}, + {ACL_ERROR_RT_AICPU_TIMEOUT, "aicpu timeout"}, + {ACL_ERROR_RT_AICPU_EXCEPTION, "aicpu exception"}, + {ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR, "aicpu datadump response error"}, + {ACL_ERROR_RT_AICPU_MODEL_RSP_ERR, "aicpu model operate response error"}, + {ACL_ERROR_RT_PROFILING_ERROR, "profiling error"}, + {ACL_ERROR_RT_IPC_ERROR, "ipc error"}, + {ACL_ERROR_RT_MODEL_ABORT_NORMAL, "model abort normal"}, + {ACL_ERROR_RT_KERNEL_UNREGISTERING, "kernel unregistering"}, + {ACL_ERROR_RT_RINGBUFFER_NOT_INIT, "ringbuffer not init"}, + {ACL_ERROR_RT_RINGBUFFER_NO_DATA, "ringbuffer no data"}, + {ACL_ERROR_RT_KERNEL_LOOKUP, "kernel lookup error"}, + {ACL_ERROR_RT_KERNEL_DUPLICATE, "kernel register duplicate"}, + {ACL_ERROR_RT_DEBUG_REGISTER_FAIL, "debug register failed"}, + {ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL, "debug unregister failed"}, + {ACL_ERROR_RT_LABEL_CONTEXT, "label not in current context"}, + {ACL_ERROR_RT_PROGRAM_USE_OUT, "program register num use out"}, + {ACL_ERROR_RT_DEV_SETUP_ERROR, "device setup error"}, + {ACL_ERROR_RT_DRV_INTERNAL_ERROR, "drv internal error"}, +}; + +namespace 
mindspore { +std::string GetErrorMsg(uint32_t rt_error_code) { + auto find_iter = error_msg.find(rt_error_code); + if (find_iter == error_msg.end()) { + return "Return error code unknown, ret code: " + std::to_string(rt_error_code); + } + return find_iter->second; +} +} // namespace mindspore diff --git a/mindspore/ccsrc/utils/runtime_error_codes.h b/mindspore/ccsrc/utils/runtime_error_codes.h new file mode 100644 index 0000000000..556807b5a5 --- /dev/null +++ b/mindspore/ccsrc/utils/runtime_error_codes.h @@ -0,0 +1,24 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_ +#define MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_ + +#include +namespace mindspore { +std::string GetErrorMsg(uint32_t rt_error_code); +} // namespace mindspore +#endif // MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_ diff --git a/tests/ut/cpp/stub/runtime/runtime_stub.cc b/tests/ut/cpp/stub/runtime/runtime_stub.cc index 73ba1aed6b..5f5a9d99c9 100644 --- a/tests/ut/cpp/stub/runtime/runtime_stub.cc +++ b/tests/ut/cpp/stub/runtime/runtime_stub.cc @@ -157,6 +157,6 @@ RTS_API rtError_t rtRegDeviceStateCallback(const char *regName, rtDeviceStateCal RTS_API rtError_t rtSetMsprofReporterCallback(MsprofReporterCallback callback) {return RT_ERROR_NONE; } -RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallbackByModule callback) { +RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallback callback) { return RT_ERROR_NONE; }