| @@ -1 +1 @@ | |||
| Subproject commit 1b4f85776269f567d11153807ae7badc91803083 | |||
| Subproject commit 79e930c7b86f39426ccde9e7941b2bfbcaf2c1f1 | |||
| @@ -47,6 +47,7 @@ | |||
| #include "utils/shape_utils.h" | |||
| #include "utils/trace_base.h" | |||
| #include "graphengine/inc/external/acl/error_codes/rt_error_codes.h" | |||
| #include "utils/runtime_error_codes.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| #ifdef MEM_REUSE_DEBUG | |||
| #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" | |||
| @@ -112,9 +113,8 @@ std::string GetRankId() { | |||
| } | |||
| } // namespace | |||
| std::vector<rtTaskFailInfo> AscendKernelRuntime::task_fail_infoes_ = {}; | |||
| std::vector<rtExceptionInfo> AscendKernelRuntime::task_fail_infoes_ = {}; | |||
| uint32_t AscendKernelRuntime::current_graph_id_ = 0; | |||
| std::map<std::string, uint32_t> AscendKernelRuntime::overflow_tasks_; | |||
| AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } | |||
| void AscendKernelRuntime::SetContext() { | |||
| @@ -538,26 +538,24 @@ void AscendKernelRuntime::LaunchDataDump(GraphId graph_id) { | |||
| } | |||
| } | |||
| void AscendKernelRuntime::TaskFailCallback(rtTaskFailInfo *task_fail_info) { | |||
| void AscendKernelRuntime::TaskFailCallback(rtExceptionInfo *task_fail_info) { | |||
| MS_EXCEPTION_IF_NULL(task_fail_info); | |||
| static std::mutex exception_mutex; | |||
| std::lock_guard<std::mutex> lock(exception_mutex); | |||
| if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) { | |||
| auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid); | |||
| if (overflow_tasks_.find(key) == overflow_tasks_.end()) { | |||
| overflow_tasks_[key] = 1; | |||
| } | |||
| if (overflow_tasks_[key] == 5) { | |||
| auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); | |||
| MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name; | |||
| overflow_tasks_[key] = 0; | |||
| auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); | |||
| if (node_name.empty()) { | |||
| MS_LOG(WARNING) << "Can not get node by task id: " << task_fail_info->taskid | |||
| << ", stream id:" << task_fail_info->streamid; | |||
| } else { | |||
| overflow_tasks_[key]++; | |||
| MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name << "(task id: " << task_fail_info->taskid | |||
| << ", stream id:" << task_fail_info->streamid << ")"; | |||
| } | |||
| } else { | |||
| MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid | |||
| << ", stream_id: " << task_fail_info->streamid << ", tid: " << task_fail_info->tid | |||
| << ", device_id: " << task_fail_info->deviceid << ", retcode: " << task_fail_info->retcode; | |||
| << ", device_id: " << task_fail_info->deviceid | |||
| << ", error msg: " << GetErrorMsg(task_fail_info->retcode); | |||
| task_fail_infoes_.push_back(*task_fail_info); | |||
| } | |||
| } | |||
| @@ -586,7 +584,8 @@ void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *grap | |||
| } | |||
| for (const auto &node : graph->execution_order()) { | |||
| if (node->fullname_with_scope() == full_scope_name) { | |||
| MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << local_path | |||
| MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << local_path | |||
| << " Error msg: " << GetErrorMsg(task_fail_info.retcode) | |||
| << " trace: " << trace::DumpSourceLines(node); | |||
| E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr); | |||
| E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr); | |||
| @@ -689,7 +688,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { | |||
| bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); | |||
| if (!status) { | |||
| DumpTaskExceptionInfo(graph); | |||
| std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir"; | |||
| std::string file_name = "task_error_debug_" + std::to_string(current_graph_id_) + ".ir"; | |||
| auto graph_tmp = std::make_shared<session::KernelGraph>(*graph); | |||
| DumpIR(file_name, graph_tmp); | |||
| #ifdef ENABLE_TDTQUE | |||
| @@ -81,7 +81,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| void LaunchDataDump(GraphId graph_id); | |||
| static string GetErrorNodeName(uint32_t streamid, uint32_t taskid); | |||
| static void DumpTaskExceptionInfo(const session::KernelGraph *graph); | |||
| static void TaskFailCallback(rtTaskFailInfo *task_fail_info); | |||
| static void TaskFailCallback(rtExceptionInfo *task_fail_info); | |||
| void ReportProfilingData(); | |||
| rtContext_t rt_context_{nullptr}; | |||
| @@ -91,8 +91,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_; | |||
| std::map<std::pair<uint32_t, uint32_t>, std::string> stream_id_task_id_op_name_map_; | |||
| static uint32_t current_graph_id_; | |||
| static std::map<std::string, uint32_t> overflow_tasks_; | |||
| static std::vector<rtTaskFailInfo> task_fail_infoes_; | |||
| static std::vector<rtExceptionInfo> task_fail_infoes_; | |||
| }; | |||
| MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); | |||
| @@ -4,6 +4,10 @@ if (NOT ENABLE_GE) | |||
| file(GLOB_RECURSE _UTILS_GE_SRC_FILES ./callbacks_ge.cc) | |||
| list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_GE_SRC_FILES}) | |||
| endif () | |||
# When neither ENABLE_D nor ENABLE_TESTCASES is set, drop the globbed runtime error-code source from _UTILS_SRC_LIST.
# NOTE(review): the matching header is utils/runtime_error_codes.h (plural); confirm the source file is really
# named runtime_error_code.cc, otherwise this glob matches nothing and the file stays in the build.
| if (NOT ENABLE_D AND NOT ENABLE_TESTCASES) | |||
| file(GLOB_RECURSE _UTILS_D_SRC_FILES ./runtime_error_code.cc) | |||
| list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_D_SRC_FILES}) | |||
| endif() | |||
| set_property(SOURCE ${_UTILS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_UTILS) | |||
| add_library(_mindspore_utils_obj OBJECT ${_UTILS_SRC_LIST}) | |||
| @@ -0,0 +1,98 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "utils/runtime_error_codes.h" | |||
| #include <map> | |||
| #include <vector> | |||
| #include "graphengine/inc/external/runtime/rt_error_codes.h" | |||
| const std::map<uint32_t, std::string> error_msg = { | |||
| {ACL_RT_SUCCESS, "success"}, | |||
| {ACL_ERROR_RT_PARAM_INVALID, "param invalid"}, | |||
| {ACL_ERROR_RT_INVALID_DEVICEID, "invalid device id"}, | |||
| {ACL_ERROR_RT_CONTEXT_NULL, "current context null"}, | |||
| {ACL_ERROR_RT_STREAM_CONTEXT, "stream not in current context"}, | |||
| {ACL_ERROR_RT_MODEL_CONTEXT, "model not in current context"}, | |||
| {ACL_ERROR_RT_STREAM_MODEL, "stream not in model"}, | |||
| {ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID, "event timestamp invalid"}, | |||
| {ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL, " event timestamp reversal"}, | |||
| {ACL_ERROR_RT_ADDR_UNALIGNED, "memory address unaligned"}, | |||
| {ACL_ERROR_RT_FILE_OPEN, "open file failed"}, | |||
| {ACL_ERROR_RT_FILE_WRITE, "write file failed"}, | |||
| {ACL_ERROR_RT_STREAM_SUBSCRIBE, "error subscribe stream"}, | |||
| {ACL_ERROR_RT_THREAD_SUBSCRIBE, "error subscribe thread"}, | |||
| {ACL_ERROR_RT_GROUP_NOT_SET, "group not set"}, | |||
| {ACL_ERROR_RT_GROUP_NOT_CREATE, "group not create"}, | |||
| {ACL_ERROR_RT_STREAM_NO_CB_REG, "callback not register to stream"}, | |||
| {ACL_ERROR_RT_INVALID_MEMORY_TYPE, "invalid memory type"}, | |||
| {ACL_ERROR_RT_INVALID_HANDLE, "invalid handle"}, | |||
| {ACL_ERROR_RT_INVALID_MALLOC_TYPE, "invalid malloc type"}, | |||
| {ACL_ERROR_RT_FEATURE_NOT_SUPPORT, "feature not support"}, | |||
| {ACL_ERROR_RT_MEMORY_ALLOCATION, "memory allocation error"}, | |||
| {ACL_ERROR_RT_MEMORY_FREE, "memory free error"}, | |||
| {ACL_ERROR_RT_AICORE_OVER_FLOW, "aicore over flow"}, | |||
| {ACL_ERROR_RT_NO_DEVICE, "no device"}, | |||
| {ACL_ERROR_RT_RESOURCE_ALLOC_FAIL, "resource alloc fail"}, | |||
| {ACL_ERROR_RT_NO_PERMISSION, "no permission"}, | |||
| {ACL_ERROR_RT_NO_EVENT_RESOURCE, "no event resource"}, | |||
| {ACL_ERROR_RT_NO_STREAM_RESOURCE, "no stream resource"}, | |||
| {ACL_ERROR_RT_NO_NOTIFY_RESOURCE, "no notify resource"}, | |||
| {ACL_ERROR_RT_NO_MODEL_RESOURCE, "no model resource"}, | |||
| {ACL_ERROR_RT_INTERNAL_ERROR, "runtime internal error"}, | |||
| {ACL_ERROR_RT_TS_ERROR, "ts internel error"}, | |||
| {ACL_ERROR_RT_STREAM_TASK_FULL, "task full in stream"}, | |||
| {ACL_ERROR_RT_STREAM_TASK_EMPTY, " task empty in stream"}, | |||
| {ACL_ERROR_RT_STREAM_NOT_COMPLETE, "stream not complete"}, | |||
| {ACL_ERROR_RT_END_OF_SEQUENCE, "end of sequence"}, | |||
| {ACL_ERROR_RT_EVENT_NOT_COMPLETE, "event not complete"}, | |||
| {ACL_ERROR_RT_CONTEXT_RELEASE_ERROR, "context release error"}, | |||
| {ACL_ERROR_RT_SOC_VERSION, "soc version error"}, | |||
| {ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT, "task type not support"}, | |||
| {ACL_ERROR_RT_LOST_HEARTBEAT, "ts lost heartbeat"}, | |||
| {ACL_ERROR_RT_MODEL_EXECUTE, " model execute failed"}, | |||
| {ACL_ERROR_RT_REPORT_TIMEOUT, "report timeout"}, | |||
| {ACL_ERROR_RT_SYS_DMA, "sys dma error"}, | |||
| {ACL_ERROR_RT_AICORE_TIMEOUT, "aicore timeout"}, | |||
| {ACL_ERROR_RT_AICORE_EXCEPTION, "aicore exception"}, | |||
| {ACL_ERROR_RT_AICORE_TRAP_EXCEPTION, " aicore trap exception"}, | |||
| {ACL_ERROR_RT_AICPU_TIMEOUT, " aicpu timeout"}, | |||
| {ACL_ERROR_RT_AICPU_EXCEPTION, "aicpu exception"}, | |||
| {ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR, " aicpu datadump response error"}, | |||
| {ACL_ERROR_RT_AICPU_MODEL_RSP_ERR, "aicpu model operate response error"}, | |||
| {ACL_ERROR_RT_PROFILING_ERROR, "profiling error"}, | |||
| {ACL_ERROR_RT_IPC_ERROR, "ipc error"}, | |||
| {ACL_ERROR_RT_MODEL_ABORT_NORMAL, "model abort normal"}, | |||
| {ACL_ERROR_RT_KERNEL_UNREGISTERING, "kernel unregistering"}, | |||
| {ACL_ERROR_RT_RINGBUFFER_NOT_INIT, "ringbuffer not init"}, | |||
| {ACL_ERROR_RT_RINGBUFFER_NO_DATA, "ringbuffer no data"}, | |||
| {ACL_ERROR_RT_KERNEL_LOOKUP, "kernel lookup error"}, | |||
| {ACL_ERROR_RT_KERNEL_DUPLICATE, "kernel register duplicate"}, | |||
| {ACL_ERROR_RT_DEBUG_REGISTER_FAIL, "debug register failed"}, | |||
| {ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL, "debug unregister failed"}, | |||
| {ACL_ERROR_RT_LABEL_CONTEXT, "label not in current context"}, | |||
| {ACL_ERROR_RT_PROGRAM_USE_OUT, "program register num use out"}, | |||
| {ACL_ERROR_RT_DEV_SETUP_ERROR, "device setup error"}, | |||
| {ACL_ERROR_RT_DRV_INTERNAL_ERROR, "drv internal error"}, | |||
| }; | |||
| namespace mindspore { | |||
| std::string GetErrorMsg(uint32_t rt_error_code) { | |||
| auto find_iter = error_msg.find(rt_error_code); | |||
| if (find_iter == error_msg.end()) { | |||
| return "Return error code unknown, ret code: " + std::to_string(rt_error_code); | |||
| } | |||
| return find_iter->second; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,24 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#ifndef MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_
#define MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_

// <cstdint> supplies uint32_t so this header compiles regardless of what the includer pulled in first.
#include <cstdint>
#include <string>

namespace mindspore {
// Returns the human-readable description for a runtime return code.
// A code with no known description yields a fallback message that contains the numeric code.
std::string GetErrorMsg(uint32_t rt_error_code);
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_
| @@ -157,6 +157,6 @@ RTS_API rtError_t rtRegDeviceStateCallback(const char *regName, rtDeviceStateCal | |||
| RTS_API rtError_t rtSetMsprofReporterCallback(MsprofReporterCallback callback) {return RT_ERROR_NONE; } | |||
| RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallbackByModule callback) { | |||
| RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallback callback) { | |||
| return RT_ERROR_NONE; | |||
| } | |||