Browse Source

add overflow operator err msg

tags/v1.1.1
jjfeing yanghaoran 5 years ago
parent
commit
245dca4959
7 changed files with 144 additions and 20 deletions
  1. +1
    -1
      graphengine
  2. +14
    -15
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
  3. +2
    -3
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
  4. +4
    -0
      mindspore/ccsrc/utils/CMakeLists.txt
  5. +98
    -0
      mindspore/ccsrc/utils/runtime_error_code.cc
  6. +24
    -0
      mindspore/ccsrc/utils/runtime_error_codes.h
  7. +1
    -1
      tests/ut/cpp/stub/runtime/runtime_stub.cc

+ 1
- 1
graphengine

@@ -1 +1 @@
Subproject commit 1b4f85776269f567d11153807ae7badc91803083
Subproject commit 79e930c7b86f39426ccde9e7941b2bfbcaf2c1f1

+ 14
- 15
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc View File

@@ -47,6 +47,7 @@
#include "utils/shape_utils.h"
#include "utils/trace_base.h"
#include "graphengine/inc/external/acl/error_codes/rt_error_codes.h"
#include "utils/runtime_error_codes.h"
#include "debug/anf_ir_dump.h"
#ifdef MEM_REUSE_DEBUG
#include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
@@ -112,9 +113,8 @@ std::string GetRankId() {
}
} // namespace

std::vector<rtTaskFailInfo> AscendKernelRuntime::task_fail_infoes_ = {};
std::vector<rtExceptionInfo> AscendKernelRuntime::task_fail_infoes_ = {};
uint32_t AscendKernelRuntime::current_graph_id_ = 0;
std::map<std::string, uint32_t> AscendKernelRuntime::overflow_tasks_;
AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); }

void AscendKernelRuntime::SetContext() {
@@ -538,26 +538,24 @@ void AscendKernelRuntime::LaunchDataDump(GraphId graph_id) {
}
}

void AscendKernelRuntime::TaskFailCallback(rtTaskFailInfo *task_fail_info) {
void AscendKernelRuntime::TaskFailCallback(rtExceptionInfo *task_fail_info) {
MS_EXCEPTION_IF_NULL(task_fail_info);
static std::mutex exception_mutex;
std::lock_guard<std::mutex> lock(exception_mutex);
if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) {
auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid);
if (overflow_tasks_.find(key) == overflow_tasks_.end()) {
overflow_tasks_[key] = 1;
}
if (overflow_tasks_[key] == 5) {
auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid);
MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name;
overflow_tasks_[key] = 0;
auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid);
if (node_name.empty()) {
MS_LOG(WARNING) << "Can not get node by task id: " << task_fail_info->taskid
<< ", stream id:" << task_fail_info->streamid;
} else {
overflow_tasks_[key]++;
MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name << "(task id: " << task_fail_info->taskid
<< ", stream id:" << task_fail_info->streamid << ")";
}
} else {
MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid
<< ", stream_id: " << task_fail_info->streamid << ", tid: " << task_fail_info->tid
<< ", device_id: " << task_fail_info->deviceid << ", retcode: " << task_fail_info->retcode;
<< ", device_id: " << task_fail_info->deviceid
<< ", error msg: " << GetErrorMsg(task_fail_info->retcode);
task_fail_infoes_.push_back(*task_fail_info);
}
}
@@ -586,7 +584,8 @@ void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *grap
}
for (const auto &node : graph->execution_order()) {
if (node->fullname_with_scope() == full_scope_name) {
MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << local_path
MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/otput data to: " << local_path
<< " Error msg: " << GetErrorMsg(task_fail_info.retcode)
<< " trace: " << trace::DumpSourceLines(node);
E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr);
E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr);
@@ -689,7 +688,7 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) {
bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors);
if (!status) {
DumpTaskExceptionInfo(graph);
std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir";
std::string file_name = "task_error_debug_" + std::to_string(current_graph_id_) + ".ir";
auto graph_tmp = std::make_shared<session::KernelGraph>(*graph);
DumpIR(file_name, graph_tmp);
#ifdef ENABLE_TDTQUE


+ 2
- 3
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h View File

@@ -81,7 +81,7 @@ class AscendKernelRuntime : public KernelRuntime {
void LaunchDataDump(GraphId graph_id);
static string GetErrorNodeName(uint32_t streamid, uint32_t taskid);
static void DumpTaskExceptionInfo(const session::KernelGraph *graph);
static void TaskFailCallback(rtTaskFailInfo *task_fail_info);
static void TaskFailCallback(rtExceptionInfo *task_fail_info);
void ReportProfilingData();

rtContext_t rt_context_{nullptr};
@@ -91,8 +91,7 @@ class AscendKernelRuntime : public KernelRuntime {
unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_;
std::map<std::pair<uint32_t, uint32_t>, std::string> stream_id_task_id_op_name_map_;
static uint32_t current_graph_id_;
static std::map<std::string, uint32_t> overflow_tasks_;
static std::vector<rtTaskFailInfo> task_fail_infoes_;
static std::vector<rtExceptionInfo> task_fail_infoes_;
};

MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime);


+ 4
- 0
mindspore/ccsrc/utils/CMakeLists.txt View File

@@ -4,6 +4,10 @@ if (NOT ENABLE_GE)
file(GLOB_RECURSE _UTILS_GE_SRC_FILES ./callbacks_ge.cc)
list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_GE_SRC_FILES})
endif ()
# Remove runtime_error_code.cc from the utils source list when neither ENABLE_D
# nor ENABLE_TESTCASES is set, so it is only compiled for those builds.
# NOTE(review): presumably because the file includes the graphengine runtime
# error-code header, which is only available in those configurations - confirm.
if (NOT ENABLE_D AND NOT ENABLE_TESTCASES)
file(GLOB_RECURSE _UTILS_D_SRC_FILES ./runtime_error_code.cc)
list(REMOVE_ITEM _UTILS_SRC_LIST ${_UTILS_D_SRC_FILES})
endif()

set_property(SOURCE ${_UTILS_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_UTILS)
add_library(_mindspore_utils_obj OBJECT ${_UTILS_SRC_LIST})

+ 98
- 0
mindspore/ccsrc/utils/runtime_error_code.cc View File

@@ -0,0 +1,98 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/runtime_error_codes.h"
#include <map>
#include <vector>
#include "graphengine/inc/external/runtime/rt_error_codes.h"

const std::map<uint32_t, std::string> error_msg = {
{ACL_RT_SUCCESS, "success"},
{ACL_ERROR_RT_PARAM_INVALID, "param invalid"},
{ACL_ERROR_RT_INVALID_DEVICEID, "invalid device id"},
{ACL_ERROR_RT_CONTEXT_NULL, "current context null"},
{ACL_ERROR_RT_STREAM_CONTEXT, "stream not in current context"},
{ACL_ERROR_RT_MODEL_CONTEXT, "model not in current context"},
{ACL_ERROR_RT_STREAM_MODEL, "stream not in model"},
{ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID, "event timestamp invalid"},
{ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL, " event timestamp reversal"},
{ACL_ERROR_RT_ADDR_UNALIGNED, "memory address unaligned"},
{ACL_ERROR_RT_FILE_OPEN, "open file failed"},
{ACL_ERROR_RT_FILE_WRITE, "write file failed"},
{ACL_ERROR_RT_STREAM_SUBSCRIBE, "error subscribe stream"},
{ACL_ERROR_RT_THREAD_SUBSCRIBE, "error subscribe thread"},
{ACL_ERROR_RT_GROUP_NOT_SET, "group not set"},
{ACL_ERROR_RT_GROUP_NOT_CREATE, "group not create"},
{ACL_ERROR_RT_STREAM_NO_CB_REG, "callback not register to stream"},
{ACL_ERROR_RT_INVALID_MEMORY_TYPE, "invalid memory type"},
{ACL_ERROR_RT_INVALID_HANDLE, "invalid handle"},
{ACL_ERROR_RT_INVALID_MALLOC_TYPE, "invalid malloc type"},
{ACL_ERROR_RT_FEATURE_NOT_SUPPORT, "feature not support"},
{ACL_ERROR_RT_MEMORY_ALLOCATION, "memory allocation error"},
{ACL_ERROR_RT_MEMORY_FREE, "memory free error"},
{ACL_ERROR_RT_AICORE_OVER_FLOW, "aicore over flow"},
{ACL_ERROR_RT_NO_DEVICE, "no device"},
{ACL_ERROR_RT_RESOURCE_ALLOC_FAIL, "resource alloc fail"},
{ACL_ERROR_RT_NO_PERMISSION, "no permission"},
{ACL_ERROR_RT_NO_EVENT_RESOURCE, "no event resource"},
{ACL_ERROR_RT_NO_STREAM_RESOURCE, "no stream resource"},
{ACL_ERROR_RT_NO_NOTIFY_RESOURCE, "no notify resource"},
{ACL_ERROR_RT_NO_MODEL_RESOURCE, "no model resource"},
{ACL_ERROR_RT_INTERNAL_ERROR, "runtime internal error"},
{ACL_ERROR_RT_TS_ERROR, "ts internel error"},
{ACL_ERROR_RT_STREAM_TASK_FULL, "task full in stream"},
{ACL_ERROR_RT_STREAM_TASK_EMPTY, " task empty in stream"},
{ACL_ERROR_RT_STREAM_NOT_COMPLETE, "stream not complete"},
{ACL_ERROR_RT_END_OF_SEQUENCE, "end of sequence"},
{ACL_ERROR_RT_EVENT_NOT_COMPLETE, "event not complete"},
{ACL_ERROR_RT_CONTEXT_RELEASE_ERROR, "context release error"},
{ACL_ERROR_RT_SOC_VERSION, "soc version error"},
{ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT, "task type not support"},
{ACL_ERROR_RT_LOST_HEARTBEAT, "ts lost heartbeat"},
{ACL_ERROR_RT_MODEL_EXECUTE, " model execute failed"},
{ACL_ERROR_RT_REPORT_TIMEOUT, "report timeout"},
{ACL_ERROR_RT_SYS_DMA, "sys dma error"},
{ACL_ERROR_RT_AICORE_TIMEOUT, "aicore timeout"},
{ACL_ERROR_RT_AICORE_EXCEPTION, "aicore exception"},
{ACL_ERROR_RT_AICORE_TRAP_EXCEPTION, " aicore trap exception"},
{ACL_ERROR_RT_AICPU_TIMEOUT, " aicpu timeout"},
{ACL_ERROR_RT_AICPU_EXCEPTION, "aicpu exception"},
{ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR, " aicpu datadump response error"},
{ACL_ERROR_RT_AICPU_MODEL_RSP_ERR, "aicpu model operate response error"},
{ACL_ERROR_RT_PROFILING_ERROR, "profiling error"},
{ACL_ERROR_RT_IPC_ERROR, "ipc error"},
{ACL_ERROR_RT_MODEL_ABORT_NORMAL, "model abort normal"},
{ACL_ERROR_RT_KERNEL_UNREGISTERING, "kernel unregistering"},
{ACL_ERROR_RT_RINGBUFFER_NOT_INIT, "ringbuffer not init"},
{ACL_ERROR_RT_RINGBUFFER_NO_DATA, "ringbuffer no data"},
{ACL_ERROR_RT_KERNEL_LOOKUP, "kernel lookup error"},
{ACL_ERROR_RT_KERNEL_DUPLICATE, "kernel register duplicate"},
{ACL_ERROR_RT_DEBUG_REGISTER_FAIL, "debug register failed"},
{ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL, "debug unregister failed"},
{ACL_ERROR_RT_LABEL_CONTEXT, "label not in current context"},
{ACL_ERROR_RT_PROGRAM_USE_OUT, "program register num use out"},
{ACL_ERROR_RT_DEV_SETUP_ERROR, "device setup error"},
{ACL_ERROR_RT_DRV_INTERNAL_ERROR, "drv internal error"},
};

namespace mindspore {
// Translates a runtime return code into its message from the error_msg table.
// If the code has no entry, the returned text states that the code is unknown
// and includes its numeric value.
std::string GetErrorMsg(uint32_t rt_error_code) {
  const auto entry = error_msg.find(rt_error_code);
  if (entry != error_msg.end()) {
    return entry->second;
  }
  return "Return error code unknown, ret code: " + std::to_string(rt_error_code);
}
}  // namespace mindspore

+ 24
- 0
mindspore/ccsrc/utils/runtime_error_codes.h View File

@@ -0,0 +1,24 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_
#define MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_

// <cstdint> supplies uint32_t, so this header compiles regardless of what the
// including file pulled in before it.
#include <cstdint>
#include <string>
namespace mindspore {
// Returns a human-readable message for the runtime return code rt_error_code.
// For a code without a known message, the returned text says the code is
// unknown and contains its numeric value.
std::string GetErrorMsg(uint32_t rt_error_code);
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_UTILS_RUNTIME_ERROR_CODES_H_

+ 1
- 1
tests/ut/cpp/stub/runtime/runtime_stub.cc View File

@@ -157,6 +157,6 @@ RTS_API rtError_t rtRegDeviceStateCallback(const char *regName, rtDeviceStateCal

RTS_API rtError_t rtSetMsprofReporterCallback(MsprofReporterCallback callback) {return RT_ERROR_NONE; }

RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallbackByModule callback) {
RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallback callback) {
return RT_ERROR_NONE;
}

Loading…
Cancel
Save