Browse Source

enable debugger by default and set correct log message severity

tags/v1.0.0
John Tzanakakis 5 years ago
parent
commit
b0a7ebdeb0
13 changed files with 106 additions and 95 deletions
  1. +3
    -4
      build.sh
  2. +0
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc
  3. +7
    -5
      mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
  4. +12
    -10
      mindspore/ccsrc/backend/session/ascend_session.cc
  5. +1
    -15
      mindspore/ccsrc/backend/session/cpu_session.cc
  6. +6
    -4
      mindspore/ccsrc/backend/session/gpu_session.cc
  7. +54
    -36
      mindspore/ccsrc/debug/debugger/debugger.cc
  8. +9
    -0
      mindspore/ccsrc/debug/debugger/debugger.h
  9. +1
    -1
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
  10. +10
    -8
      mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
  11. +2
    -0
      mindspore/ccsrc/runtime/device/kernel_runtime.cc
  12. +1
    -0
      mindspore/ccsrc/runtime/device/kernel_runtime.h
  13. +0
    -4
      mindspore/core/utils/ms_context.cc

+ 3
- 4
build.sh View File

@@ -56,7 +56,7 @@ usage()
echo " -K Compile with AKG, default on"
echo " -s Enable serving module, default off"
echo " -w Enable acl module, default off"
echo " -B Enable debugger, default off"
echo " -B Enable debugger, default on"
echo " -E Enable IBVERBS for parameter server, default off"
echo " -l Compile with python dependency, default on"
}
@@ -102,7 +102,7 @@ checkopts()
ENABLE_AKG="on"
ENABLE_SERVING="off"
ENABLE_ACL="off"
ENABLE_DEBUGGER="off"
ENABLE_DEBUGGER="on"
ENABLE_IBVERBS="off"
ENABLE_PYTHON="on"
ENABLE_GPU="off"
@@ -282,8 +282,7 @@ checkopts()
;;
B)
check_on_off $OPTARG B
ENABLE_DEBUGGER="on"
echo "enable debugger"
ENABLE_DEBUGGER="$OPTARG"
;;
E)
ENABLE_IBVERBS="on"


+ 0
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc View File

@@ -16,9 +16,6 @@
#include "backend/kernel_compiler/cpu/debug_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif

namespace mindspore {
namespace kernel {
@@ -39,11 +36,6 @@ bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
output[i] = val[i];
}

#ifdef ENABLE_DEBUGGER
// debugger will suspend execution if necessary
Debugger::GetInstance()->PostDebugOp();
#endif

return true;
}
} // namespace kernel


+ 7
- 5
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc View File

@@ -80,11 +80,13 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
MS_EXCEPTION_IF_NULL(kernel_prev);
#ifdef ENABLE_DEBUGGER
auto debugger_ = mindspore::Debugger::GetInstance();
DebugServices *debug_services = debugger_->debug_services();
auto watchpoint_table = debug_services->GetWatchpointTable();
std::string current_kernel_name = kernel_curr->scope_full_name();
if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
return false;
if (debugger_->DebuggerBackendEnabled()) {
DebugServices *debug_services = debugger_->debug_services();
auto watchpoint_table = debug_services->GetWatchpointTable();
std::string current_kernel_name = kernel_curr->scope_full_name();
if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
return false;
}
}
#endif
auto curr_stream_id = kernel_curr->stream_id();


+ 12
- 10
mindspore/ccsrc/backend/session/ascend_session.cc View File

@@ -605,16 +605,18 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
MS_LOG(INFO) << "Start!";
MS_EXCEPTION_IF_NULL(kernel_graph);
#ifdef ENABLE_DEBUGGER
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
DebugServices *debug_services = debugger_->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
// TensorData will be freed up here
tensor_loader->EmptyTensor();
uint32_t iter_num = tensor_loader->GetIterNum();
tensor_loader->set_iter_num(++iter_num);
(void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
tensor_loader->EmptyPrevTensor();
if (debugger_->DebuggerBackendEnabled()) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
DebugServices *debug_services = debugger_->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
// TensorData will be freed up here
tensor_loader->EmptyTensor();
uint32_t iter_num = tensor_loader->GetIterNum();
tensor_loader->set_iter_num(++iter_num);
(void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
tensor_loader->EmptyPrevTensor();
}
#endif
MS_LOG(INFO) << "Finish!";
}


+ 1
- 15
mindspore/ccsrc/backend/session/cpu_session.cc View File

@@ -26,9 +26,6 @@
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "frontend/parallel/ps/util.h"
#endif
@@ -112,12 +109,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
summary_outputs = kernel_graph->summary_nodes();
runtime_.IncreaseSummaryRefCount(summary_outputs);
}
#ifdef ENABLE_DEBUGGER
// debugger pre-execution processing
if (debugger_) {
debugger_->PreExecute(kernel_graph);
}
#endif

bool ret = runtime_.Run(kernel_graph.get(), false);
if (!ret) {
MS_LOG(EXCEPTION) << "Run graph failed";
@@ -128,12 +120,6 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
runtime_.DecreaseSummaryRefCount(summary_outputs);
}

#ifdef ENABLE_DEBUGGER
// debugger post-execution processing
if (debugger_) {
debugger_->PostExecute();
}
#endif
MS_LOG(INFO) << "Run graph end";
}



+ 6
- 4
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -351,10 +351,12 @@ void GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info
#ifdef ENABLE_DEBUGGER
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
#ifdef ENABLE_DUMP_E2E
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
(void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
if (debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
(void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
}
#endif
}



+ 54
- 36
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -80,25 +80,16 @@ void Debugger::EnableDebugger() {
grpc_client_ = nullptr;
debug_services_ = nullptr;

// see if dump is enabled
bool dump_enabled = false;
if (device_target_ == kGPUDevice) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
dump_enabled = runtime_instance->DumpDataEnabled();
}
// see if dump using debugger backend is enabled
bool dump_enabled = CheckDebuggerDumpEnabled();
MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;

// get env variables to configure debugger
const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
if (env_enable_str != nullptr) {
MS_LOG(INFO) << "Getenv ENABLE_MS_DEBUGGER: " << env_enable_str;
if (std::strcmp(env_enable_str, "1") == 0) {
debugger_enabled_ = true;
}
}
// check if debugger enabled
debugger_enabled_ = CheckDebuggerEnabled();
MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;

if (!debugger_enabled_ && !dump_enabled) {
MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
return;
}

@@ -109,7 +100,7 @@ void Debugger::EnableDebugger() {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
host = std::string(env_host_str);
} else {
MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
host = "localhost";
}
// configure grpc port
@@ -119,7 +110,7 @@ void Debugger::EnableDebugger() {
MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
port = std::string(env_port_str);
} else {
MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
port = "50051";
}

@@ -140,8 +131,8 @@ void Debugger::EnableDebugger() {
MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
"step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
} else {
MS_LOG(WARNING) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
"usage for large models.";
}
#ifdef ENABLE_D
// set operation overflow info
@@ -180,6 +171,29 @@ void Debugger::EnableDebugger() {
debug_services_ = std::make_unique<DebugServices>();
}

// Report whether dump through the debugger backend is enabled.
// Only the GPU device target is queried: the answer comes from the GPU kernel
// runtime's DumpDataEnabled(). Every other device target yields false.
bool Debugger::CheckDebuggerDumpEnabled() {
  if (device_target_ != kGPUDevice) {
    return false;
  }
  auto gpu_runtime = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  // the runtime lookup result is null-checked before it is dereferenced
  MS_EXCEPTION_IF_NULL(gpu_runtime);
  return gpu_runtime->DumpDataEnabled();
}

// Report whether the debugger was requested through the environment.
// Returns true only when ENABLE_MS_DEBUGGER is set and its value is exactly "1";
// an unset variable or any other value yields false.
bool Debugger::CheckDebuggerEnabled() {
  const char *enable_flag = std::getenv("ENABLE_MS_DEBUGGER");
  return enable_flag != nullptr && std::strcmp(enable_flag, "1") == 0;
}

// Check if any feature that uses the debugger backend is enabled:
// either dump through the debugger backend or the debugger itself.
bool Debugger::DebuggerBackendEnabled() {
  // the dump check runs first; the environment check is only reached when dump is off
  if (CheckDebuggerDumpEnabled()) {
    return true;
  }
  return CheckDebuggerEnabled();
}

void Debugger::Reset() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
@@ -201,25 +215,29 @@ void Debugger::Reset() {
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
// check and save graph_ptr, suspend if graph is new
CheckGraphPtr(graph_ptr);
if (debugger_->DebuggerBackendEnabled()) {
// check and save graph_ptr, suspend if graph is new
CheckGraphPtr(graph_ptr);
}
}

void Debugger::PostExecute() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
// analyze tensor data and send the watchpoints that have been hit
if (run_level_ == "node") {
MS_LOG(INFO) << "Debugger is in node level mode ";
return;
}
if (debugger_enabled_ && !is_dataset_graph_) {
if (device_target_ != kGPUDevice) {
num_step_++;
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
SendWatchpointsAndSuspend(CheckWatchpoints());
} else {
CommandLoop();
if (debugger_->DebuggerBackendEnabled()) {
// analyze tensor data and send the watchpoints that have been hit
if (run_level_ == "node") {
MS_LOG(INFO) << "Debugger is in node level mode ";
return;
}
if (debugger_enabled_ && !is_dataset_graph_) {
if (device_target_ != kGPUDevice) {
num_step_++;
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
SendWatchpointsAndSuspend(CheckWatchpoints());
} else {
CommandLoop();
}
}
}
}
@@ -302,8 +320,8 @@ void Debugger::CheckDatasetGraph() {
auto node_name = AnfAlgo::GetCNodeName(node);
MS_LOG(INFO) << "node: " << node->fullname_with_scope();
if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
MS_LOG(WARNING) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
<< node_name;
MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
<< node_name;
is_dataset_graph_ = true;
return;
}


+ 9
- 0
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -96,6 +96,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap();

// check if any feature that uses the debugger backend is enabled
bool DebuggerBackendEnabled();

private:
// private constructor for singleton
Debugger();
@@ -105,6 +108,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// read env variable for grpc client
void EnableDebugger();

// check if dump using debugger backend is enabled
bool CheckDebuggerDumpEnabled();

// check if debugger enabled
bool CheckDebuggerEnabled();

// check and save graph pointer
void CheckGraphPtr(const KernelGraphPtr &graph_ptr);



+ 1
- 1
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h View File

@@ -40,7 +40,7 @@ class AscendKernelRuntime : public KernelRuntime {
~AscendKernelRuntime() override;
bool Init() override;
bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
bool LoadData(session::KernelGraph *graph, Debugger *debugger);
bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
bool GenTask(const session::KernelGraph *graph);
bool LoadTask(const session::KernelGraph *graph);
bool RunTask(const session::KernelGraph *graph);


+ 10
- 8
mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc View File

@@ -97,14 +97,16 @@ void DataDumper::LoadDumpInfo() {
#ifdef ENABLE_DEBUGGER
auto debugger = mindspore::Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
// extract stream id, task id and opname from runtime_info_map for overflow detection
std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
[](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
-> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
});
if (debugger->DebuggerBackendEnabled()) {
std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
// extract stream id, task id and opname from runtime_info_map for overflow detection
std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
[](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
-> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
});
}
#endif
MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
}


+ 2
- 0
mindspore/ccsrc/runtime/device/kernel_runtime.cc View File

@@ -49,6 +49,8 @@ bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *d
return false;
}

// Base-class implementation of LoadData: neither argument is used and the
// result is always false.
bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) {
  return false;
}

bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) {
MS_EXCEPTION_IF_NULL(kernel);
if (AnfAlgo::OutputAddrExist(kernel, index)) {


+ 1
- 0
mindspore/ccsrc/runtime/device/kernel_runtime.h View File

@@ -59,6 +59,7 @@ class KernelRuntime {
bool DumpDataEnabled();
bool DumpDataEnabledIteration();
virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
bool LaunchKernel(const session::KernelGraph *graph);


+ 0
- 4
mindspore/core/utils/ms_context.cc View File

@@ -53,11 +53,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<bool>(MS_CTX_ENABLE_TASK_SINK, true);
set_param<bool>(MS_CTX_IR_FUSION_FLAG, true);
set_param<bool>(MS_CTX_ENABLE_HCCL, false);
#ifdef ENABLE_DEBUGGER
set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, false);
#else
set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, true);
#endif
set_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY, true);
set_param<bool>(MS_CTX_PRECOMPILE_ONLY, false);
set_param<bool>(MS_CTX_ENABLE_AUTO_MIXED_PRECISION, false);


Loading…
Cancel
Save