@@ -56,7 +56,7 @@ usage()
  echo " -K Compile with AKG, default on"
  echo " -s Enable serving module, default off"
  echo " -w Enable acl module, default off"
  echo " -B Enable debugger, default off"
  echo " -B Enable debugger, default on"
  echo " -E Enable IBVERBS for parameter server, default off"
  echo " -l Compile with python dependency, default on"
}
@@ -102,7 +102,7 @@ checkopts()
  ENABLE_AKG="on"
  ENABLE_SERVING="off"
  ENABLE_ACL="off"
  ENABLE_DEBUGGER="off"
  ENABLE_DEBUGGER="on"
  ENABLE_IBVERBS="off"
  ENABLE_PYTHON="on"
  ENABLE_GPU="off"
@@ -282,8 +282,7 @@ checkopts()
      ;;
    B)
      check_on_off $OPTARG B
      ENABLE_DEBUGGER="on"
      echo "enable debugger"
      ENABLE_DEBUGGER="$OPTARG"
      ;;
    E)
      ENABLE_IBVERBS="on"
@@ -16,9 +16,6 @@
#include "backend/kernel_compiler/cpu/debug_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace kernel {
@@ -39,11 +36,6 @@ bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
    output[i] = val[i];
  }
#ifdef ENABLE_DEBUGGER
  // debugger will suspend execution if necessary
  Debugger::GetInstance()->PostDebugOp();
#endif
  return true;
}
} // namespace kernel
@@ -80,11 +80,13 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
  MS_EXCEPTION_IF_NULL(kernel_prev);
#ifdef ENABLE_DEBUGGER
  auto debugger_ = mindspore::Debugger::GetInstance();
  DebugServices *debug_services = debugger_->debug_services();
  auto watchpoint_table = debug_services->GetWatchpointTable();
  std::string current_kernel_name = kernel_curr->scope_full_name();
  if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
    return false;
  if (debugger_->DebuggerBackendEnabled()) {
    DebugServices *debug_services = debugger_->debug_services();
    auto watchpoint_table = debug_services->GetWatchpointTable();
    std::string current_kernel_name = kernel_curr->scope_full_name();
    if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
      return false;
    }
  }
#endif
  auto curr_stream_id = kernel_curr->stream_id();
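Since ENABLE_DEBUGGER is now compiled in by default, call sites that touch `debug_services()` are wrapped in a runtime `DebuggerBackendEnabled()` check, because the services object only exists once the debugger or debugger-based dump is actually turned on. Below is a standalone sketch of that guard pattern using mock types (not the MindSpore classes), just to show why the check must come before the dereference:

```cpp
// Standalone sketch (mock types, not MindSpore code) of the runtime-guard pattern:
// when the debugger is always compiled in, its services may be null until a backend
// feature is enabled, so callers must check before dereferencing.
#include <iostream>
#include <memory>
#include <string>

class MockDebugServices {
 public:
  bool IsWatchPoint(const std::string &name) const { return name == "Conv2D-op1"; }
};

class MockDebugger {
 public:
  // Backend features (debugger or debugger-based dump) are off by default.
  bool DebuggerBackendEnabled() const { return backend_enabled_; }
  void Enable() {
    backend_enabled_ = true;
    services_ = std::make_unique<MockDebugServices>();
  }
  MockDebugServices *debug_services() const { return services_.get(); }

 private:
  bool backend_enabled_ = false;
  std::unique_ptr<MockDebugServices> services_;  // stays null while disabled
};

bool IsUsable(const MockDebugger &debugger, const std::string &kernel_name) {
  // Guard first: debug_services() is null when the backend is disabled.
  if (debugger.DebuggerBackendEnabled()) {
    if (debugger.debug_services()->IsWatchPoint(kernel_name)) {
      return false;  // watched buffers must not be reused
    }
  }
  return true;
}

int main() {
  MockDebugger debugger;
  std::cout << IsUsable(debugger, "Conv2D-op1") << "\n";  // 1: backend off, reuse allowed
  debugger.Enable();
  std::cout << IsUsable(debugger, "Conv2D-op1") << "\n";  // 0: watched node, reuse blocked
}
```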
@@ -605,16 +605,18 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
  MS_LOG(INFO) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);
#ifdef ENABLE_DEBUGGER
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  DebugServices *debug_services = debugger_->debug_services();
  TensorLoader *tensor_loader = debug_services->tensor_loader();
  // TensorData will be freed up here
  tensor_loader->EmptyTensor();
  uint32_t iter_num = tensor_loader->GetIterNum();
  tensor_loader->set_iter_num(++iter_num);
  (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
  tensor_loader->EmptyPrevTensor();
  if (debugger_->DebuggerBackendEnabled()) {
    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
    MS_EXCEPTION_IF_NULL(runtime_instance);
    DebugServices *debug_services = debugger_->debug_services();
    TensorLoader *tensor_loader = debug_services->tensor_loader();
    // TensorData will be freed up here
    tensor_loader->EmptyTensor();
    uint32_t iter_num = tensor_loader->GetIterNum();
    tensor_loader->set_iter_num(++iter_num);
    (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
    tensor_loader->EmptyPrevTensor();
  }
#endif
  MS_LOG(INFO) << "Finish!";
}
@@ -26,9 +26,6 @@
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "frontend/parallel/ps/util.h"
#endif
@@ -112,12 +109,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
    summary_outputs = kernel_graph->summary_nodes();
    runtime_.IncreaseSummaryRefCount(summary_outputs);
  }
#ifdef ENABLE_DEBUGGER
  // debugger pre-execution processing
  if (debugger_) {
    debugger_->PreExecute(kernel_graph);
  }
#endif
  bool ret = runtime_.Run(kernel_graph.get(), false);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Run graph failed";
@@ -128,12 +120,6 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
    runtime_.DecreaseSummaryRefCount(summary_outputs);
  }
#ifdef ENABLE_DEBUGGER
  // debugger post-execution processing
  if (debugger_) {
    debugger_->PostExecute();
  }
#endif
  MS_LOG(INFO) << "Run graph end";
}
@@ -351,10 +351,12 @@ void GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info
#ifdef ENABLE_DEBUGGER
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
#ifdef ENABLE_DUMP_E2E
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
  if (debugger_->DebuggerBackendEnabled()) {
    MS_EXCEPTION_IF_NULL(kernel_graph);
    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
    MS_EXCEPTION_IF_NULL(runtime_instance);
    (void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
  }
#endif
}
@@ -80,25 +80,16 @@ void Debugger::EnableDebugger() {
  grpc_client_ = nullptr;
  debug_services_ = nullptr;
  // see if dump is enabled
  bool dump_enabled = false;
  if (device_target_ == kGPUDevice) {
    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
    MS_EXCEPTION_IF_NULL(runtime_instance);
    dump_enabled = runtime_instance->DumpDataEnabled();
  }
  // see if dump using debugger backend is enabled
  bool dump_enabled = CheckDebuggerDumpEnabled();
  MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;
  // get env variables to configure debugger
  const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
  if (env_enable_str != nullptr) {
    MS_LOG(INFO) << "Getenv ENABLE_MS_DEBUGGER: " << env_enable_str;
    if (std::strcmp(env_enable_str, "1") == 0) {
      debugger_enabled_ = true;
    }
  }
  // check if debugger enabled
  debugger_enabled_ = CheckDebuggerEnabled();
  MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;
  if (!debugger_enabled_ && !dump_enabled) {
    MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
    MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
    return;
  }
@@ -109,7 +100,7 @@ void Debugger::EnableDebugger() {
    MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
    host = std::string(env_host_str);
  } else {
    MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
    MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
    host = "localhost";
  }
  // configure grpc port
@@ -119,7 +110,7 @@ void Debugger::EnableDebugger() {
    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
    port = std::string(env_port_str);
  } else {
    MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
    MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
    port = "50051";
  }
@@ -140,8 +131,8 @@ void Debugger::EnableDebugger() {
    MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
                       "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
  } else {
    MS_LOG(WARNING) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
                       "usage for large models.";
    MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
                    "usage for large models.";
  }
#ifdef ENABLE_D
  // set operation overflow info
@@ -180,6 +171,29 @@ void Debugger::EnableDebugger() {
  debug_services_ = std::make_unique<DebugServices>();
}

bool Debugger::CheckDebuggerDumpEnabled() {
  // see if dump is enabled
  if (device_target_ == kGPUDevice) {
    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
    MS_EXCEPTION_IF_NULL(runtime_instance);
    return runtime_instance->DumpDataEnabled();
  }
  return false;
}

bool Debugger::CheckDebuggerEnabled() {
  // get env variables to configure debugger
  const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
  if (env_enable_str != nullptr) {
    if (std::strcmp(env_enable_str, "1") == 0) {
      return true;
    }
  }
  return false;
}

bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }

void Debugger::Reset() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
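CheckDebuggerEnabled above is a plain environment-variable probe: only the literal value "1" in ENABLE_MS_DEBUGGER counts as enabled. A standalone sketch of the same convention follows (the helper name and the second flag are illustrative, and `setenv` assumes a POSIX system):

```cpp
// Standalone sketch of the ENABLE_MS_DEBUGGER-style check; not the MindSpore
// implementation itself, just the same "set to 1" convention.
#include <cstdlib>
#include <cstring>
#include <iostream>

bool CheckEnvFlag(const char *name) {
  const char *value = std::getenv(name);
  // Only the literal string "1" turns the feature on; unset or any other value is off.
  return value != nullptr && std::strcmp(value, "1") == 0;
}

int main() {
  setenv("ENABLE_MS_DEBUGGER", "1", 1);  // e.g. `export ENABLE_MS_DEBUGGER=1` before launching training
  std::cout << "debugger enabled: " << CheckEnvFlag("ENABLE_MS_DEBUGGER") << "\n";  // 1
  std::cout << "other flag: " << CheckEnvFlag("SOME_OTHER_FLAG") << "\n";           // 0
}
```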
@@ -201,25 +215,29 @@ void Debugger::Reset() {
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  // check and save graph_ptr, suspend if graph is new
  CheckGraphPtr(graph_ptr);
  if (debugger_->DebuggerBackendEnabled()) {
    // check and save graph_ptr, suspend if graph is new
    CheckGraphPtr(graph_ptr);
  }
}

void Debugger::PostExecute() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  // analyze tensor data and send the watchpoints that have been hit
  if (run_level_ == "node") {
    MS_LOG(INFO) << "Debugger is in node level mode ";
    return;
  }
  if (debugger_enabled_ && !is_dataset_graph_) {
    if (device_target_ != kGPUDevice) {
      num_step_++;
      MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
      SendWatchpointsAndSuspend(CheckWatchpoints());
    } else {
      CommandLoop();
  if (debugger_->DebuggerBackendEnabled()) {
    // analyze tensor data and send the watchpoints that have been hit
    if (run_level_ == "node") {
      MS_LOG(INFO) << "Debugger is in node level mode ";
      return;
    }
    if (debugger_enabled_ && !is_dataset_graph_) {
      if (device_target_ != kGPUDevice) {
        num_step_++;
        MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
        SendWatchpointsAndSuspend(CheckWatchpoints());
      } else {
        CommandLoop();
      }
    }
  }
}
@@ -302,8 +320,8 @@ void Debugger::CheckDatasetGraph() {
    auto node_name = AnfAlgo::GetCNodeName(node);
    MS_LOG(INFO) << "node: " << node->fullname_with_scope();
    if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
      MS_LOG(WARNING) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
                      << node_name;
      MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
                   << node_name;
      is_dataset_graph_ = true;
      return;
    }
@@ -96,6 +96,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap();

  // check if any feature that uses the debugger backend is enabled
  bool DebuggerBackendEnabled();

 private:
  // private constructor for singleton
  Debugger();
@@ -105,6 +108,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // read env variable for grpc client
  void EnableDebugger();

  // check if dump using debugger backend is enabled
  bool CheckDebuggerDumpEnabled();

  // check if debugger enabled
  bool CheckDebuggerEnabled();

  // check and save graph pointer
  void CheckGraphPtr(const KernelGraphPtr &graph_ptr);
@@ -40,7 +40,7 @@ class AscendKernelRuntime : public KernelRuntime {
  ~AscendKernelRuntime() override;
  bool Init() override;
  bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
  bool LoadData(session::KernelGraph *graph, Debugger *debugger);
  bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
  bool GenTask(const session::KernelGraph *graph);
  bool LoadTask(const session::KernelGraph *graph);
  bool RunTask(const session::KernelGraph *graph);
@@ -97,14 +97,16 @@ void DataDumper::LoadDumpInfo() {
#ifdef ENABLE_DEBUGGER
  auto debugger = mindspore::Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
  // extract stream id, task id and opname from runtime_info_map for overflow detection
  std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
                 std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
                 [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
                   -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
                   return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
                 });
  if (debugger->DebuggerBackendEnabled()) {
    std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
    // extract stream id, task id and opname from runtime_info_map for overflow detection
    std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
                   std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
                   [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
                     -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
                     return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
                   });
  }
#endif
  MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
}
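The guarded `std::transform` above builds a (stream_id, task_id) to op-name lookup from `runtime_info_map_` for overflow detection, pulling the two ids out of each RuntimeInfo with `std::get`. Below is a simplified standalone version with mock types; the tuple layout used here is an assumption for illustration, not the real RuntimeInfo definition:

```cpp
// Simplified, standalone illustration of the std::transform in LoadDumpInfo:
// build a (stream_id, task_id) -> op name map from a name -> RuntimeInfo map.
// RuntimeInfo is reduced to a tuple<task_id, stream_id> here (assumed layout).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using RuntimeInfo = std::tuple<uint32_t /*task_id*/, uint32_t /*stream_id*/>;

int main() {
  std::map<std::string, std::shared_ptr<RuntimeInfo>> runtime_info_map = {
      {"Conv2D-op1", std::make_shared<RuntimeInfo>(7, 0)},
      {"MatMul-op2", std::make_shared<RuntimeInfo>(8, 1)},
  };

  std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname;
  // Same shape as the call above: key = {stream_id, task_id}, value = op name.
  std::transform(runtime_info_map.begin(), runtime_info_map.end(),
                 std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
                 [](const std::pair<const std::string, std::shared_ptr<RuntimeInfo>> &p)
                     -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
                   return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
                 });

  for (const auto &entry : stream_task_to_opname) {
    std::cout << "stream " << entry.first.first << ", task " << entry.first.second
              << " -> " << entry.second << "\n";
  }
}
```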
@@ -49,6 +49,8 @@ bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *d
  return false;
}

bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) { return false; }

bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) {
  MS_EXCEPTION_IF_NULL(kernel);
  if (AnfAlgo::OutputAddrExist(kernel, index)) {
@@ -59,6 +59,7 @@ class KernelRuntime {
  bool DumpDataEnabled();
  bool DumpDataEnabledIteration();
  virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
  virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
  virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
  bool LaunchKernel(const session::KernelGraph *graph);
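`LoadData` is now a virtual on the base `KernelRuntime` with a default stub that returns false (see the .cc hunk above), so only backends that can actually load debug tensors override it, and the `override` keyword added to `AscendKernelRuntime` lets the compiler verify the signature. A standalone sketch of that default-stub/override pattern with mock types:

```cpp
// Standalone sketch of the virtual-with-default-stub pattern introduced for LoadData:
// the base runtime reports "nothing loaded"; only backends that support it override.
// KernelGraph/Debugger are mocks, not the MindSpore classes.
#include <iostream>

struct KernelGraph {};
struct Debugger {};

class KernelRuntime {
 public:
  virtual ~KernelRuntime() = default;
  // Default: this backend cannot load debug tensors.
  virtual bool LoadData(KernelGraph *graph, Debugger *debugger) { return false; }
};

class AscendKernelRuntime : public KernelRuntime {
 public:
  // `override` makes the compiler check that the signature matches the base declaration.
  bool LoadData(KernelGraph *graph, Debugger *debugger) override {
    // ... a real backend would copy kernel outputs into the tensor loader here ...
    return true;
  }
};

int main() {
  KernelGraph graph;
  Debugger debugger;
  AscendKernelRuntime ascend;
  KernelRuntime generic;
  KernelRuntime *runtime = &ascend;
  std::cout << runtime->LoadData(&graph, &debugger) << "\n";  // 1: override ran
  runtime = &generic;
  std::cout << runtime->LoadData(&graph, &debugger) << "\n";  // 0: base stub
}
```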
@@ -53,11 +53,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
  set_param<bool>(MS_CTX_ENABLE_TASK_SINK, true);
  set_param<bool>(MS_CTX_IR_FUSION_FLAG, true);
  set_param<bool>(MS_CTX_ENABLE_HCCL, false);
#ifdef ENABLE_DEBUGGER
  set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, false);
#else
  set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, true);
#endif
  set_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY, true);
  set_param<bool>(MS_CTX_PRECOMPILE_ONLY, false);
  set_param<bool>(MS_CTX_ENABLE_AUTO_MIXED_PRECISION, false);
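With the `#ifdef ENABLE_DEBUGGER` branch removed, `MS_CTX_ENABLE_MEM_REUSE` now always defaults to true, and memory reuse is presumably reduced at runtime through the MS_DEBUGGER_PARTIAL_MEM path warned about earlier rather than at compile time. A minimal sketch of such a runtime-settable context flag follows; `MsContextMock` and its parameter key are stand-ins, not the real MsContext API:

```cpp
// Minimal sketch of a runtime-configured flag replacing the removed #ifdef:
// memory reuse defaults to on, and a later (e.g. debugger-driven) call can lower it.
// MsContextMock and MsCtxParam are illustrative only.
#include <array>
#include <iostream>

enum MsCtxParam { MS_CTX_ENABLE_MEM_REUSE, MS_CTX_PARAM_COUNT };

class MsContextMock {
 public:
  MsContextMock() { set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, true); }  // always on by default now
  template <typename T>
  void set_param(MsCtxParam param, const T &value) { bool_params_[param] = value; }
  template <typename T>
  T get_param(MsCtxParam param) const { return bool_params_[param]; }

 private:
  std::array<bool, MS_CTX_PARAM_COUNT> bool_params_{};
};

int main() {
  MsContextMock context;
  std::cout << "mem reuse: " << context.get_param<bool>(MS_CTX_ENABLE_MEM_REUSE) << "\n";  // 1
  // e.g. the debugger requesting partial memory reuse at runtime (MS_DEBUGGER_PARTIAL_MEM):
  context.set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, false);
  std::cout << "mem reuse: " << context.get_param<bool>(MS_CTX_ENABLE_MEM_REUSE) << "\n";  // 0
}
```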