@@ -34,6 +34,15 @@
#include "debug/data_dump/e2e_dump.h"
#include "utils/config_manager.h"
#include "debug/env_config_parser.h"
#include "utils/comm_manager.h"
#include "runtime/framework/actor/actor_common.h"
#include "runtime/hardware/device_context_manager.h"
#include "debug/anf_ir_dump.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#else
#include "debug/debugger/proto_exporter_stub.h"
#endif
using debugger::Chunk;
using debugger::EventReply;
@@ -228,6 +237,9 @@ bool Debugger::CheckDebuggerDumpEnabled() const {
  // see if dump is enabled
  if (device_target_ == kGPUDevice) {
    return device::KernelRuntime::DumpDataEnabled();
  } else if (IsMindRTUsed()) {
    auto &dump_json_parser = DumpJsonParser::GetInstance();
    return dump_json_parser.e2e_dump_enabled();
  }
  return false;
}
@@ -289,8 +301,23 @@ void Debugger::Reset() {
  graph_ptr_list_.clear();
}
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  uint32_t graph_sum = graphs.size();
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    if (debugger_) {
      debugger_->PreExecute(graph, graph_sum);
    }
    DumpSetup(graph);
  }
}
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  CheckDatasetSinkMode();
  auto graph_id = graph_ptr->graph_id();
@@ -313,7 +340,6 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  if (!debugger_enabled_) {
    EnableDebugger();
  }
  if (debugger_enabled_) {
    if (graph_proto_list_.size()) {
      // only send compiled graphs once.
@@ -323,7 +349,9 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
      LoadParametersAndConst();
      // revert graph ptr to original value
      graph_ptr_ = dbg_graph_ptr;
      SendMultiGraphsAndSuspend(graph_proto_list_);
      graph_proto_list_.clear();
    } else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
      // stop only when receiving the first sub run graph for each step
@@ -351,6 +379,89 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // resets for the new graph
  suspended_at_last_kernel_ = 0;
}
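// Returns true only when e2e dump is enabled and the current iteration is one of the iterations
// selected in the dump json configuration.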
bool Debugger::DumpDataEnabledIteration() const {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (!dump_json_parser.e2e_dump_enabled()) {
    return false;
  }
  auto cur_iter = dump_json_parser.cur_dump_iter();
  if (dump_json_parser.IsDumpIter(cur_iter)) {
    return true;
  }
  return false;
}
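// Dump data for one kernel graph, using the rank id resolved from the device context; if no debugger
// backend is enabled, only advance the dump iteration counter.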
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  if (debugger_->DebuggerBackendEnabled()) {
    MS_EXCEPTION_IF_NULL(kernel_graph);
    E2eDump::DumpData(kernel_graph.get(), rank_id, debugger_.get());
  } else {
    DumpJsonParser::GetInstance().UpdateDumpIter();
  }
}
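// Prepare e2e dump for one kernel graph before its kernels execute; called from PreExecuteGraphDebugger
// for every graph of the step.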
void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  E2eDump::DumpSetup(kernel_graph.get(), rank_id);
  MS_LOG(INFO) << "Finish!";
}
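// When e2e dump is enabled, save the compile-time graph artifacts: the proto IR and readable IR under
// <dump_path>/rank_<id>/graphs, and the kernel execution order as a csv under <dump_path>/rank_<id>.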
void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  // This function is called by the new GPU runtime using MindRTBackend
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled()) {
    auto ms_context = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(ms_context);
    std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
    uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
    const auto &device_context =
      device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
    uint32_t rank_id = device_context->GetRankID();
    kernel_graph->set_root_graph_id(kernel_graph->graph_id());
    std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
    std::string target_dir = root_dir + "/graphs";
    std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
    DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
    DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
    DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
                      kernel_graph->execution_order());
  }
}
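// Called once per step after the actor set finishes: dump data for graphs whose iteration is selected for
// dumping (otherwise just advance the dump iteration) and run the debugger post-execution hook per graph (GPU only).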
void Debugger::PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    bool dump_enabled = debugger_->DumpDataEnabledIteration();
    // use debugger for dump
    if (debugger_ && dump_enabled) {
      debugger_->Dump(graph);
    } else {
      DumpJsonParser::GetInstance().UpdateDumpIter();
    }
    if (debugger_) {
      debugger_->PostExecute();
    }
  }
}
void Debugger::PostExecute() {
  // access lock for public method
@@ -365,6 +476,7 @@ void Debugger::PostExecute() {
    num_step_++;
  }
  SendWatchpoints(CheckWatchpoints());
  // no need to suspend at each graph for GPU, suspension happens in PreExecute
  if (device_target_ != kGPUDevice) {
    CommandLoop();
@@ -388,7 +500,6 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  }
  return false;
}
void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
@@ -405,6 +516,7 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  if (!hits.empty()) {
    SendWatchpoints(hits);
    CommandLoop();
    hit_empty_flag = false;
  }
}
@@ -507,7 +619,6 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  return model.graph();
}
void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  if (SendMetadata(true)) {
    // send graph to MindInsight server
@@ -533,7 +644,9 @@ bool Debugger::SendMetadata(bool version_check) {
  MS_LOG(INFO) << "Is training done?" << training_done_;
  // set graph number to not_dataset_graph_sum_
  metadata.set_graph_num(not_dataset_graph_sum_);
  EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
  bool ret = false;
  if (reply_metadata.status() == reply_metadata.OK) {
    if (version_check) {
@@ -575,6 +688,7 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot
    auto graph_size = graph.ByteSize();
    if (graph_size > g_chunk_size) {
      auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);
      for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
        chunk.set_buffer(sub_graph_str[i]);
        chunked_graph_proto_list.push_back(chunk);
@@ -834,7 +948,6 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
  }
  return tensor_list;
}
void Debugger::Exit() {
  // clear resource before exit
  // debugger will notify main thread to exit because main thread can only exit at step boundary
@@ -1171,6 +1284,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
    return;
  }
  // When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
  if (IsMindRTUsed() && (device_target_ == kGPUDevice)) {
    if (!anf_node->isa<ValueNode>() &&
        !(anf_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
      return;
    }
  }
  // for parameters and value nodes, set its execution order to be 0;
  int exec_order = 0;
  std::string node_name = anf_node->fullname_with_scope();
@@ -1268,6 +1388,14 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
    ++num_step_;
  }
}
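// Step counter update for the MindRT GPU runtime, driven by DebugActor::DebugOnStepEnd instead of the kernel runtime.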
void Debugger::UpdateStepNumGPU() {
  // Update step number when DebugActor::DebugOnStepEnd is called
  if (device_target_ == kGPUDevice && (debugger_enabled_ || DumpDataEnabledIteration())) {
    // access lock for public method
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
  }
}
void Debugger::ClearCurrentData() {
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
@@ -73,6 +73,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // reset debugger
  void Reset();
  void PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
  // enable debugger
  // send graph and wait for command
  // do nothing if graph is set already
@@ -82,6 +83,16 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // don't need a graph_ptr because it is saved during pre_execute
  void PostExecute();
  bool DumpDataEnabledIteration() const;
  void Dump(const KernelGraphPtr &kernel_graph) const;
  void DumpSetup(const KernelGraphPtr &kernel_graph) const;
  void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph);
  void PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
  bool ReadNodeDataRequired(const CNodePtr &kernel) const;
  void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);
@@ -132,6 +143,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void UpdateStepNum(const session::KernelGraph *graph);
  void UpdateStepNumGPU();
  void ClearCurrentData();
  void LoadGraphOutputs();
@@ -194,7 +207,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void ProcessKSetCMD(const EventReply &reply);
  // Process the KViewCMD
  void ProcessKViewCMD(const EventReply &reply);
  // set what nodes and conditions to watch
  void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                     const ProtoVector<WatchCondition_Parameter> &parameters);
@@ -228,6 +240,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index);
  // class members
  std::unique_ptr<GrpcClient> grpc_client_;
  std::unique_ptr<DebugServices> debug_services_;
  KernelGraphPtr graph_ptr_;
@@ -249,6 +262,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  std::map<uint32_t, std::string> overflow_bin_path_;
  // flag to keep track of the very first suspension of debugger
  bool initial_suspend_;
  std::list<GraphProto> graph_proto_list_;
  std::list<KernelGraphPtr> graph_ptr_list_;
@@ -261,9 +275,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
};
using DebuggerPtr = std::shared_ptr<Debugger>;
// get debugger ModelProto
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);
ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);
// for getting proto DataType from Type of Tensor
@@ -282,7 +296,6 @@ int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
bool GetMiVersionMatched(const EventReply &reply);
// get the full name of a tensor, which is the name used in TensorLoader
std::string GetTensorFullName(const TensorProto &tensor);
@@ -167,7 +167,7 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
}
void DeviceQueueDataSourceActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, &launch_info_, device_context_, context, &GetAID());
}
void DeviceQueueDataSourceActor::OnDebugFinish(OpContext<DeviceTensor> *context) {
@@ -15,20 +15,134 @@
 */
#include "runtime/framework/actor/debug_actor.h"
#include <vector>
#include <memory>
#include <string>
#include "runtime/framework/actor/debug_aware_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#ifdef ENABLE_GPU
#include "debug/debugger/debugger.h"
#include "runtime/device/gpu/gpu_device_address.h"
using mindspore::kernel::AddressPtr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
#endif
namespace mindspore {
namespace runtime {
void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
                       const AID *from_aid) {
#ifdef ENABLE_GPU
static const size_t PARAMETER_OUTPUT_INDEX = 0;
std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  // define a vector containing the real output numbers
  std::vector<int> real_outputs;
  // P.BatchNorm is used for training and inference
  // can add the filter list for more operators here....
  if (node_name == "BatchNorm") {
    MS_LOG(INFO) << "Loading node named " << node_name;
    real_outputs.insert(real_outputs.end(), {0, 3, 4});
  } else {
    // by default, TensorLoader will load all outputs
    for (size_t j = 0; j < output_size; ++j) {
      real_outputs.push_back(j);
    }
  }
  return real_outputs;
}
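// Copy every input tensor of the kernel from device memory to the host tensor cache via
// GPUDeviceAddress::LoadMemToHost so the debugger / e2e dump can read it; inputs whose inferred
// type is kMetaTypeNone (e.g. Depend) are skipped.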
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get inputs
  auto kernel_inputs = launch_info_->inputs_;
  auto input_size = AnfAlgo::GetInputTensorNum(cnode);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = cnode->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order_, format, int_shapes, type, 0, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name:" << input_tensor_name << ", host_format:" << format << ".";
    }
  }
}
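// Same as LoadInputs but for the kernel's outputs; CheckRealOutput() filters out the outputs that are not
// real for ops such as BatchNorm before the data is copied to host.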
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get outputs
  auto kernel_outputs = launch_info_->outputs_;
  auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
  auto node_name = AnfAlgo::GetCNodeName(cnode);
  std::string kernel_name = cnode->fullname_with_scope();
  std::vector<int> real_outputs = CheckRealOutput(node_name, output_size);
  for (int j : real_outputs) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(cnode, j);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order_, format, int_shapes, type, j, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name:" << tensor_name << ", host_format:" << format << ".";
    }
  }
}
#endif
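// Runs synchronously right after a kernel is launched: decides from the dump config or the online debugger
// whether this kernel's tensors need to be read, loads its inputs/outputs to host, calls PostExecuteNode so
// watchpoints can be checked, and finally replies to the calling actor via OnDebugFinish.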
void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_,
                       const DeviceContext *device_context, OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  if (node->isa<CNode>()) {
    const auto &cnode = node->cast<CNodePtr>();
    auto debugger = Debugger::GetInstance();
    if (debugger) {
      std::string kernel_name = cnode->fullname_with_scope();
      debugger->SetCurNode(kernel_name);
      bool read_data = false;
      auto &dump_json_parser = DumpJsonParser::GetInstance();
      bool dump_enabled = debugger->DumpDataEnabledIteration();
      if (dump_enabled) {
        auto dump_mode = dump_json_parser.dump_mode();
        // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list
        if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) {
          read_data = true;
        }
      } else if (debugger->debugger_enabled()) {
        read_data = debugger->ReadNodeDataRequired(cnode);
      }
      if (read_data) {
        if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
          LoadInputs(cnode, launch_info_, exec_order_);
        }
        if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
          LoadOutputs(cnode, launch_info_, exec_order_);
        }
        // check if the node is last kernel
        bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
        debugger->PostExecuteNode(cnode, last_kernel);
      }
    }
    exec_order_ += 1;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
@@ -36,8 +150,16 @@ void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_conte
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::UpdateStepNumGPU();
    debugger->Debugger::LoadParametersAndConst();
    // Reset exec_order for the next step
    exec_order_ = 0;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
@@ -24,6 +24,7 @@
namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;
using mindspore::kernel::KernelLaunchInfo;
// The debug actor is used to debug and dump kernel info; it gets the kernel's real-time execution info on the device,
// so it is synchronous and blocking.
@@ -33,12 +34,17 @@ class DebugActor : public ActorBase {
  ~DebugActor() override = default;
  // The debug of each node.
  void Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
             const AID *from_aid);
  void Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_, const DeviceContext *device_context,
             OpContext<DeviceTensor> *op_context, const AID *from_aid);
  // The debug on step end.
  void DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid);
 private:
  // class members
  uint32_t exec_order_ = 0;
};
}  // namespace runtime
}  // namespace mindspore
@@ -169,7 +169,7 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *context) {
}
void KernelActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, kernel_, &launch_info_, device_context_, context, &GetAID());
}
void KernelActor::OnDebugFinish(OpContext<DeviceTensor> *context) {
@@ -24,6 +24,10 @@
#include "ir/tensor.h"
#include "backend/optimizer/common/helper.h"
#include "base/base_ref_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#include "debug/data_dump/dump_json_parser.h"
namespace mindspore {
namespace runtime {
@@ -278,6 +282,9 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(device_context);
  auto &json_parser = DumpJsonParser::GetInstance();
  json_parser.Parse();
  // Execute optimization pass.
  auto outputs_before_optimizer = AnfAlgo::GetAllOutputWithIndex(graph->output());
  device_context->OptimizeGraph(graph);
@@ -297,13 +304,20 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  }
  graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  debugger->DumpInGraphCompiler(graph);
#endif
  MS_EXCEPTION_IF_NULL(session_);
  session_->InitAllBucket(graph, device_context);
  session_->SetSummaryNodes(graph.get());
  SetSummaryNodesRefCount(graph.get());
#ifdef ENABLE_DEBUGGER
  if (debugger && debugger->DebuggerBackendEnabled()) {
    debugger->LoadGraphs(graph);
  }
#endif
  return graph->graph_id();
}
@@ -31,7 +31,9 @@
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace runtime {
namespace {
@@ -371,6 +373,18 @@ void GraphScheduler::Initialize() {
    (void)actorMgr->Spawn(base_recorder_actor, true);
  }
#endif
  // Create and schedule debug actor.
#ifdef ENABLE_DEBUGGER
  auto debugger = mindspore::Debugger::GetInstance();
  if (debugger->DebuggerBackendEnabled()) {
    auto debug_actor = std::make_shared<DebugActor>();
    MS_EXCEPTION_IF_NULL(debug_actor);
    debug_aid_ = &(debug_actor->GetAID());
    auto base_debug_actor = static_cast<ActorReference>(debug_actor);
    base_debug_actor->set_thread_pool(thread_pool_);
    (void)actorMgr->Spawn(base_debug_actor, true);
  }
#endif
}
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info, GraphExecutionStrategy strategy) {
@@ -37,6 +37,7 @@
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "debug/rdr/running_data_recorder.h"
#include "utils/comm_manager.h"
#include "debug/debugger/debugger.h"
namespace mindspore {
namespace device {
@@ -91,6 +92,12 @@ bool GPUDeviceContext::Initialize() {
    (*init_nccl_comm_funcptr)();
  }
  auto rank_id = GetRankID();
  auto &json_parser = DumpJsonParser::GetInstance();
  // Dump json config file if dump is enabled
  json_parser.CopyJsonToDir(rank_id);
  json_parser.CopyMSCfgJsonToDir(rank_id);
  initialized_ = true;
  return ret;
}
@@ -125,6 +132,12 @@ bool GPUDeviceContext::InitDevice() {
void GPUDeviceContext::Destroy() {
  // Release GPU buffer manager resource
  auto debugger = Debugger::GetInstance();
  if (debugger && debugger->debugger_enabled()) {
    debugger->SetTrainingDone(true);
    debugger->SendMetadata(false);
  }
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed() && !GpuBufferMgr::GetInstance().CloseNotify()) {
      MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
@@ -36,7 +36,9 @@
#ifdef ENABLE_GE
#include "utils/callbacks_ge.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace compile {
bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
@@ -577,10 +579,24 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
  const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
  MS_EXCEPTION_IF_NULL(actor_set);
  runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);
  // PreExecuteGraph
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::PreExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
    MS_LOG(EXCEPTION) << "The actor run failed, actor name: " << actor_set->name_;
  }
  // PostExecuteGraph
#ifdef ENABLE_DEBUGGER
  if (debugger) {
    debugger->Debugger::PostExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  // Sync device stream.
  const auto &first_device_context = graph_compiler_info.device_contexts_[0];
  MS_EXCEPTION_IF_NULL(first_device_context);
@@ -644,6 +660,15 @@ void MindRTBackend::ConstructOutputs(const AnfNodePtr &output_node,
  }
}
#ifdef ENABLE_DEBUGGER
void MindRTBackend::SetDebugger() {
  auto debugger_ = Debugger::GetInstance();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  debugger_->Init(device_id_, ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET));
}
#endif
std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) {
  MS_EXCEPTION_IF_NULL(root_graph);
  MS_EXCEPTION_IF_NULL(graph_compiler_);
@@ -118,6 +118,9 @@ class MindRTBackend : public Backend {
  // Run Graph in the pyNative mode.
  void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
                const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs);
#ifdef ENABLE_DEBUGGER
  void SetDebugger() override;
#endif
 private:
  // The parameter func_graph is a graph, it can be either a root graph or a sub graph,