| @@ -335,7 +335,41 @@ void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tenso | |||
| } | |||
| #endif | |||
| #ifdef ONLINE_DBG_MODE | |||
| bool DebugServices::CompareCurrentRootGraph(uint32_t id) { | |||
| auto debugger = Debugger::GetInstance(); | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET); | |||
| auto cur_root_graph_id = debugger->GetCurrentRootGraphId(); | |||
| if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) || | |||
| device_target == kAscendDevice) { | |||
| if (cur_root_graph_id != id) { | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| const void *DebugServices::PreparePrevTensor(uint32_t *prev_num_elements, const std::string &tensor_name) { | |||
| std::shared_ptr<TensorData> prev_tensor_data; | |||
| if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) { | |||
| // not supporting watchpoints that need prev tensor for multi root graph networks. | |||
| MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr."; | |||
| prev_tensor_data = nullptr; | |||
| } else { | |||
| prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name); | |||
| } | |||
| if (prev_tensor_data) { | |||
| *prev_num_elements = prev_tensor_data->GetNumElements(); | |||
| return prev_tensor_data->GetDataPtr(); | |||
| } | |||
| return nullptr; | |||
| } | |||
| #endif | |||
| void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) { | |||
| // check history error_code only for offline debugger | |||
| if (history_not_found) { | |||
| *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found | |||
| } | |||
| @@ -401,13 +435,14 @@ void DebugServices::CheckWatchpointsForTensor( | |||
| bool history_not_found = 0; | |||
| previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found); | |||
| #else | |||
| std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name); | |||
| if (prev_tensor_data) { | |||
| previous_tensor_ptr = prev_tensor_data->GetDataPtr(); | |||
| prev_num_elements = prev_tensor_data->GetNumElements(); | |||
| if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) { | |||
| MS_LOG(DEBUG) | |||
| << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: " | |||
| << tensor->GetName(); | |||
| continue; | |||
| } | |||
| previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name); | |||
| #endif | |||
| std::unique_ptr<ITensorSummary> base_summary_ptr; | |||
| if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) { | |||
| base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype); | |||
| @@ -440,7 +475,6 @@ void DebugServices::CheckWatchpointsForTensor( | |||
| tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code); | |||
| } | |||
| } | |||
| #ifdef OFFLINE_DBG_MODE | |||
| SetTensorToNotInUse(tensor, previous_tensor_ptr); | |||
| // in offline mode remove the need for the data | |||
| @@ -448,6 +482,7 @@ void DebugServices::CheckWatchpointsForTensor( | |||
| #endif | |||
| } | |||
| } | |||
| void DebugServices::CheckWatchpoints( | |||
| std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition, | |||
| std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters, | |||
| @@ -1362,6 +1397,14 @@ void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std:: | |||
| if (std::get<1>(result) == nullptr) { | |||
| continue; | |||
| } | |||
| #ifdef ONLINE_DBG_MODE | |||
| if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) { | |||
| MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId() | |||
| << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId() | |||
| << "."; | |||
| MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << "."; | |||
| } | |||
| #endif | |||
| (void)ret_name->emplace_back(std::get<0>(result)); | |||
| (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr())); | |||
| (void)data_size->emplace_back(std::get<1>(result)->GetByteSize()); | |||
| @@ -260,6 +260,8 @@ class DebugServices { | |||
| const std::vector<parameter_t> ¶meter_list); | |||
| #endif | |||
| const void *PreparePrevTensor(uint32_t *prev_num_elements, const std::string &tensor_name); | |||
| void CheckHistoryErrorCode(int *error_code, bool history_not_found); | |||
| void CheckWatchpointsForTensor(partitioned_names *chunk_names, partitioned_names *chunk_slots, | |||
| @@ -411,6 +413,8 @@ class DebugServices { | |||
| bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const; | |||
| bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const; | |||
| bool CompareCurrentRootGraph(uint32_t id); | |||
| #endif | |||
| std::vector<std::shared_ptr<TensorData>> GetTensor() const; | |||
| @@ -77,6 +77,8 @@ Debugger::Debugger() | |||
| node_name_(""), | |||
| cur_name_(""), | |||
| training_done_(false), | |||
| send_metadata_done_(false), | |||
| received_new_graph_(false), | |||
| is_dataset_graph_(false), | |||
| partial_memory_(false), | |||
| initial_suspend_(true), | |||
| @@ -284,20 +286,35 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs | |||
| } | |||
| // Store graphs that are run in one step. | |||
| graph_ptr_step_vec_ = graphs; | |||
| prev_root_graph_id_ = cur_root_graph_id_; | |||
| // set first run graph as the root graph | |||
| cur_root_graph_id_ = graph_ptr_step_vec_[0]->graph_id(); | |||
| MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_ | |||
| << " for step: " << num_step_ << "."; | |||
| MS_LOG(DEBUG) << "Set root graph for all the subgraphs:"; | |||
| for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { | |||
| const auto &graph = graphs[graph_index]; | |||
| // set root graph id for GPU mindrt runtime. | |||
| MS_LOG(DEBUG) << "Set root graph for graph: " << graph->graph_id() << " to: " << cur_root_graph_id_ << "."; | |||
| graph->set_root_graph_id(cur_root_graph_id_); | |||
| if (debugger_) { | |||
| debugger_->PreExecute(graph); | |||
| } | |||
| } | |||
| } | |||
| void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) { | |||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||
| // access lock for public method | |||
| std::lock_guard<std::mutex> a_lock(access_lock_); | |||
| CheckDatasetSinkMode(); | |||
| auto graph_id = graph_ptr->graph_id(); | |||
| void Debugger::SetCurrentAndPrevRootGraph(uint32_t root_graph_id) { | |||
| // for GPU root graphs are set in PreExecuteGraphDebugger. | |||
| if (device_target_ != kAscendDevice) { | |||
| return; | |||
| } | |||
| prev_root_graph_id_ = cur_root_graph_id_; | |||
| cur_root_graph_id_ = root_graph_id; | |||
| MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_ | |||
| << " for step: " << num_step_ << "."; | |||
| } | |||
| void Debugger::StoreRunGraphIdList(uint32_t graph_id) { | |||
| // collect rungraph_ids to update step number in multigraph case | |||
| if (!rungraph_id_list_.size()) { | |||
| rungraph_id_list_.push_back(graph_id); | |||
| @@ -307,6 +324,17 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) { | |||
| rungraph_id_list_.push_back(graph_id); | |||
| } | |||
| } | |||
| } | |||
| void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) { | |||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||
| // access lock for public method | |||
| std::lock_guard<std::mutex> a_lock(access_lock_); | |||
| CheckDatasetSinkMode(); | |||
| auto graph_id = graph_ptr->graph_id(); | |||
| MS_LOG(DEBUG) << "PreExecute for graph: " << graph_id << " in step: " << num_step_ << "."; | |||
| StoreRunGraphIdList(graph_id); | |||
| SetCurrentAndPrevRootGraph(graph_ptr->root_graph_id()); | |||
| // multiple graphs | |||
| if (graph_proto_list_.size() > 1) { | |||
| // there are more than one graphs are not dataset_graph | |||
| @@ -315,20 +343,22 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) { | |||
| } | |||
| } else if (graph_proto_list_.size() == 1) { | |||
| // single graph, and not the initial step | |||
| if (device_target_ == kGPUDevice && num_step_ != 0) { | |||
| if (device_target_ == kGPUDevice && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT) && | |||
| num_step_ != 0) { | |||
| if (debugger_enabled_ && !(run_level_ == "node" && suspended_at_last_kernel_)) { | |||
| CommandLoop(); | |||
| } | |||
| debug_services_->ResetLoadedTensors(); | |||
| } | |||
| // In single graph case, reset graph_ptr_ to be nullptr for the initial step | |||
| if (num_step_ == 0) { | |||
| // In single graph case, reset graph_ptr_ to be nullptr when debugger receives a new graph | |||
| if (received_new_graph_) { | |||
| graph_ptr_ = nullptr; | |||
| CheckGraphPtr(graph_ptr); | |||
| } | |||
| } else if (debugger_enabled_ && graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) { | |||
| } else if (debugger_enabled_ && graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice && | |||
| !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) { | |||
| // Multiple graph, and not the initial step, | |||
| // stop only when receive the first sub run graph for each step | |||
| // stop only when receiving the first sub run graph for each step for old runtime | |||
| // if we have stopped for the last kernel before, no need to stop again | |||
| if (pipeline::GraphExecutorPy::GetDebugTerminate()) { | |||
| return; | |||
| @@ -359,6 +389,7 @@ void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) { | |||
| SendMultiGraphsAndSuspend(graph_proto_list_); | |||
| graph_proto_list_.clear(); | |||
| received_new_graph_ = false; | |||
| } | |||
| } | |||
| @@ -474,14 +505,19 @@ void Debugger::PostExecute() { | |||
| } | |||
| SendWatchpoints(CheckWatchpoints()); | |||
| // no need to suspend at each graph for GPU, suspension happens in preExecute | |||
| if (device_target_ != kGPUDevice) { | |||
| // no need to suspend at each graph for GPU old runtime, suspension happens in preExecute | |||
| if (device_target_ == kAscendDevice) { | |||
| CommandLoop(); | |||
| } else if (device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) { | |||
| if (!(run_level_ == "node" && suspended_at_last_kernel_)) { | |||
| CommandLoop(); | |||
| } | |||
| } | |||
| } | |||
| // Only keep parameters in the current map | |||
| // GPU ResetLoadedTensors happens in preExecute | |||
| if (device_target_ != kGPUDevice) { | |||
| // Only keep parameters in the current map | |||
| // GPU ResetLoadedTensors for old runtime happens in preExecute | |||
| if ((device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) || | |||
| device_target_ == kAscendDevice) { | |||
| debug_services_->ResetLoadedTensors(); | |||
| } | |||
| } | |||
| @@ -534,6 +570,7 @@ void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) { | |||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||
| if (graph_ptr_ != graph_ptr) { | |||
| MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id(); | |||
| received_new_graph_ = true; | |||
| // save new graph_ptr | |||
| graph_ptr_ = graph_ptr; | |||
| CheckDatasetGraph(); | |||
| @@ -559,12 +596,16 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) { | |||
| graph_ptr_ = graph_ptr; | |||
| if (!is_dataset_graph_) { | |||
| // only try to enable debugger if it is not a dataset graph | |||
| EnableDebugger(); | |||
| if (!debugger_enabled_) { | |||
| EnableDebugger(); | |||
| } | |||
| if (debugger_enabled_) { | |||
| LoadParametersAndConst(); | |||
| // get graph proto and send to Mindinsight | |||
| auto graph_proto = graph_proto_list_.front(); | |||
| SendGraphAndSuspend(graph_proto); | |||
| graph_proto_list_.clear(); | |||
| received_new_graph_ = false; | |||
| } | |||
| } | |||
| } | |||
| @@ -636,16 +677,17 @@ void Debugger::SendHeartbeat(int32_t period) { | |||
| } | |||
| void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) { | |||
| if (SendMetadata(true)) { | |||
| // send graph to Mindinsight server | |||
| MS_EXCEPTION_IF_NULL(grpc_client_); | |||
| EventReply reply = grpc_client_->SendGraph(graph_proto); | |||
| if (reply.status() != reply.OK) { | |||
| MS_LOG(ERROR) << "Error: SendGraph failed"; | |||
| } | |||
| // enter command loop, wait and process commands | |||
| CommandLoop(); | |||
| if (!CheckSendMetadata()) { | |||
| return; | |||
| } | |||
| // send graph to Mindinsight server | |||
| MS_EXCEPTION_IF_NULL(grpc_client_); | |||
| EventReply reply = grpc_client_->SendGraph(graph_proto); | |||
| if (reply.status() != reply.OK) { | |||
| MS_LOG(ERROR) << "Error: SendGraph failed"; | |||
| } | |||
| // enter command loop, wait and process commands | |||
| CommandLoop(); | |||
| } | |||
| bool Debugger::SendMetadata(bool version_check) { | |||
| @@ -695,7 +737,7 @@ bool Debugger::SendMetadata(bool version_check) { | |||
| } | |||
| void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list) { | |||
| if (!SendMetadata(true)) { | |||
| if (!CheckSendMetadata()) { | |||
| return; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(grpc_client_); | |||
| @@ -732,10 +774,20 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot | |||
| CommandLoop(); | |||
| } | |||
| bool Debugger::CheckSendMetadata() { | |||
| if (!send_metadata_done_) { | |||
| if (!SendMetadata(true)) { | |||
| return false; | |||
| } | |||
| send_metadata_done_ = true; | |||
| } | |||
| return true; | |||
| } | |||
| void Debugger::CommandLoop() { | |||
| // prepare metadata | |||
| MS_EXCEPTION_IF_NULL(graph_ptr_); | |||
| std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id()); | |||
| std::string device_name = std::to_string(device_id_) + ":" + std::to_string(cur_root_graph_id_); | |||
| Metadata metadata; | |||
| metadata.set_device_name(device_name); | |||
| @@ -1051,8 +1103,8 @@ std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> & | |||
| debug_services_->SearchNodesTensors(name, &result_list); | |||
| for (auto result : result_list) { | |||
| auto tensor = std::get<1>(result); | |||
| if (!tensor) { | |||
| // tensor was not found, creating empty tensor base. | |||
| if (!tensor || cur_root_graph_id_ != tensor->GetRootGraphId()) { | |||
| // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor base. | |||
| TensorBase tensor_base_item; | |||
| tensor_base_item.set_data_size(0); | |||
| tensor_base_item.set_data_type(0); | |||
| @@ -1080,8 +1132,8 @@ std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto | |||
| debug_services_->SearchNodesTensors(name, &result_list); | |||
| for (auto result : result_list) { | |||
| auto tensor = std::get<1>(result); | |||
| if (!tensor) { | |||
| // tensor was not found, creating empty tensor summary. | |||
| if (!tensor || cur_root_graph_id_ != tensor->GetRootGraphId()) { | |||
| // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor summary. | |||
| DebugServices::TensorStat tensor_stat; | |||
| AddTensorStatInfo(tensor_stat, &tensor_summary_list); | |||
| continue; | |||
| @@ -1326,7 +1378,7 @@ bool Debugger::CheckIp(const std::string &host) const { | |||
| uint32_t Debugger::GetFirstRunGraphId() const { return rungraph_id_list_.front(); } | |||
| void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index) { | |||
| void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) { | |||
| return; | |||
| @@ -1362,7 +1414,7 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output | |||
| } else { | |||
| keep_prev = false; | |||
| } | |||
| bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev); | |||
| bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "LoadMemToHost:" | |||
| << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; | |||
| @@ -1374,35 +1426,36 @@ void Debugger::LoadParametersAndConst() { | |||
| MS_EXCEPTION_IF_NULL(graph_ptr_); | |||
| // load parameters | |||
| MS_LOG(INFO) << "Start to load Parameters for graph " << graph_ptr_->graph_id() << "."; | |||
| auto root_graph_id = graph_ptr_->root_graph_id(); | |||
| const auto ¶meters = graph_ptr_->inputs(); | |||
| for (auto &item : parameters) { | |||
| LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX); | |||
| LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX, root_graph_id); | |||
| } | |||
| // load value nodes | |||
| // get all constant values from the graph | |||
| MS_LOG(INFO) << "Start to load value nodes for graph " << graph_ptr_->graph_id() << "."; | |||
| const auto value_nodes = graph_ptr_->graph_value_nodes(); | |||
| for (auto &item : value_nodes) { | |||
| LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX); | |||
| LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id); | |||
| } | |||
| } | |||
| void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) { | |||
| if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return; | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(graph_ptr_); | |||
| // load parameters | |||
| MS_LOG(INFO) << "Start to load Parameters for graph " << graph->graph_id() << "."; | |||
| const auto ¶meters = graph_ptr_->inputs(); | |||
| auto root_graph_id = graph->root_graph_id(); | |||
| const auto ¶meters = graph->inputs(); | |||
| for (auto &item : parameters) { | |||
| LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX); | |||
| LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX, root_graph_id); | |||
| } | |||
| // load value nodes | |||
| // get all constant values from the graph | |||
| MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << "."; | |||
| const auto value_nodes = graph_ptr_->graph_value_nodes(); | |||
| const auto value_nodes = graph->graph_value_nodes(); | |||
| for (auto &item : value_nodes) { | |||
| LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX); | |||
| LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id); | |||
| } | |||
| } | |||
| @@ -1410,6 +1463,7 @@ void Debugger::LoadGraphOutputs() { | |||
| if (!(debugger_enabled() && device_target_ == kAscendDevice)) return; | |||
| MS_EXCEPTION_IF_NULL(graph_ptr_); | |||
| const auto &apply_kernels = graph_ptr_->execution_order(); | |||
| auto root_graph_id = graph_ptr_->root_graph_id(); | |||
| // for kernels, execution order starts from 1 | |||
| int exec_order = 1; | |||
| for (const auto &node : apply_kernels) { | |||
| @@ -1435,7 +1489,7 @@ void Debugger::LoadGraphOutputs() { | |||
| auto format = kOpFormat_DEFAULT; | |||
| string tensor_name = kernel_name + ':' + std::to_string(j); | |||
| ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j); | |||
| auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false); | |||
| auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "LoadMemToHost:" | |||
| << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; | |||
| @@ -1463,6 +1517,7 @@ void Debugger::UpdateStepNumGPU() { | |||
| // access lock for public method | |||
| std::lock_guard<std::mutex> a_lock(access_lock_); | |||
| ++num_step_; | |||
| MS_LOG(DEBUG) << "Update step for GPU, current step: " << num_step_; | |||
| } | |||
| } | |||
| @@ -80,6 +80,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| // do nothing if graph is set already | |||
| void PreExecute(const KernelGraphPtr &graph_ptr); | |||
| void SetCurrentAndPrevRootGraph(uint32_t root_graph_id); | |||
| void StoreRunGraphIdList(uint32_t graph_id); | |||
| // analyze tensors and wait for command | |||
| // don't need a graph_ptr because it is saved during pre_execute | |||
| void PostExecute(); | |||
| @@ -131,6 +135,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| // version_check should be true if you want the function to do backend compatibility check with Mindinsight | |||
| bool SendMetadata(bool version_check); | |||
| bool CheckSendMetadata(); | |||
| void LoadParametersAndConst(); | |||
| void LoadParametersAndConst(const KernelGraphPtr &graph); | |||
| @@ -149,6 +155,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| uint32_t GetFirstRunGraphId() const; | |||
| uint32_t GetCurrentRootGraphId() const { return cur_root_graph_id_; } | |||
| uint32_t GetPrevRootGraphId() const { return prev_root_graph_id_; } | |||
| void SetGraphPtr(const KernelGraphPtr &graph_ptr) { graph_ptr_ = graph_ptr; } | |||
| const KernelGraphPtr GetGraphPtr() const { return graph_ptr_; } | |||
| @@ -246,7 +256,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| // Check if the IP is valid | |||
| bool CheckIp(const std::string &host) const; | |||
| void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index); | |||
| void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id); | |||
| // class members | |||
| @@ -263,9 +273,13 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| std::string node_name_; | |||
| std::string cur_name_; | |||
| bool training_done_; | |||
| bool send_metadata_done_; | |||
| bool received_new_graph_; | |||
| bool is_dataset_graph_; | |||
| bool partial_memory_; | |||
| std::mutex access_lock_; | |||
| uint32_t cur_root_graph_id_ = UINT32_MAX; | |||
| uint32_t prev_root_graph_id_ = UINT32_MAX; | |||
| // flag to keep track of the very first suspension of debugger | |||
| bool initial_suspend_; | |||
| bool enable_heartbeat_; | |||
| @@ -52,7 +52,8 @@ std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t & | |||
| return real_outputs; | |||
| } | |||
| void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) { | |||
| void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_, | |||
| uint32_t root_graph_id) { | |||
| // get inputs | |||
| auto kernel_inputs = launch_info_->inputs_; | |||
| auto input_size = AnfAlgo::GetInputTensorNum(cnode); | |||
| @@ -70,7 +71,8 @@ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uin | |||
| auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type); | |||
| string input_tensor_name = input_kernel_name + ':' + "0"; | |||
| ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX); | |||
| auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order_, format, int_shapes, type, 0, true); | |||
| auto ret = | |||
| gpu_addr->LoadMemToHost(input_tensor_name, exec_order_, format, int_shapes, type, 0, true, root_graph_id); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "LoadMemToHost:" | |||
| << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!"; | |||
| @@ -79,7 +81,8 @@ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uin | |||
| } | |||
| } | |||
| void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) { | |||
| void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_, | |||
| uint32_t root_graph_id) { | |||
| // get outputs | |||
| auto kernel_outputs = launch_info_->outputs_; | |||
| auto output_size = AnfAlgo::GetOutputTensorNum(cnode); | |||
| @@ -99,7 +102,7 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, ui | |||
| auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type); | |||
| string tensor_name = kernel_name + ':' + std::to_string(j); | |||
| ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j); | |||
| auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order_, format, int_shapes, type, j, false); | |||
| auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order_, format, int_shapes, type, j, false, root_graph_id); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "LoadMemToHost:" | |||
| << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; | |||
| @@ -136,15 +139,17 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_ | |||
| } | |||
| auto &dump_json_parser = DumpJsonParser::GetInstance(); | |||
| bool dump_enabled = debugger->DumpDataEnabledIteration(); | |||
| auto kernel_graph = std::dynamic_pointer_cast<KernelGraph>(cnode->func_graph()); | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| auto root_graph_id = kernel_graph->root_graph_id(); | |||
| if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) { | |||
| LoadInputs(cnode, launch_info_, exec_order_); | |||
| LoadInputs(cnode, launch_info_, exec_order_, root_graph_id); | |||
| } | |||
| if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) { | |||
| LoadOutputs(cnode, launch_info_, exec_order_); | |||
| LoadOutputs(cnode, launch_info_, exec_order_, root_graph_id); | |||
| } | |||
| // Dump kernel | |||
| if (dump_enabled) { | |||
| auto kernel_graph = std::dynamic_pointer_cast<KernelGraph>(cnode->func_graph()); | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| auto graph_id = kernel_graph->graph_id(); | |||
| debugger->DumpSingleNode(cnode, graph_id); | |||
| @@ -26,9 +26,11 @@ namespace mindspore { | |||
| std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size); | |||
| void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_); | |||
| void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_, | |||
| uint32_t root_graph_id); | |||
| void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_); | |||
| void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_, | |||
| uint32_t root_graph_id); | |||
| bool CheckReadData(const CNodePtr &cnode); | |||
| @@ -589,8 +589,8 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std:: | |||
| #ifdef ENABLE_DEBUGGER | |||
| bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &, | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, | |||
| bool keep_prev) const { | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, | |||
| uint32_t root_graph_id) const { | |||
| bool ret = false; | |||
| auto debugger = Debugger::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(debugger); | |||
| @@ -619,6 +619,7 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec | |||
| tensor_data->SetByteSize(LongToSize(out_tensor->data().nbytes())); | |||
| tensor_data->SetType((unsigned int)host_type); | |||
| tensor_data->SetShape(out_tensor->shape()); | |||
| tensor_data->SetRootGraphId(root_graph_id); | |||
| ret = debugger->LoadNewTensor(tensor_data, keep_prev); | |||
| return ret; | |||
| } | |||
| @@ -62,7 +62,8 @@ class AscendDeviceAddress : public DeviceAddress { | |||
| #endif | |||
| #ifdef ENABLE_DEBUGGER | |||
| bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override; | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, | |||
| uint32_t root_graph_id = 0) const override; | |||
| #endif | |||
| private: | |||
| @@ -118,7 +118,8 @@ class DeviceAddress : public mindspore::DeviceSync { | |||
| } | |||
| #ifdef ENABLE_DEBUGGER | |||
| virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const { | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, | |||
| uint32_t root_graph_id = 0) const { | |||
| return true; | |||
| } | |||
| #endif | |||
| @@ -141,8 +141,8 @@ GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); } | |||
| #ifdef ENABLE_DEBUGGER | |||
| bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, | |||
| bool keep_prev) const { | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, | |||
| uint32_t root_graph_id) const { | |||
| bool ret = false; | |||
| if (size_ == 0) { | |||
| return true; | |||
| @@ -171,6 +171,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi | |||
| tensor_data->SetByteSize(out_tensor->data().nbytes()); | |||
| tensor_data->SetType((unsigned int)host_type); | |||
| tensor_data->SetShape(out_tensor->shape()); | |||
| tensor_data->SetRootGraphId(root_graph_id); | |||
| ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev); | |||
| MS_LOG(INFO) << "E2E tensor name is " << tensor_name; | |||
| return ret; | |||
| @@ -54,7 +54,8 @@ class GPUDeviceAddress : public DeviceAddress { | |||
| #ifdef ENABLE_DEBUGGER | |||
| bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt, | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override; | |||
| const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev, | |||
| uint32_t root_graph_id = 0) const override; | |||
| #endif | |||
| private: | |||
| @@ -114,10 +114,10 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const | |||
| #ifdef ENABLE_DEBUGGER | |||
| auto debugger = Debugger::GetInstance(); | |||
| if (debugger != nullptr) { | |||
| debugger->Debugger::UpdateStepNumGPU(); | |||
| // Reset exec_order for the next step | |||
| exec_order_ = 0; | |||
| debugger->Debugger::PostExecuteGraphDebugger(); | |||
| debugger->Debugger::UpdateStepNumGPU(); | |||
| } | |||
| #else | |||
| #ifndef ENABLE_SECURITY | |||
| @@ -324,6 +324,7 @@ GraphId GraphCompiler::CompileGraph(const AnfNodePtrList &nodes, const AnfNodePt | |||
| auto backend_node = graph->output(); | |||
| MS_EXCEPTION_IF_NULL(backend_node); | |||
| graph->CacheGraphOutputToFrontNodeWithIndex({backend_node}, outputs); | |||
| graph->set_root_graph_id(graph_id); | |||
| return graph_id; | |||
| } | |||