Browse Source

!16321 Fix online debugger issue for multi-graph GPU where nodes were skipped when the user clicks "next node"

From: @parastooashtari
Reviewed-by: @john_tzanakakis,@robingrosman
Signed-off-by: @john_tzanakakis
tags/v1.3.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
ef29ce7481
4 changed files with 26 additions and 14 deletions
  1. +1
    -1
      mindspore/ccsrc/backend/session/gpu_session.cc
  2. +2
    -2
      mindspore/ccsrc/debug/debug_services.cc
  3. +21
    -10
      mindspore/ccsrc/debug/debugger/debugger.cc
  4. +2
    -1
      mindspore/ccsrc/debug/debugger/debugger.h

+ 1
- 1
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -452,7 +452,7 @@ void GPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_gra
DumpJsonParser::GetInstance().UpdateDumpIter();
}
if (debugger_) {
debugger_->PostExecute(kernel_graph);
debugger_->PostExecute();
}
}



+ 2
- 2
mindspore/ccsrc/debug/debug_services.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -258,7 +258,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::
AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
if (is_hit || error_code) {
std::vector<int>::iterator iter;
// if the execution order is repeated,inserts the new one before the others with same execution order.
// if the execution order is repeated, inserts the new one before the others with same execution order.
iter = std::lower_bound(exec_order.begin(), exec_order.end(), tensor->GetExecutionOrder());
int position = iter - exec_order.begin();
exec_order.insert(iter, tensor->GetExecutionOrder());


+ 21
- 10
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -329,6 +329,13 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
graph_ptr_ = dbg_graph_ptr;
SendMultiGraphsAndSuspend(graph_proto_list_);
graph_proto_list_.clear();
} else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
// stop only when receive the first sub run graph for each step
// if we have stopped for the last kernel before, no need to stop again
if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
CommandLoop();
}
debug_services_->ResetLoadedTensors();
}
}
}
@@ -339,9 +346,11 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
}
CheckGraphPtr(graph_ptr);
}
// resets for the new graph
suspended_at_last_kernel_ = 0;
}

void Debugger::PostExecute(const KernelGraphPtr &graph_ptr) {
void Debugger::PostExecute() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
if (pipeline::ExecutorPy::GetDebugTerminate()) {
@@ -354,17 +363,16 @@ void Debugger::PostExecute(const KernelGraphPtr &graph_ptr) {
num_step_++;
}
SendWatchpoints(CheckWatchpoints());
if (graph_ptr != nullptr && device_target_ == kGPUDevice) {
auto graph_id = graph_ptr->graph_id();
if (graph_id == rungraph_id_list_.front()) {
CommandLoop();
}
} else {
// no need to suspend at each graph for GPU, suspension happens in preExecute
if (device_target_ != kGPUDevice) {
CommandLoop();
}
}
// Only keep parameters in the current map
debug_services_->ResetLoadedTensors();
// GPU ResetLoadedTensors happens in preExecute
if (device_target_ != kGPUDevice) {
debug_services_->ResetLoadedTensors();
}
}
}

@@ -398,9 +406,12 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
hit_empty_flag = false;
}
}
if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_) && !last_kernel) {
if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
// if kernel is not watchpoint and is next_to or continue_to node, suspend
// No need to suspend if this is the last node in graph since PostExecute suspends at the end of graph
// sets a bool to be checked in preExecute to avoid double stopping at last kernel in the last graph
if (last_kernel) {
suspended_at_last_kernel_ = 1;
}
CommandLoop();
}
return;


+ 2
- 1
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -80,7 +80,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

// analyze tensors and wait for command
// don't need a graph_ptr because it is saved during pre_execute
void PostExecute(const KernelGraphPtr &graph_ptr = nullptr);
void PostExecute();

bool ReadNodeDataRequired(const CNodePtr &kernel) const;

@@ -235,6 +235,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::string device_target_;
int32_t num_step_;
bool debugger_enabled_;
bool suspended_at_last_kernel_;
std::string run_level_;
std::string node_name_;
std::string cur_name_;


Loading…
Cancel
Save