Debugger multi-graph support implementation

Other Contributors: Adel Shafiei, John Tzanakakis
5 years ago · 1b6265fa43
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -160,6 +160,11 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {

  HardwareOptimize(NOT_NULL(root_graph), NOT_NULL(&memo));
  memo.clear();
  // load graphs to debugger.
  if (debugger_) {
    LoadGraphsToDbg(NOT_NULL(root_graph), NOT_NULL(&memo));
  }
  memo.clear();

  UpdateRefOutputMap(NOT_NULL(root_graph), NOT_NULL(&memo));
  memo.clear();
@@ -191,7 +196,7 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
  // build kernel
  BuildKernel(root_graph);
  if (debugger_ && debugger_->partial_memory()) {
    debugger_->PreExecute(root_graph);
    debugger_->PreExecute(root_graph, graph_sum_);
  }
  SetSummaryNodes(root_graph.get());
  // Alloc memory for child graph's inputs
@@ -271,7 +276,7 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (debugger_ && debugger_->partial_memory()) {
    debugger_->PreExecute(graph);
    debugger_->PreExecute(graph, graph_sum_);
  }
  if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
    MS_LOG(INFO) << "Precompile only, stop in build kernel step";
@@ -329,7 +334,7 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens
  // load input data from user input
  LoadInputData(kernel_graph, inputs);
  if (debugger_) {
    debugger_->PreExecute(kernel_graph);
    debugger_->PreExecute(kernel_graph, graph_sum_);
  }
 #if ENABLE_CPU && ENABLE_D
  // Initialize parameter server
@@ -962,6 +967,23 @@ void AscendSession::HardwareOptimize(NotNull<KernelGraphPtr> graph,
  MS_LOG(INFO) << "Finish doing HardwareOptimize in graph: " << graph->graph_id();
 }

 // Hands `graph` to the debugger via LoadGraphs() and then recurses into every entry of
 // graph->child_graph_order(). `memo` records the graphs already handled in this traversal, so a
 // graph reachable through several parents is loaded only once.
 // debugger_ is dereferenced without a null check here.
 void AscendSession::LoadGraphsToDbg(NotNull<KernelGraphPtr> graph,
                                     NotNull<std::set<KernelGraphPtr> *> const memo) const {
  // insert() reports through .second whether the graph was newly added; skip graphs seen before
  const bool first_visit = memo->insert(graph.get()).second;
  if (!first_visit) {
    return;
  }

  MS_LOG(INFO) << "Start to do LoadGraphsToDbg in graph: " << graph->graph_id();

  debugger_->LoadGraphs(graph);
  MS_LOG(INFO) << "graph_sum_: " << graph_sum_;
  // each child is lock()ed to obtain an owning pointer before descending into it
  for (auto &child_entry : graph->child_graph_order()) {
    LoadGraphsToDbg(NOT_NULL(child_entry.lock()), memo);
  }
  MS_LOG(INFO) << "Finish doing LoadGraphsToDbg in graph: " << graph->graph_id();
 }

 void AscendSession::AssignStaticMemory(NotNull<KernelGraphPtr> graph,
                                       NotNull<std::set<KernelGraphPtr> *> const memo) const {
  if (memo->find(graph) != memo->end()) {
--- a/mindspore/ccsrc/backend/session/ascend_session.h
+++ b/mindspore/ccsrc/backend/session/ascend_session.h
@@ -125,6 +125,7 @@ class AscendSession : public SessionBasic {
                               size_t *const raise_precision_count, size_t *const reduce_precision_count) const;
  void IrFusionPass(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo);
  void HardwareOptimize(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
  void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
  void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
  void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;

--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -333,12 +333,21 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
  }
  // Alloc memory, including static memory and dynamic memory
  AllocateMemory(graph.get());

 #ifdef ENABLE_DEBUGGER
  if (debugger_) {
    debugger_->LoadGraphs(graph);
  }
 #endif
  MS_LOG(INFO) << "CompileGraph graph_id: " << graph_id;

  return graph_id;
 }

 void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
                              VectorRef *outputs) {
  auto &kernel_graph = graphs_[graph_id];
  MS_LOG(INFO) << "RunGraph graph_id: " << graph_id;
  // Load input data from user input
  LoadInputData(kernel_graph, inputs);
  PreIterationDbg(kernel_graph);
@@ -414,7 +423,7 @@ bool GPUSession::DumpDataEnabledIteration() const {

 // Debugger-side preparation before a graph runs: when a debugger is attached, call its
 // PreExecute() hook for `kernel_graph`, then always call PreLoadTensor() for the same graph.
 void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  if (debugger_) {
    // NOTE(review): PreExecute is called twice as written; the one-argument call looks like the
    // pre-change line of this hunk -- confirm only the graph_sum_ variant is intended.
    debugger_->PreExecute(kernel_graph);
    // graph_sum_ is forwarded as PreExecute's graph_sum argument
    debugger_->PreExecute(kernel_graph, graph_sum_);
  }
  PreLoadTensor(kernel_graph);
 }
--- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto
+++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
@@ -26,6 +26,7 @@ service EventListener {
  rpc SendGraph (stream Chunk) returns (EventReply) {};
  rpc SendTensors (stream TensorProto) returns (EventReply) {};
  rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {};
  rpc SendMultiGraphs (stream Chunk) returns (EventReply) {};
 }

 message Metadata {
@@ -36,11 +37,14 @@ message Metadata {
  // the full name of current node
  string cur_node = 4;
  // check if training is done.
  bool training_done = 5;
  bool training_done = 5; 
  // the number of total graphs
  int32 graph_num = 6;
 }

 // One piece of a streamed payload (the SendGraph and SendMultiGraphs RPCs take a stream of Chunk).
 // NOTE(review): `buffer = 1` is declared twice below, differing only in indentation; this looks
 // like an old/new line pair from the diff -- confirm a single declaration is intended.
 message Chunk {
  bytes buffer = 1;
    // payload bytes; Debugger::SendMultiGraphsAndSuspend stores a serialized graph, or one slice of it, here
    bytes buffer = 1;
    // Debugger::SendMultiGraphsAndSuspend sets this to true on the chunk that completes a graph
    bool finished = 2;
 }

 message EventReply {
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -34,6 +34,7 @@
 #include "debug/data_dump/e2e_dump_util.h"
 #include "utils/config_manager.h"

 using debugger::Chunk;
 using debugger::EventReply;
 using debugger::GraphProto;
 using debugger::ModelProto;
@@ -69,7 +70,8 @@ Debugger::Debugger()
      partial_memory_(false),
      last_overflow_bin_(0),
      overflow_bin_path_(""),
      initial_suspend_(true) {
      initial_suspend_(true),
      not_dataset_graph_sum_(0) {
  if (CheckDebuggerEnabled()) {
    // configure partial memory reuse
    partial_memory_ = CheckDebuggerPartialMemoryEnabled();
@@ -259,12 +261,47 @@ void Debugger::Reset() {
  stream_task_to_opname_.clear();
 }

 // Hook called before a graph is executed.
 //   graph_ptr: the graph about to run; it is dereferenced without a null check.
 //   graph_sum: total number of graphs reported by the caller.
 // Steps, all performed while holding access_lock_:
 //   1. CheckDatasetSinkMode().
 //   2. Remember the graph id in rungraph_id_list_ (each id stored once, in order of first appearance).
 //   3. graph_sum > 1: if at least one non-dataset graph was loaded, enable the debugger if needed,
 //      then either send all pending graph protos and suspend, or enter the command loop when this
 //      graph is the first recorded run graph.
 //   4. otherwise, if exactly one graph proto is pending: handle it through CheckGraphPtr().
 // NOTE(review): the first signature line and the `if (debugger_->DebuggerBackendEnabled()) {` line
 // below (whose brace is never closed) look like pre-change lines of this hunk -- confirm.
 void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
 void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  CheckDatasetSinkMode();
  if (debugger_->DebuggerBackendEnabled()) {
    // check and save graph_ptr, suspend if graph is new
  auto graph_id = graph_ptr->graph_id();
  // collect run graph ids; UpdateStepNum() uses the first one to update the step number in the multigraph case
  if (!rungraph_id_list_.size()) {
    rungraph_id_list_.push_back(graph_id);

  } else {
    // append the id only if it has not been recorded yet
    if (std::find(rungraph_id_list_.begin(), rungraph_id_list_.end(), graph_id) == rungraph_id_list_.end()) {
      rungraph_id_list_.push_back(graph_id);
    }
  }
  // check and save graph_ptr, suspend if graph is new
  MS_LOG(INFO) << "total number graph: " << graph_sum;
  // multiple graphs
  if (graph_sum > 1) {
    // at least one of the loaded graphs is not a dataset graph
    if (not_dataset_graph_sum_ > 0) {
      // only try to enable debugger if they are not all dataset graphs
      if (!debugger_enabled_) {
        EnableDebugger();
      }

      if (debugger_enabled_) {
        if (graph_proto_list_.size()) {
          // only send compiled graphs once: the list is cleared right after sending,
          // so later calls take the else-branch below
          SendMultiGraphsAndSuspend(graph_proto_list_, graph_sum);
          graph_proto_list_.clear();
        } else if (graph_id == rungraph_id_list_.front()) {
          // stop only when the first recorded run graph is received, i.e. once per step
          CommandLoop();
        }
      }
    }
  } else if (graph_proto_list_.size() == 1) {
    // In single graph case, reset graph_ptr_ to be nullptr for the initial step
    // so that CheckGraphPtr() below sees graph_ptr as a new graph
    if (num_step_ == 0) {
      graph_ptr_ = nullptr;
    }
    CheckGraphPtr(graph_ptr);
  }
 }
@@ -346,20 +383,38 @@ void Debugger::SetStreamTaskToOpnameMap(const std::map<std::pair<uint32_t, uint3
  stream_task_to_opname_ = mapping;
 }

 void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
 // Registers a graph with the debugger.
 // Nothing happens when graph_ptr is the graph already stored in graph_ptr_. Otherwise the graph
 // becomes the current one and, unless CheckDatasetGraph() flags it as a dataset graph, its proto
 // is appended to graph_proto_list_ and not_dataset_graph_sum_ is incremented.
 // graph_ptr is dereferenced without a null check; access_lock_ is not taken in this function.
 void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
  if (graph_ptr_ != graph_ptr) {
    // NOTE(review): two near-identical log lines follow; the first looks like the pre-change line
    // of this hunk -- confirm only one is intended.
    MS_LOG(INFO) << "Debugger got new graph: " << graph_ptr->graph_id();
    MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id();
    // save new graph_ptr
    graph_ptr_ = graph_ptr;
    // check if it is dataset graph (result is read from is_dataset_graph_ below)
    CheckDatasetGraph();
    if (!is_dataset_graph_) {
      // get proto for new graph_ptr
      auto graph_proto = GetGraphProto(graph_ptr);
      // add new graph proto to graph_proto_list_
      graph_proto_list_.push_back(graph_proto);
      // count of non-dataset graphs; reported as graph_num in SendMetadata()
      not_dataset_graph_sum_++;
    }
    // reset is_dataset_graph to be false
    is_dataset_graph_ = false;
  }
 }

 // In single graph cases, check single graph ptr
 void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
  if (graph_ptr_ != graph_ptr) {
    MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id();
    // save new graph_ptr
    graph_ptr_ = graph_ptr;
    if (!is_dataset_graph_) {
      // only try to enable debugger if it is not a dataset graph
      EnableDebugger();
      if (debugger_enabled_) {
        LoadParametersAndConst();
        // get graph proto and send to mindinsight
        SendGraphAndSuspend(GetGraphProto());
        auto graph_proto = graph_proto_list_.front();
        SendGraphAndSuspend(graph_proto);
      }
    }
  }
@@ -386,7 +441,7 @@ void Debugger::CheckDatasetGraph() {
  is_dataset_graph_ = false;
 }

 GraphProto Debugger::GetGraphProto() const {
 GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  // convert kernel graph to debugger modelproto
  ModelProto model = GetDebuggerFuncGraphProto(graph_ptr_);
  return model.graph();
@@ -413,12 +468,49 @@ void Debugger::SendMetadata() {
  metadata.set_cur_node(cur_name_);
  metadata.set_training_done(training_done_);
  MS_LOG(INFO) << "Is training done?" << training_done_;
  // set graph number to not_dataset_graph_sum_
  metadata.set_graph_num(not_dataset_graph_sum_);
  EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
  if (reply_metadata.status() != reply_metadata.OK) {
    MS_LOG(ERROR) << "Error: SendMetadata failed";
  }
 }

 void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list, uint32_t graph_sum) {
  SendMetadata();
  // send multiple graphs to mindinght server
  // split graph into chunks if one graph is larger than chunk size
  std::list<Chunk> chunked_graph_proto_list;
  Chunk chunk;
  for (auto graph : graph_proto_list) {
    std::string str = graph.SerializeAsString();
    auto graph_size = graph.ByteSize();
    if (graph_size > CHUNK_SIZE) {
      auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);
      for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
        chunk.set_buffer(sub_graph_str[i]);
        chunked_graph_proto_list.push_back(chunk);
        if (i < sub_graph_str.size() - 1) {
          chunk.set_finished(false);
        } else {
          chunk.set_finished(true);
          chunked_graph_proto_list.push_back(chunk);
        }
      }
    } else {
      chunk.set_buffer(str);
      chunk.set_finished(true);
      chunked_graph_proto_list.push_back(chunk);
    }
  }
  EventReply reply = grpc_client_->SendMultiGraphs(chunked_graph_proto_list);
  if (reply.status() != reply.OK) {
    MS_LOG(ERROR) << "Error: SendGraph failed";
  }
  // enter command loop, wait and process commands
  CommandLoop();
 }

 void Debugger::CommandLoop() {
  // prepare metadata
  std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
@@ -923,6 +1015,8 @@ bool Debugger::CheckPort(const char *port) {
  return true;
 }

 // Returns the id stored at the head of rungraph_id_list_ (PreExecute() is what appends ids to it).
 // Precondition: the list is non-empty -- front() on an empty std::list is undefined behaviour.
 uint32_t Debugger::GetFirstRunGraphId() {
  const uint32_t first_run_graph_id = rungraph_id_list_.front();
  return first_run_graph_id;
 }

 void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index) {
  MS_EXCEPTION_IF_NULL(anf_node);
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
@@ -996,6 +1090,13 @@ void Debugger::LoadGraphOutputs() {
      }
    }
    for (size_t j = 0; j < output_size; ++j) {
      auto kernel_info = static_cast<device::KernelInfo *>(node->kernel_info());
      MS_EXCEPTION_IF_NULL(kernel_info);
      auto addr_test = kernel_info->GetOutputAddr(j);
      if (addr_test == nullptr) {
        MS_LOG(INFO) << "Cannot find output addr for slot " << j << " for " << kernel_name;
        continue;
      }
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      MS_EXCEPTION_IF_NULL(addr);
      auto type = AnfAlgo::GetOutputInferDataType(node, j);
@@ -1015,9 +1116,14 @@ void Debugger::LoadGraphOutputs() {
  }
 }

 void Debugger::UpdateStepNum() {
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
 // Increments num_step_ when all of the following hold, checked in this order:
 //   - the device target is GPU,
 //   - the debugger is enabled or DumpDataEnabledIteration() reports true,
 //   - `graph` is the first recorded run graph (so a multi-graph step is counted once).
 // `graph` is dereferenced without a null check.
 void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
  if (device_target_ != kGPUDevice) {
    return;
  }
  if (!debugger_enabled_ && !device::KernelRuntime::DumpDataEnabledIteration()) {
    return;
  }
  // update step number only if we are processing the first graph (to support multigraph)
  if (graph->graph_id() != debugger_->GetFirstRunGraphId()) {
    return;
  }
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  ++num_step_;
 }

 void Debugger::ClearCurrentData() {
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -68,7 +68,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // enable debugger
  // send graph and wait for command
  // do nothing if graph is set already
  void PreExecute(const KernelGraphPtr &graph_ptr);
  void PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum = 1);

  // analyze tensors and wait for command
  // don't need a graph_ptr because it is saved during pre_execute
@@ -106,7 +106,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

  void LoadParametersAndConst();

  void UpdateStepNum();
  void UpdateStepNum(const session::KernelGraph *graph);

  void ClearCurrentData();

@@ -114,6 +114,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

  void CheckDatasetSinkMode();

  void LoadGraphs(const KernelGraphPtr &graph_ptr);

  uint32_t GetFirstRunGraphId();

 private:
  // private constructor for singleton
  Debugger();
@@ -138,11 +142,13 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void CheckDatasetGraph();

  // serialize graph and get proto
  GraphProto GetGraphProto() const;
  GraphProto GetGraphProto(const KernelGraphPtr &graph_ptr) const;

  // send graph and enter command wait loop
  void SendGraphAndSuspend(const GraphProto &graph_proto);

  void SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list, uint32_t graph_sum);

  // wait for command and process command
  // send command request and process reply in a loop
  // break if RunCMD
@@ -197,9 +203,13 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  std::string overflow_bin_path_;
  // flag to keep track of the very first suspension of debugger
  bool initial_suspend_;
  std::list<GraphProto> graph_proto_list_;

  // singleton
  static std::mutex instance_lock_;
  static std::shared_ptr<Debugger> debugger_;
  uint32_t not_dataset_graph_sum_;
  std::list<uint32_t> rungraph_id_list_;
 };

 using DebuggerPtr = std::shared_ptr<Debugger>;
--- a/mindspore/ccsrc/debug/debugger/grpc_client.cc
+++ b/mindspore/ccsrc/debug/debugger/grpc_client.cc
@@ -69,7 +69,7 @@ EventReply GrpcClient::SendMetadata(const Metadata &metadata) {
  return reply;
 }

 std::vector<std::string> ChunkString(std::string str, int graph_size) {
 std::vector<std::string> GrpcClient::ChunkString(std::string str, int graph_size) {
  std::vector<std::string> buf;
  int size_iter = 0;
  while (size_iter < graph_size) {
@@ -118,6 +118,28 @@ EventReply GrpcClient::SendGraph(const GraphProto &graph) {
  return reply;
 }

 // Streams `chunks`, in list order, to the server through the SendMultiGraphs RPC and returns the
 // server's reply. If the RPC does not finish with an OK status, the error code and message are
 // logged and the returned reply has its status set to FAILED.
 EventReply GrpcClient::SendMultiGraphs(const std::list<Chunk> &chunks) {
  EventReply reply;
  grpc::ClientContext context;

  std::unique_ptr<grpc::ClientWriter<Chunk>> writer(stub_->SendMultiGraphs(&context, &reply));
  for (const Chunk &piece : chunks) {
    const bool written = writer->Write(piece);
    if (!written) {
      // stop sending the remaining chunks once a write is rejected
      break;
    }
    // pause for 1 ms after each successful write
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
  }
  writer->WritesDone();

  // Finish() yields the final status of the whole call
  const grpc::Status status = writer->Finish();
  if (status.ok()) {
    return reply;
  }

  MS_LOG(ERROR) << "RPC failed: SendMultigraphs";
  MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
  reply.set_status(EventReply_Status_FAILED);
  return reply;
 }

 EventReply GrpcClient::SendTensors(const std::list<TensorProto> &tensors) {
  EventReply reply;
  grpc::ClientContext context;
--- a/mindspore/ccsrc/debug/debugger/grpc_client.h
+++ b/mindspore/ccsrc/debug/debugger/grpc_client.h
@@ -19,9 +19,11 @@
 #include <grpcpp/grpcpp.h>
 #include <string>
 #include <list>
 #include <vector>
 #include <memory>
 #include "proto/debug_grpc.grpc.pb.h"

 using debugger::Chunk;
 using debugger::EventListener;
 using debugger::EventReply;
 using debugger::GraphProto;
@@ -52,8 +54,12 @@ class GrpcClient {

  EventReply SendTensors(const std::list<TensorProto> &tensors);

  EventReply SendMultiGraphs(const std::list<Chunk> &chunks);

  EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints);

  std::vector<std::string> ChunkString(std::string str, int graph_size);

 private:
  std::unique_ptr<EventListener::Stub> stub_;
 };
--- a/mindspore/ccsrc/debug/debugger/proto_exporter.cc
+++ b/mindspore/ccsrc/debug/debugger/proto_exporter.cc
@@ -354,6 +354,8 @@ void DebuggerProtoExporter::ExportFuncGraph(const FuncGraphPtr &func_graph, debu
  // set graph name
  graph_proto->set_name(func_graph->ToString());

  MS_LOG(INFO) << "graph names: " << func_graph->ToString();

  ExportParameters(func_graph, graph_proto);

  ExportCNodes(func_graph, graph_proto, &const_map);
@@ -433,6 +435,7 @@ void DebuggerProtoExporter::ExportCNode(const FuncGraphPtr &func_graph, const CN

    // add full_name for debugger
    node_proto->set_full_name(node->fullname_with_scope());
    MS_LOG(INFO) << "full_name: " << node->fullname_with_scope();

    // process OP inputs
    for (size_t i = 1; i < inputs.size(); ++i) {
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -577,8 +577,8 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
  AllocInplaceNodeMemory(graph);

  bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
  if (!mock) {
    debugger_->UpdateStepNum();
  if (!mock && debugger_) {
    debugger_->UpdateStepNum(graph);
  }
  auto &kernels = graph->execution_order();
  int exec_order = 1;