@@ -35,6 +35,8 @@ message Metadata {
   string backend = 3;
   // the full name of current node
   string cur_node = 4;
+  // true when training is done.
+  bool training_done = 5;
 }

 message Chunk {
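The new `training_done` field is how the backend tells the MindInsight front end that training has finished. Below is a minimal sketch of a consumer reading it through the protoc-generated C++ bindings; the header path, the `debugger` namespace, and the `HandleMetadata` function are assumptions for illustration, not part of the patch:

```cpp
#include <iostream>

#include "debug_grpc.pb.h"  // assumed name of the protoc-generated header

// Hypothetical handler on the receiving side.
void HandleMetadata(const debugger::Metadata &metadata) {
  // proto3 bools default to false, so a peer built before this change
  // simply reports "not done" instead of breaking wire compatibility.
  if (metadata.training_done()) {
    std::cout << "training finished after step " << metadata.cur_step() << "\n";
  }
}
```

Because the field takes a fresh tag number (5), old and new peers interoperate: an older MindInsight build just ignores the unknown field.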
@@ -57,6 +57,7 @@ Debugger::Debugger()
       run_level_(""),
       node_name_(""),
       cur_name_(""),
+      training_done_(false),
       is_dataset_graph_(false),
       partial_memory_(false),
       last_overflow_bin_(0),
@@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const {
 }

 void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
+  SendMetadata();
+  // send graph to the MindInsight server
+  EventReply reply = grpc_client_->SendGraph(graph_proto);
+  if (reply.status() != reply.OK) {
+    MS_LOG(ERROR) << "Error: SendGraph failed";
+  }
+  // enter command loop, wait and process commands
+  CommandLoop();
+}
+
+void Debugger::SendMetadata() {
   // prepare metadata
   std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
   Metadata metadata;
@@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);
+  MS_LOG(INFO) << "Is training done? " << training_done_;
   EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
   if (reply_metadata.status() != reply_metadata.OK) {
     MS_LOG(ERROR) << "Error: SendMetadata failed";
   }
-  // send graph to mindinght server
-  EventReply reply = grpc_client_->SendGraph(graph_proto);
-  if (reply.status() != reply.OK) {
-    MS_LOG(ERROR) << "Error: SendGraph failed";
-  }
-  // enter command loop, wait and process commands
-  CommandLoop();
 }

 void Debugger::CommandLoop() {
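Splitting `SendMetadata` out of `SendGraphAndSuspend` is what lets the kernel runtimes announce end-of-training without shipping a graph or blocking in `CommandLoop`. A standalone mock of the resulting control flow, with simplified stand-in types (none of this is the real API):

```cpp
#include <iostream>

// Standalone mock of the new control flow, not MindSpore code: the point of
// the split is that the metadata push becomes reusable on its own.
struct MockDebugger {
  bool training_done_ = false;

  void SendMetadata() {  // now callable on its own
    std::cout << "metadata: training_done=" << training_done_ << "\n";
  }
  void SendGraphAndSuspend() {
    SendMetadata();  // metadata still precedes the graph, as before
    std::cout << "graph sent, entering command loop\n";
  }
  void ReleaseDeviceRes() {  // runtime finalization path (new)
    training_done_ = true;
    SendMetadata();  // no graph, no command loop: finalization is not blocked
  }
};

int main() {
  MockDebugger d;
  d.SendGraphAndSuspend();  // per-step suspension path
  d.ReleaseDeviceRes();     // end-of-training notification path
  return 0;
}
```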
@@ -365,6 +372,7 @@ void Debugger::CommandLoop() {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);

   // loop exit flag
   bool run = false;
@@ -787,4 +795,6 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
   return op_names;
 }
+
+void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }

 }  // namespace mindspore
@@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // check if any feature that uses the debugger backend is enabled
   bool DebuggerBackendEnabled();

+  void SetTrainingDone(bool training_done);
+
+  void SendMetadata();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   std::string run_level_;
   std::string node_name_;
   std::string cur_name_;
+  bool training_done_;
   bool is_dataset_graph_;
   bool partial_memory_;
   std::mutex access_lock_;
@@ -170,6 +170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() {
 void AscendKernelRuntime::ReleaseDeviceRes() {
   MS_LOG(INFO) << "Ascend finalize start";
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (!initialized_) {
     return;
   }
@@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   const auto &apply_kernels = graph->execution_order();
   // for kernels, execution order starts from 1
   int exec_order = 1;
-  auto debugger_ = mindspore::Debugger::GetInstance();
-  DebugServices *debug_services = debugger_->debug_services();
+  auto debugger_i = mindspore::Debugger::GetInstance();
+  DebugServices *debug_services = debugger_i->debug_services();
   auto watchpoint_table = debug_services->GetWatchpointTable();
   for (const auto &node : apply_kernels) {
     MS_EXCEPTION_IF_NULL(node);
     auto node_name = AnfAlgo::GetCNodeName(node);
     std::string kernel_name = node->fullname_with_scope();
     auto output_size = AnfAlgo::GetOutputTensorNum(node);
-    if (debugger_->partial_memory()) {
+    if (debugger_i->partial_memory()) {
       if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
         continue;
       }
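The `debugger_` to `debugger_i` rename in `LoadOutput` is presumably housekeeping for the new `KernelRuntime::debugger_` member introduced below: a trailing underscore conventionally marks a data member, so keeping that spelling on a local would now read as a member access. A contrived sketch of the hazard (not MindSpore code):

```cpp
class Runtime {
  int debugger_ = 1;  // real data member

 public:
  int Confusing() {
    int debugger_ = 2;  // legal, but shadows the member and misleads readers
    return debugger_;   // returns the local 2, not the member
  }
  int Clear() {
    int debugger_i = 2;             // distinct local name, as in the patch
    return debugger_ + debugger_i;  // member access stays unambiguous
  }
};

int main() { return Runtime{}.Confusing() == 2 ? 0 : 1; }
```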
@@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger)
 bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   MS_LOG(INFO) << "Start load step";
   uint32_t cur_iter = 0;
   MS_LOG(INFO) << "Cur iter is " << cur_iter;
@@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
+      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost:"
                       << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() {
 void GPUKernelRuntime::ReleaseDeviceRes() {
   // For dataset mode.
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (GpuBufferMgr::GetInstance().IsInit()) {
     if (!GpuBufferMgr::GetInstance().IsClosed()) {
       if (!GpuBufferMgr::GetInstance().CloseNotify()) {
@@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   AllocCommunicationOpDynamicRes(graph);
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
   if (!mock) {
     UpdateStepNum(debugger, dump_enabled);
@@ -124,6 +124,10 @@ class KernelRuntime {
 #ifdef ENABLE_DUMP_E2E
   DumpConfPtr dump_conf_ptr_;
 #endif
+#ifdef ENABLE_DEBUGGER
+  Debugger *debugger_ = nullptr;
+#endif
+
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};
 };
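`KernelRuntime` now caches a raw `Debugger *` so `ReleaseDeviceRes` can reach the debugger during finalization, after the per-step `debugger` argument has gone out of scope. The member is assigned in `LoadData` on Ascend and `LaunchKernelDynamic` on GPU, and both release paths guard on it; the null default matters because the pointer is never assigned unless one of those paths runs. A simplified stand-in (not the real runtime API):

```cpp
// Simplified stand-ins, not the real runtime API.
struct Debugger {
  bool enabled = false;
  void SetTrainingDone(bool) {}
  void SendMetadata() {}
};

class Runtime {
  Debugger *debugger_ = nullptr;  // stays null unless a debug step ever runs

 public:
  void LoadData(Debugger *debugger) { debugger_ = debugger; }  // per-step cache
  void ReleaseDeviceRes() {
    // Finalization can run even when LoadData never did (e.g. the debugger is
    // compiled in but disabled), so the null check is load-bearing.
    if (debugger_ && debugger_->enabled) {
      debugger_->SetTrainingDone(true);
      debugger_->SendMetadata();
    }
  }
};

int main() {
  Runtime rt;
  rt.ReleaseDeviceRes();  // safe: guard short-circuits on the null member
  return 0;
}
```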