diff --git a/mindspore/ccsrc/debug/debugger/debug_grpc.proto b/mindspore/ccsrc/debug/debugger/debug_grpc.proto index 6c627d730b..5c1ca5ceed 100644 --- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto +++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto @@ -35,6 +35,8 @@ message Metadata { string backend = 3; // the full name of current node string cur_node = 4; + // check if training is done. + bool training_done = 5; } message Chunk { diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 58f451177c..752b796a5a 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -57,6 +57,7 @@ Debugger::Debugger() run_level_(""), node_name_(""), cur_name_(""), + training_done_(false), is_dataset_graph_(false), partial_memory_(false), last_overflow_bin_(0), @@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const { } void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) { + SendMetadata(); + // send graph to mindinsight server + EventReply reply = grpc_client_->SendGraph(graph_proto); + if (reply.status() != reply.OK) { + MS_LOG(ERROR) << "Error: SendGraph failed"; + } + // enter command loop, wait and process commands + CommandLoop(); +} + +void Debugger::SendMetadata() { // prepare metadata std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id()); Metadata metadata; @@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) { metadata.set_cur_step(num_step_); metadata.set_backend(device_target_); metadata.set_cur_node(cur_name_); + metadata.set_training_done(training_done_); + MS_LOG(INFO) << "Is training done?" 
<< training_done_; EventReply reply_metadata = grpc_client_->SendMetadata(metadata); if (reply_metadata.status() != reply_metadata.OK) { MS_LOG(ERROR) << "Error: SendMetadata failed"; } - // send graph to mindinght server - EventReply reply = grpc_client_->SendGraph(graph_proto); - if (reply.status() != reply.OK) { - MS_LOG(ERROR) << "Error: SendGraph failed"; - } - // enter command loop, wait and process commands - CommandLoop(); } void Debugger::CommandLoop() { @@ -365,6 +372,7 @@ void Debugger::CommandLoop() { metadata.set_cur_step(num_step_); metadata.set_backend(device_target_); metadata.set_cur_node(cur_name_); + metadata.set_training_done(training_done_); // loop exit flag bool run = false; @@ -787,4 +795,6 @@ std::vector Debugger::CheckOpOverflow() { return op_names; } +void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; } + } // namespace mindspore diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 53e55f6576..7a5cd1b8dc 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this { // check if any feature that uses the debugger backend is enabled bool DebuggerBackendEnabled(); + void SetTrainingDone(bool training_done); + + void SendMetadata(); + private: // private constructor for singleton Debugger(); @@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this { std::string run_level_; std::string node_name_; std::string cur_name_; + bool training_done_; bool is_dataset_graph_; bool partial_memory_; std::mutex access_lock_; diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 7b0f2621cf..1575a20015 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -170,6 
+170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() { void AscendKernelRuntime::ReleaseDeviceRes() { MS_LOG(INFO) << "Ascend finalize start"; +#ifdef ENABLE_DEBUGGER + if (debugger_ && debugger_->debugger_enabled()) { + debugger_->SetTrainingDone(true); + debugger_->SendMetadata(); + } +#endif if (!initialized_) { return; } @@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) { const auto &apply_kernels = graph->execution_order(); // for kernels, execution order starts from 1 int exec_order = 1; - auto debugger_ = mindspore::Debugger::GetInstance(); - DebugServices *debug_services = debugger_->debug_services(); + auto debugger_i = mindspore::Debugger::GetInstance(); + DebugServices *debug_services = debugger_i->debug_services(); auto watchpoint_table = debug_services->GetWatchpointTable(); for (const auto &node : apply_kernels) { MS_EXCEPTION_IF_NULL(node); auto node_name = AnfAlgo::GetCNodeName(node); std::string kernel_name = node->fullname_with_scope(); auto output_size = AnfAlgo::GetOutputTensorNum(node); - if (debugger_->partial_memory()) { + if (debugger_i->partial_memory()) { if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) { continue; } @@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) { MS_EXCEPTION_IF_NULL(graph); #ifdef ENABLE_DEBUGGER + debugger_ = debugger; MS_LOG(INFO) << "Start load step"; uint32_t cur_iter = 0; MS_LOG(INFO) << "Cur iter is " << cur_iter; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 0a411159a0..9466f471ad 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, auto shape = 
AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX); (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), [](size_t inner_item) { return SizeToInt(inner_item); }); - auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false); + auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!"; @@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() { void GPUKernelRuntime::ReleaseDeviceRes() { // For dataset mode. +#ifdef ENABLE_DEBUGGER + if (debugger_ && debugger_->debugger_enabled()) { + debugger_->SetTrainingDone(true); + debugger_->SendMetadata(); + } +#endif if (GpuBufferMgr::GetInstance().IsInit()) { if (!GpuBufferMgr::GetInstance().IsClosed()) { if (!GpuBufferMgr::GetInstance().CloseNotify()) { @@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De AllocCommunicationOpDynamicRes(graph); #ifdef ENABLE_DEBUGGER + debugger_ = debugger; bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); if (!mock) { UpdateStepNum(debugger, dump_enabled); diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 5c87e0998b..d12bd08b09 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -124,6 +124,10 @@ class KernelRuntime { #ifdef ENABLE_DUMP_E2E DumpConfPtr dump_conf_ptr_; #endif + +#ifdef ENABLE_DEBUGGER + Debugger *debugger_{nullptr}; +#endif void *stream_ = nullptr; std::shared_ptr mem_manager_{nullptr}; };