diff --git a/mindspore/ccsrc/debug/debugger/debug_grpc.proto b/mindspore/ccsrc/debug/debugger/debug_grpc.proto index 6c627d730b..5c1ca5ceed 100644 --- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto +++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto @@ -35,6 +35,8 @@ message Metadata { string backend = 3; // the full name of current node string cur_node = 4; + // check if training is done. + bool training_done = 5; } message Chunk { diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 58f451177c..752b796a5a 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -57,6 +57,7 @@ Debugger::Debugger() run_level_(""), node_name_(""), cur_name_(""), + training_done_(false), is_dataset_graph_(false), partial_memory_(false), last_overflow_bin_(0), @@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const { } void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) { + SendMetadata(); + // send graph to mindinsight server + EventReply reply = grpc_client_->SendGraph(graph_proto); + if (reply.status() != reply.OK) { + MS_LOG(ERROR) << "Error: SendGraph failed"; + } + // enter command loop, wait and process commands + CommandLoop(); +} + +void Debugger::SendMetadata() { // prepare metadata std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id()); Metadata metadata; @@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) { metadata.set_cur_step(num_step_); metadata.set_backend(device_target_); metadata.set_cur_node(cur_name_); + metadata.set_training_done(training_done_); + MS_LOG(INFO) << "Is training done?" 
<< training_done_; EventReply reply_metadata = grpc_client_->SendMetadata(metadata); if (reply_metadata.status() != reply_metadata.OK) { MS_LOG(ERROR) << "Error: SendMetadata failed"; } - // send graph to mindinght server - EventReply reply = grpc_client_->SendGraph(graph_proto); - if (reply.status() != reply.OK) { - MS_LOG(ERROR) << "Error: SendGraph failed"; - } - // enter command loop, wait and process commands - CommandLoop(); } void Debugger::CommandLoop() { @@ -365,6 +372,7 @@ void Debugger::CommandLoop() { metadata.set_cur_step(num_step_); metadata.set_backend(device_target_); metadata.set_cur_node(cur_name_); + metadata.set_training_done(training_done_); // loop exit flag bool run = false; @@ -787,4 +795,6 @@ std::vector Debugger::CheckOpOverflow() { return op_names; } +void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; } + } // namespace mindspore diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 53e55f6576..7a5cd1b8dc 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this { // check if any feature that uses the debugger backend is enabled bool DebuggerBackendEnabled(); + void SetTrainingDone(bool training_done); + + void SendMetadata(); + private: // private constructor for singleton Debugger(); @@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this { std::string run_level_; std::string node_name_; std::string cur_name_; + bool training_done_; bool is_dataset_graph_; bool partial_memory_; std::mutex access_lock_; diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 7b0f2621cf..1575a20015 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -170,6 
+170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() { void AscendKernelRuntime::ReleaseDeviceRes() { MS_LOG(INFO) << "Ascend finalize start"; +#ifdef ENABLE_DEBUGGER + if (debugger_ && debugger_->debugger_enabled()) { + debugger_->SetTrainingDone(true); + debugger_->SendMetadata(); + } +#endif if (!initialized_) { return; } @@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) { const auto &apply_kernels = graph->execution_order(); // for kernels, execution order starts from 1 int exec_order = 1; - auto debugger_ = mindspore::Debugger::GetInstance(); - DebugServices *debug_services = debugger_->debug_services(); + auto debugger_i = mindspore::Debugger::GetInstance(); + DebugServices *debug_services = debugger_i->debug_services(); auto watchpoint_table = debug_services->GetWatchpointTable(); for (const auto &node : apply_kernels) { MS_EXCEPTION_IF_NULL(node); auto node_name = AnfAlgo::GetCNodeName(node); std::string kernel_name = node->fullname_with_scope(); auto output_size = AnfAlgo::GetOutputTensorNum(node); - if (debugger_->partial_memory()) { + if (debugger_i->partial_memory()) { if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) { continue; } @@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) { MS_EXCEPTION_IF_NULL(graph); #ifdef ENABLE_DEBUGGER + debugger_ = debugger; MS_LOG(INFO) << "Start load step"; uint32_t cur_iter = 0; MS_LOG(INFO) << "Cur iter is " << cur_iter; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 0a411159a0..9466f471ad 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, auto shape = 
AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX); (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), [](size_t inner_item) { return SizeToInt(inner_item); }); - auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false); + auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true); if (!ret) { MS_LOG(ERROR) << "LoadMemToHost:" << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!"; @@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() { void GPUKernelRuntime::ReleaseDeviceRes() { // For dataset mode. +#ifdef ENABLE_DEBUGGER + if (debugger_ && debugger_->debugger_enabled()) { + debugger_->SetTrainingDone(true); + debugger_->SendMetadata(); + } +#endif if (GpuBufferMgr::GetInstance().IsInit()) { if (!GpuBufferMgr::GetInstance().IsClosed()) { if (!GpuBufferMgr::GetInstance().CloseNotify()) { @@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De AllocCommunicationOpDynamicRes(graph); #ifdef ENABLE_DEBUGGER + debugger_ = debugger; bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); if (!mock) { UpdateStepNum(debugger, dump_enabled); diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 5c87e0998b..d12bd08b09 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -124,6 +124,10 @@ class KernelRuntime { #ifdef ENABLE_DUMP_E2E DumpConfPtr dump_conf_ptr_; #endif + +#ifdef ENABLE_DEBUGGER + Debugger *debugger_{nullptr}; +#endif void *stream_ = nullptr; std::shared_ptr mem_manager_{nullptr}; };