@@ -35,6 +35,8 @@ message Metadata {
   string backend = 3;
   // the full name of current node
   string cur_node = 4;
+  // check if training is done.
+  bool training_done = 5;
 }

 message Chunk {
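The new training_done field lets the listening MindInsight server tell an ordinary mid-training suspension apart from final teardown: every per-step Metadata message carries training_done = false, and the last one, sent while the runtime releases device resources, carries true. A minimal sketch of a consumer, assuming the generated C++ protobuf bindings for this message; OnMetadata and FinishSession are illustrative names, not part of this patch:

    // Sketch only: the training_done() getter comes from protobuf codegen.
    void OnMetadata(const Metadata &metadata) {
      if (metadata.training_done()) {
        FinishSession();  // training finished; stop waiting for further steps
        return;
      }
      // otherwise this is an ordinary per-step suspension
    }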
@@ -57,6 +57,7 @@ Debugger::Debugger()
       run_level_(""),
       node_name_(""),
       cur_name_(""),
+      training_done_(false),
       is_dataset_graph_(false),
       partial_memory_(false),
       last_overflow_bin_(0),
@@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const {
 }

 void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
+  SendMetadata();
+  // send graph to mindinsight server
+  EventReply reply = grpc_client_->SendGraph(graph_proto);
+  if (reply.status() != reply.OK) {
+    MS_LOG(ERROR) << "Error: SendGraph failed";
+  }
+  // enter command loop, wait and process commands
+  CommandLoop();
+}
+
+void Debugger::SendMetadata() {
   // prepare metadata
   std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
   Metadata metadata;
@@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);
+  MS_LOG(INFO) << "Is training done? " << training_done_;
   EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
   if (reply_metadata.status() != reply_metadata.OK) {
     MS_LOG(ERROR) << "Error: SendMetadata failed";
   }
-  // send graph to mindinght server
-  EventReply reply = grpc_client_->SendGraph(graph_proto);
-  if (reply.status() != reply.OK) {
-    MS_LOG(ERROR) << "Error: SendGraph failed";
-  }
-  // enter command loop, wait and process commands
-  CommandLoop();
 }

 void Debugger::CommandLoop() {
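Extracting SendMetadata() out of SendGraphAndSuspend() is what allows the runtime teardown hunks below to push one final metadata update without re-sending a graph or re-entering the command loop. A sketch of that standalone path, using only names visible elsewhere in this diff (the singleton accessor appears in the LoadOutput hunk):

    // Sketch of the standalone notification this refactor enables.
    auto debugger = mindspore::Debugger::GetInstance();
    if (debugger && debugger->debugger_enabled()) {
      debugger->SetTrainingDone(true);  // records the flag in training_done_
      debugger->SendMetadata();         // final Metadata now reports training_done = true
    }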
@@ -365,6 +372,7 @@ void Debugger::CommandLoop() {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);

   // loop exit flag
   bool run = false;
@@ -787,4 +795,6 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
   return op_names;
 }

+void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }
+
 }  // namespace mindspore
@@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // check if any feature that uses the debugger backend is enabled
   bool DebuggerBackendEnabled();

+  void SetTrainingDone(bool training_done);
+
+  void SendMetadata();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   std::string run_level_;
   std::string node_name_;
   std::string cur_name_;
+  bool training_done_;
   bool is_dataset_graph_;
   bool partial_memory_;
   std::mutex access_lock_;
@@ -170,6 +170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() {
 void AscendKernelRuntime::ReleaseDeviceRes() {
   MS_LOG(INFO) << "Ascend finalize start";
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (!initialized_) {
     return;
   }
@@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   const auto &apply_kernels = graph->execution_order();
   // for kernels, execution order starts from 1
   int exec_order = 1;
-  auto debugger_ = mindspore::Debugger::GetInstance();
-  DebugServices *debug_services = debugger_->debug_services();
+  auto debugger_i = mindspore::Debugger::GetInstance();
+  DebugServices *debug_services = debugger_i->debug_services();
   auto watchpoint_table = debug_services->GetWatchpointTable();
   for (const auto &node : apply_kernels) {
     MS_EXCEPTION_IF_NULL(node);
     auto node_name = AnfAlgo::GetCNodeName(node);
     std::string kernel_name = node->fullname_with_scope();
     auto output_size = AnfAlgo::GetOutputTensorNum(node);
-    if (debugger_->partial_memory()) {
+    if (debugger_i->partial_memory()) {
       if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
         continue;
       }
@@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger)
 bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   MS_LOG(INFO) << "Start load step";
   uint32_t cur_iter = 0;
   MS_LOG(INFO) << "Cur iter is " << cur_iter;
@@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
+      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost:"
                       << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() {
 void GPUKernelRuntime::ReleaseDeviceRes() {
   // For dataset mode.
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (GpuBufferMgr::GetInstance().IsInit()) {
     if (!GpuBufferMgr::GetInstance().IsClosed()) {
       if (!GpuBufferMgr::GetInstance().CloseNotify()) {
@@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   AllocCommunicationOpDynamicRes(graph);
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
   if (!mock) {
     UpdateStepNum(debugger, dump_enabled);
@@ -124,6 +124,10 @@ class KernelRuntime {
 #ifdef ENABLE_DUMP_E2E
   DumpConfPtr dump_conf_ptr_;
 #endif
+#ifdef ENABLE_DEBUGGER
+  Debugger *debugger_ = nullptr;
+#endif
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};
 };