@@ -35,6 +35,8 @@ message Metadata {
   string backend = 3;
   // the full name of current node
   string cur_node = 4;
+  // true when training is done.
+  bool training_done = 5;
 }

 message Chunk {
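The new `training_done` field is how the backend tells the MindInsight front end that training has finished. Below is a minimal sketch of a consumer reading it through the protoc-generated C++ bindings; the header path, the `debugger` namespace, and the `HandleMetadata` function are assumptions for illustration, not part of the patch:

```cpp
#include <iostream>

#include "debug_grpc.pb.h"  // assumed name of the protoc-generated header

// Hypothetical handler on the receiving side.
void HandleMetadata(const debugger::Metadata &metadata) {
  // proto3 bools default to false, so a peer built before this change
  // simply reports "not done" instead of breaking wire compatibility.
  if (metadata.training_done()) {
    std::cout << "training finished after step " << metadata.cur_step() << "\n";
  }
}
```

Because the field takes a fresh tag number (5), old and new peers interoperate: an older MindInsight build just ignores the unknown field.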
@@ -57,6 +57,7 @@ Debugger::Debugger()
       run_level_(""),
       node_name_(""),
       cur_name_(""),
+      training_done_(false),
       is_dataset_graph_(false),
       partial_memory_(false),
       last_overflow_bin_(0),
@@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const {
 }

 void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
+  SendMetadata();
+  // send graph to the MindInsight server
+  EventReply reply = grpc_client_->SendGraph(graph_proto);
+  if (reply.status() != reply.OK) {
+    MS_LOG(ERROR) << "Error: SendGraph failed";
+  }
+  // enter command loop, wait and process commands
+  CommandLoop();
+}
+
+void Debugger::SendMetadata() {
   // prepare metadata
   std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
   Metadata metadata;
@@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);
+  MS_LOG(INFO) << "Is training done? " << training_done_;
   EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
   if (reply_metadata.status() != reply_metadata.OK) {
     MS_LOG(ERROR) << "Error: SendMetadata failed";
   }
-  // send graph to mindinght server
-  EventReply reply = grpc_client_->SendGraph(graph_proto);
-  if (reply.status() != reply.OK) {
-    MS_LOG(ERROR) << "Error: SendGraph failed";
-  }
-  // enter command loop, wait and process commands
-  CommandLoop();
 }

 void Debugger::CommandLoop() {
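Splitting `SendMetadata` out of `SendGraphAndSuspend` is what lets the kernel runtimes announce end-of-training without shipping a graph or blocking in `CommandLoop`. A standalone mock of the resulting control flow, with simplified stand-in types (none of this is the real API):

```cpp
#include <iostream>

// Standalone mock of the new control flow, not MindSpore code: the point of
// the split is that the metadata push becomes reusable on its own.
struct MockDebugger {
  bool training_done_ = false;

  void SendMetadata() {  // now callable on its own
    std::cout << "metadata: training_done=" << training_done_ << "\n";
  }
  void SendGraphAndSuspend() {
    SendMetadata();  // metadata still precedes the graph, as before
    std::cout << "graph sent, entering command loop\n";
  }
  void ReleaseDeviceRes() {  // runtime finalization path (new)
    training_done_ = true;
    SendMetadata();  // no graph, no command loop: finalization is not blocked
  }
};

int main() {
  MockDebugger d;
  d.SendGraphAndSuspend();  // per-step suspension path
  d.ReleaseDeviceRes();     // end-of-training notification path
  return 0;
}
```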
@@ -365,6 +372,7 @@ void Debugger::CommandLoop() {
   metadata.set_cur_step(num_step_);
   metadata.set_backend(device_target_);
   metadata.set_cur_node(cur_name_);
+  metadata.set_training_done(training_done_);

   // loop exit flag
   bool run = false;
@@ -787,4 +795,6 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
   return op_names;
 }
+
+void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }

 }  // namespace mindspore
@@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // check if any feature that uses the debugger backend is enabled
   bool DebuggerBackendEnabled();

+  void SetTrainingDone(bool training_done);
+
+  void SendMetadata();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   std::string run_level_;
   std::string node_name_;
   std::string cur_name_;
+  bool training_done_;
   bool is_dataset_graph_;
   bool partial_memory_;
   std::mutex access_lock_;
@@ -170,6 +170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() {
 void AscendKernelRuntime::ReleaseDeviceRes() {
   MS_LOG(INFO) << "Ascend finalize start";
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (!initialized_) {
     return;
   }
@@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   const auto &apply_kernels = graph->execution_order();
   // for kernels, execution order starts from 1
   int exec_order = 1;
-  auto debugger_ = mindspore::Debugger::GetInstance();
-  DebugServices *debug_services = debugger_->debug_services();
+  auto debugger_i = mindspore::Debugger::GetInstance();
+  DebugServices *debug_services = debugger_i->debug_services();
   auto watchpoint_table = debug_services->GetWatchpointTable();
   for (const auto &node : apply_kernels) {
     MS_EXCEPTION_IF_NULL(node);
     auto node_name = AnfAlgo::GetCNodeName(node);
     std::string kernel_name = node->fullname_with_scope();
     auto output_size = AnfAlgo::GetOutputTensorNum(node);
-    if (debugger_->partial_memory()) {
+    if (debugger_i->partial_memory()) {
       if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
         continue;
       }
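The `debugger_` to `debugger_i` rename in `LoadOutput` is presumably housekeeping for the new `KernelRuntime::debugger_` member introduced below: a trailing underscore conventionally marks a data member, so keeping that spelling on a local would now read as a member access. A contrived sketch of the hazard (not MindSpore code):

```cpp
class Runtime {
  int debugger_ = 1;  // real data member

 public:
  int Confusing() {
    int debugger_ = 2;  // legal, but shadows the member and misleads readers
    return debugger_;   // returns the local 2, not the member
  }
  int Clear() {
    int debugger_i = 2;             // distinct local name, as in the patch
    return debugger_ + debugger_i;  // member access stays unambiguous
  }
};

int main() { return Runtime{}.Confusing() == 2 ? 0 : 1; }
```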
@@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger)
 bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   MS_LOG(INFO) << "Start load step";
   uint32_t cur_iter = 0;
   MS_LOG(INFO) << "Cur iter is " << cur_iter;
@@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
+      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost:"
                       << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() {
 void GPUKernelRuntime::ReleaseDeviceRes() {
   // For dataset mode.
+#ifdef ENABLE_DEBUGGER
+  if (debugger_ && debugger_->debugger_enabled()) {
+    debugger_->SetTrainingDone(true);
+    debugger_->SendMetadata();
+  }
+#endif
   if (GpuBufferMgr::GetInstance().IsInit()) {
     if (!GpuBufferMgr::GetInstance().IsClosed()) {
       if (!GpuBufferMgr::GetInstance().CloseNotify()) {
@@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   AllocCommunicationOpDynamicRes(graph);
 #ifdef ENABLE_DEBUGGER
+  debugger_ = debugger;
   bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
   if (!mock) {
     UpdateStepNum(debugger, dump_enabled);
@@ -124,6 +124,10 @@ class KernelRuntime {
 #ifdef ENABLE_DUMP_E2E
   DumpConfPtr dump_conf_ptr_;
 #endif
+#ifdef ENABLE_DEBUGGER
+  Debugger *debugger_ = nullptr;
+#endif
+
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};
 };
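`KernelRuntime` now caches a raw `Debugger *` so `ReleaseDeviceRes` can reach the debugger during finalization, after the per-step `debugger` argument has gone out of scope. The member is assigned in `LoadData` on Ascend and `LaunchKernelDynamic` on GPU, and both release paths guard on it; the null default matters because the pointer is never assigned unless one of those paths runs. A simplified stand-in (not the real runtime API):

```cpp
// Simplified stand-ins, not the real runtime API.
struct Debugger {
  bool enabled = false;
  void SetTrainingDone(bool) {}
  void SendMetadata() {}
};

class Runtime {
  Debugger *debugger_ = nullptr;  // stays null unless a debug step ever runs

 public:
  void LoadData(Debugger *debugger) { debugger_ = debugger; }  // per-step cache
  void ReleaseDeviceRes() {
    // Finalization can run even when LoadData never did (e.g. the debugger is
    // compiled in but disabled), so the null check is load-bearing.
    if (debugger_ && debugger_->enabled) {
      debugger_->SetTrainingDone(true);
      debugger_->SendMetadata();
    }
  }
};

int main() {
  Runtime rt;
  rt.ReleaseDeviceRes();  // safe: guard short-circuits on the null member
  return 0;
}
```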