From dffa61b228719321fb6805ba05b46a58fc931b8b Mon Sep 17 00:00:00 2001
From: lichen_101010
Date: Thu, 3 Sep 2020 16:40:20 -0400
Subject: [PATCH] send info when training is done
---
.../ccsrc/debug/debugger/debug_grpc.proto | 2 ++
mindspore/ccsrc/debug/debugger/debugger.cc | 24 +++++++++++++------
mindspore/ccsrc/debug/debugger/debugger.h | 5 ++++
.../device/ascend/ascend_kernel_runtime.cc | 13 +++++++---
.../runtime/device/gpu/gpu_kernel_runtime.cc | 9 ++++++-
.../ccsrc/runtime/device/kernel_runtime.h | 4 ++++
6 files changed, 46 insertions(+), 11 deletions(-)
diff --git a/mindspore/ccsrc/debug/debugger/debug_grpc.proto b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
index 6c627d730b..5c1ca5ceed 100644
--- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto
+++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
@@ -35,6 +35,8 @@ message Metadata {
string backend = 3;
// the full name of current node
string cur_node = 4;
+ // true if training is done.
+ bool training_done = 5;
}
message Chunk {
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index 58f451177c..752b796a5a 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -57,6 +57,7 @@ Debugger::Debugger()
run_level_(""),
node_name_(""),
cur_name_(""),
+ training_done_(false),
is_dataset_graph_(false),
partial_memory_(false),
last_overflow_bin_(0),
@@ -336,6 +337,17 @@ GraphProto Debugger::GetGraphProto() const {
}
void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
+ SendMetadata();
+ // send graph to MindInsight server
+ EventReply reply = grpc_client_->SendGraph(graph_proto);
+ if (reply.status() != reply.OK) {
+ MS_LOG(ERROR) << "Error: SendGraph failed";
+ }
+ // enter command loop, wait and process commands
+ CommandLoop();
+}
+
+void Debugger::SendMetadata() {
// prepare metadata
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
Metadata metadata;
@@ -343,17 +355,12 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
metadata.set_cur_step(num_step_);
metadata.set_backend(device_target_);
metadata.set_cur_node(cur_name_);
+ metadata.set_training_done(training_done_);
+ MS_LOG(INFO) << "Is training done?" << training_done_;
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
if (reply_metadata.status() != reply_metadata.OK) {
MS_LOG(ERROR) << "Error: SendMetadata failed";
}
- // send graph to mindinght server
- EventReply reply = grpc_client_->SendGraph(graph_proto);
- if (reply.status() != reply.OK) {
- MS_LOG(ERROR) << "Error: SendGraph failed";
- }
- // enter command loop, wait and process commands
- CommandLoop();
}
void Debugger::CommandLoop() {
@@ -365,6 +372,7 @@ void Debugger::CommandLoop() {
metadata.set_cur_step(num_step_);
metadata.set_backend(device_target_);
metadata.set_cur_node(cur_name_);
+ metadata.set_training_done(training_done_);
// loop exit flag
bool run = false;
@@ -787,4 +795,6 @@ std::vector<std::string> Debugger::CheckOpOverflow() {
return op_names;
}
+void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }
+
} // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h
index 53e55f6576..7a5cd1b8dc 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -99,6 +99,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// check if any feature that uses the debugger backend is enabled
bool DebuggerBackendEnabled();
+ void SetTrainingDone(bool training_done);
+
+ void SendMetadata();
+
private:
// private constructor for singleton
Debugger();
@@ -164,6 +168,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::string run_level_;
std::string node_name_;
std::string cur_name_;
+ bool training_done_;
bool is_dataset_graph_;
bool partial_memory_;
std::mutex access_lock_;
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index 7b0f2621cf..1575a20015 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -170,6 +170,12 @@ bool AscendKernelRuntime::NeedDestroyHccl() {
void AscendKernelRuntime::ReleaseDeviceRes() {
MS_LOG(INFO) << "Ascend finalize start";
+#ifdef ENABLE_DEBUGGER
+ if (debugger_ && debugger_->debugger_enabled()) {
+ debugger_->SetTrainingDone(true);
+ debugger_->SendMetadata();
+ }
+#endif
if (!initialized_) {
return;
}
@@ -354,15 +360,15 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
const auto &apply_kernels = graph->execution_order();
// for kernels, execution order starts from 1
int exec_order = 1;
- auto debugger_ = mindspore::Debugger::GetInstance();
- DebugServices *debug_services = debugger_->debug_services();
+ auto debugger_i = mindspore::Debugger::GetInstance();
+ DebugServices *debug_services = debugger_i->debug_services();
auto watchpoint_table = debug_services->GetWatchpointTable();
for (const auto &node : apply_kernels) {
MS_EXCEPTION_IF_NULL(node);
auto node_name = AnfAlgo::GetCNodeName(node);
std::string kernel_name = node->fullname_with_scope();
auto output_size = AnfAlgo::GetOutputTensorNum(node);
- if (debugger_->partial_memory()) {
+ if (debugger_i->partial_memory()) {
if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
continue;
}
@@ -431,6 +437,7 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger)
bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
#ifdef ENABLE_DEBUGGER
+ debugger_ = debugger;
MS_LOG(INFO) << "Start load step";
uint32_t cur_iter = 0;
MS_LOG(INFO) << "Cur iter is " << cur_iter;
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index 0a411159a0..9466f471ad 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -256,7 +256,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
- auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
+ auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -368,6 +368,12 @@ bool GPUKernelRuntime::InitDevice() {
void GPUKernelRuntime::ReleaseDeviceRes() {
// For dataset mode.
+#ifdef ENABLE_DEBUGGER
+ if (debugger_ && debugger_->debugger_enabled()) {
+ debugger_->SetTrainingDone(true);
+ debugger_->SendMetadata();
+ }
+#endif
if (GpuBufferMgr::GetInstance().IsInit()) {
if (!GpuBufferMgr::GetInstance().IsClosed()) {
if (!GpuBufferMgr::GetInstance().CloseNotify()) {
@@ -684,6 +690,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
AllocCommunicationOpDynamicRes(graph);
#ifdef ENABLE_DEBUGGER
+ debugger_ = debugger;
bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
if (!mock) {
UpdateStepNum(debugger, dump_enabled);
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h
index 5c87e0998b..d12bd08b09 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -124,6 +124,10 @@ class KernelRuntime {
#ifdef ENABLE_DUMP_E2E
DumpConfPtr dump_conf_ptr_;
#endif
+
+#ifdef ENABLE_DEBUGGER
+ Debugger *debugger_;
+#endif
void *stream_ = nullptr;
std::shared_ptr<MemoryManager> mem_manager_{nullptr};
};