From 7499c72d5402674543fc8240e355a84b95c34da4 Mon Sep 17 00:00:00 2001
From: lichen_101010
Date: Mon, 20 Jul 2020 19:22:43 -0400
Subject: [PATCH] mindspore grpc implementation
fix bugs for grpc implementation
addressed peer review comments
delete device_target code from Adel
add CheckSingleWatchpoint function for node level debugger
set the device target when sending metadata
add current node name
fix bugs for current node name
fix run_level_ bug
fix bugs for CheckSingleWatchpoint
fix multi-outputs node issue
fix num_step_ bug
fix continue_to previous node issue
fix run_level issue
fix merge conflict
smart kernel read, watch hit stop mid-step, fix step number, read input tensors
cleanup the code and isolate UpdateStepNum function
do cpplint, Cppcheck and clang-format check
recover CMakeLists.txt
mindspore grpc implementation
fix bugs for grpc implementation
addressed peer review comments
delete device_target code from Adel
add CheckSingleWatchpoint function for node level debugger
set the device target when sending metadata
add current node name
fix bugs for current node name
fix run_level_ bug
fix bugs for CheckSingleWatchpoint
fix multi-outputs node issue
fix num_step_ bug
fix continue_to previous node issue
fix run_level issue
fix merge conflict
smart kernel read, watch hit stop mid-step, fix step number, read input tensors
cleanup the code and isolate UpdateStepNum function
do cpplint, Cppcheck and clang-format check
recover CMakeLists.txt
only update step_num in one place
fix clang-format error
fix CI errors part2
update graphengine version
addressed comments
---
mindspore/ccsrc/debug/debug_services.cc | 55 ++++++++
mindspore/ccsrc/debug/debug_services.h | 3 +
.../ccsrc/debug/debugger/debug_grpc.proto | 16 ++-
mindspore/ccsrc/debug/debugger/debugger.cc | 121 +++++++++++++++++-
mindspore/ccsrc/debug/debugger/debugger.h | 18 +++
mindspore/ccsrc/debug/tensor_load.h | 16 +++
.../runtime/device/gpu/gpu_device_address.cc | 2 -
.../runtime/device/gpu/gpu_kernel_runtime.cc | 61 ++++++++-
8 files changed, 283 insertions(+), 9 deletions(-)
diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc
index cc6c5c53ad..1e99168c1e 100644
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@@ -171,6 +171,61 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector
}
}
+void DebugServices::CheckSingleWatchpoint(std::shared_ptr watchtensor, std::string *name, std::string *slot,
+ char **data_ptr, unsigned int *data_size, int *condition,
+ unsigned int *wacthpoint_id) {
+ std::lock_guard lg(lock_);
+
+ std::string current_watchtensor_name;
+ current_watchtensor_name = watchtensor->GetName();
+ mindspore::tensor::TensorPtr tensor_ptr = watchtensor->GetTensor();
+ int tensor_data_type = tensor_ptr->data_type_c();
+ watchpoint_t watchpoint_to_check;
+
+ for (auto w_table_item : watchpoint_table) {
+ auto check_node_list = std::get<1>(w_table_item).check_node_list;
+ for (auto check_node : check_node_list) {
+ std::string w_name = std::get<0>(check_node);
+ bool w_type = std::get<1>(check_node);
+ // get current the full info including condition, id..., for current watchtensor
+ std::string current_node_name = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
+ if ((w_type == true && (current_watchtensor_name.find(w_name) != string::npos || w_name == "*")) ||
+ (w_type == false && current_node_name == w_name)) {
+ watchpoint_to_check = w_table_item.second;
+ // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
+ if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
+ return;
+ }
+ break;
+ }
+ }
+ }
+
+ float *start_addr = reinterpret_cast(tensor_ptr->data_c());
+ unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
+
+ for (unsigned int index = 0; index < num_elements; index++) {
+ float x = start_addr[index];
+ if (((watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) && isinf(x)) ||
+ (watchpoint_to_check.conditions.nan.enabled && isnan(x))) {
+ std::string name_no_slot = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
+ *name = name_no_slot;
+ *slot = std::to_string(watchtensor->GetSlot());
+ *data_ptr = reinterpret_cast(tensor_ptr->data_c());
+ *data_size = tensor_ptr->data().nbytes();
+ int condition_item = -1;
+ if (watchpoint_to_check.conditions.nan.enabled) {
+ condition_item = 0;
+ } else if (watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) {
+ condition_item = 1;
+ }
+ *condition = condition_item;
+
+ *wacthpoint_id = watchpoint_to_check.id;
+ }
+ }
+}
+
void DebugServices::ReadNodesTensors(std::vector name, std::vector *ret_name,
std::vector *data_ptr, std::vector *data_size,
std::vector *dtype, std::vector> *shape) {
diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h
index f447ed17d8..b664a9b9e9 100644
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -78,6 +78,9 @@ class DebugServices {
std::vector *data_size, std::vector *condition,
std::vector *wacthpoint_id);
+ void CheckSingleWatchpoint(std::shared_ptr watchnode, std::string *name, std::string *slot,
+ char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id);
+
void ReadNodesTensors(std::vector name, std::vector *ret_name,
std::vector *data_ptr, std::vector *data_size,
std::vector *dtype, std::vector> *shape);
diff --git a/mindspore/ccsrc/debug/debugger/debug_grpc.proto b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
index f742987a4e..27c93787b8 100644
--- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto
+++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
@@ -31,6 +31,10 @@ service EventListener {
message Metadata {
string device_name = 1;
int32 cur_step = 2;
+ // define the backend is 'GPU' or "Ascend"
+ string backend = 3;
+ // the full name of current node
+ string cur_node = 4;
}
message EventReply {
@@ -44,12 +48,22 @@ message EventReply {
oneof cmd {
bool exit = 2;
- int32 run_cmd = 3;
+ RunCMD run_cmd = 3;
SetCMD set_cmd = 4;
ViewCMD view_cmd = 5;
}
}
+message RunCMD {
+ // step level or node level. "step" or "node"
+ string run_level = 1;
+ oneof cmd {
+ int32 run_steps = 2;
+ // the next node full name
+ string node_name = 3;
+ }
+}
+
message SetCMD {
repeated WatchNode watch_nodes = 1;
WatchCondition watch_condition = 2;
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index b9e9238034..77e75a5f19 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -45,6 +45,9 @@ Debugger::Debugger()
device_target_(""),
num_step_(0),
debugger_enabled_(false),
+ run_level_(""),
+ node_name_(""),
+ cur_name_(""),
is_dataset_graph_(false),
partial_memory_(false) {}
@@ -164,10 +167,46 @@ void Debugger::PostExecute() {
// access lock for public method
std::lock_guard a_lock(access_lock_);
// analyze tensor data and send the watchpoints been hit
+ if (run_level_ == "node") {
+ MS_LOG(INFO) << "Debugger is in node level mode ";
+ return;
+ }
if (debugger_enabled_ && !is_dataset_graph_) {
- num_step_++;
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
- SendWatchpointsAndSuspend(CheckWatchpoints());
+ CommandLoop();
+ }
+}
+
+bool Debugger::ReadNodeDataRequired() {
+ if (debugger_enabled_ && !is_dataset_graph_) {
+ auto watchpoint_table = debug_services_->GetWatchpointTable();
+ auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
+ // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data
+ if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void Debugger::PostExecuteNode() {
+ // access lock for public method
+ std::lock_guard a_lock(access_lock_);
+ if (debugger_enabled_ && !is_dataset_graph_) {
+ auto watchpoint_table = debug_services_->GetWatchpointTable();
+ auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
+ // if kernel is watchpoint,and get hit. suspend.
+ if (is_watchpoint) {
+ auto hits = CheckSingleWatchpoint(cur_name_);
+ if (!hits.empty()) {
+ SendWatchpointsAndSuspend(hits);
+ }
+ }
+ // if kernel is not watchpoint and is next_to or continue_to node, suspend.
+ if (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
+ CommandLoop();
+ }
+ return;
}
}
@@ -232,6 +271,8 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
Metadata metadata;
metadata.set_device_name(device_name);
metadata.set_cur_step(num_step_);
+ metadata.set_backend(device_target_);
+ metadata.set_cur_node(cur_name_);
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
if (reply_metadata.status() != reply_metadata.OK) {
MS_LOG(ERROR) << "Error: SendMetadata failed";
@@ -249,8 +290,11 @@ void Debugger::CommandLoop() {
// prepare metadata
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
Metadata metadata;
+
metadata.set_device_name(device_name);
metadata.set_cur_step(num_step_);
+ metadata.set_backend(device_target_);
+ metadata.set_cur_node(cur_name_);
// loop exit flag
bool run = false;
@@ -291,6 +335,16 @@ void Debugger::CommandLoop() {
break;
case DebuggerCommand::kRunCMD:
MS_LOG(INFO) << "RunCMD";
+ {
+ // print run cmd content
+ // get run_level and node_name
+ run_level_ = GetRunLevel(reply);
+ node_name_ = GetNodeName(reply);
+
+ MS_LOG(INFO) << "run_level: " << run_level_;
+ MS_LOG(INFO) << "node_name_: " << node_name_;
+ }
+
// exit loop
run = true;
break;
@@ -445,6 +499,35 @@ std::list Debugger::CheckWatchpoints() const {
return hits;
}
+std::list Debugger::CheckSingleWatchpoint(std::string watchnode) const {
+ auto tensor_loader = debug_services_->tensor_loader();
+ auto tensors = tensor_loader->GetNodeTensorMap(watchnode);
+ std::list hits;
+ for (std::vector>::iterator it = tensors.begin(); it != tensors.end(); ++it) {
+ auto cur_tensor = *it;
+ std::string name = "";
+ std::string slot = "";
+ char *data_ptr = nullptr;
+ unsigned int data_size = 0;
+ int condition = -1;
+ unsigned int watchpoint_id = -1;
+ WatchpointHit hit;
+ debug_services_->CheckSingleWatchpoint(cur_tensor, &name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
+ if (name != "") {
+ hit.set_id(watchpoint_id);
+ // here TensorProto act as a tensor indicator, not sending tensor content
+ TensorProto *tensor_item = hit.mutable_tensor();
+ tensor_item->set_node_name(name);
+ tensor_item->set_slot(slot);
+ tensor_item->set_finished(true);
+ WatchCondition *condition_item = hit.mutable_watch_condition();
+ condition_item->set_condition(debugger::WatchCondition_Condition(condition));
+ hits.push_back(hit);
+ }
+ }
+ return hits;
+}
+
void Debugger::SendWatchpointsAndSuspend(const std::list &points) {
// send info about watchpoint
if (!points.empty()) {
@@ -491,6 +574,24 @@ ProtoVector GetWatchnodes(const EventReply &reply) {
return reply.set_cmd().watch_nodes();
}
+std::string GetRunLevel(const EventReply &reply) {
+ if (!reply.has_run_cmd()) {
+ MS_LOG(ERROR) << "Error: Not RunCMD, can not get RunLevel. Returning default value: "
+ "";
+ return "";
+ }
+ return reply.run_cmd().run_level();
+}
+
+std::string GetNodeName(const EventReply &reply) {
+ if (!reply.has_run_cmd()) {
+ MS_LOG(ERROR) << "Error: Not RunCMD, can not get NodeName. Returning default value: "
+ "";
+ return "";
+ }
+ return reply.run_cmd().node_name();
+}
+
WatchCondition GetWatchcondition(const EventReply &reply) {
if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition().";
@@ -536,4 +637,20 @@ std::string GetTensorFullName(const TensorProto &tensor) {
bool Debugger::partial_memory() { return partial_memory_; }
+void Debugger::SetCurNode(std::string cur_name) {
+ // access lock for public method
+ std::lock_guard a_lock(access_lock_);
+ cur_name_ = cur_name;
+}
+
+std::string Debugger::run_level() const { return run_level_; }
+
+void Debugger::SetStepNum(int32_t cur_num_step) {
+ // access lock for public method
+ std::lock_guard a_lock(access_lock_);
+ num_step_ = cur_num_step;
+}
+
+int32_t Debugger::step_num() const { return num_step_; }
+
} // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h
index f72a3e038c..ea035708ea 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@@ -69,6 +69,10 @@ class Debugger : public std::enable_shared_from_this {
// don't need a graph_ptr because it is saved during pre_execute
void PostExecute();
+ bool ReadNodeDataRequired();
+
+ void PostExecuteNode();
+
// suspend the execution after a debug_op
void PostDebugOp();
@@ -78,6 +82,14 @@ class Debugger : public std::enable_shared_from_this {
bool partial_memory();
+ void SetCurNode(std::string cur_name);
+
+ std::string run_level() const;
+
+ void SetStepNum(int32_t cur_num_step);
+
+ int32_t step_num() const;
+
private:
// private constructor for singleton
Debugger();
@@ -119,6 +131,7 @@ class Debugger : public std::enable_shared_from_this {
// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std::list CheckWatchpoints() const;
+ std::list CheckSingleWatchpoint(std::string watchnode) const;
// send watchpoints that hit and enter command wait loop
void SendWatchpointsAndSuspend(const std::list &points);
@@ -131,6 +144,9 @@ class Debugger : public std::enable_shared_from_this {
std::string device_target_;
int32_t num_step_;
bool debugger_enabled_;
+ std::string run_level_;
+ std::string node_name_;
+ std::string cur_name_;
bool is_dataset_graph_;
bool partial_memory_;
std::mutex access_lock_;
@@ -154,6 +170,8 @@ DebuggerCommand GetCommand(const EventReply &reply);
// parse other data out of EventReply
ProtoVector GetWatchnodes(const EventReply &reply);
+std::string GetNodeName(const EventReply &reply);
+std::string GetRunLevel(const EventReply &reply);
WatchCondition GetWatchcondition(const EventReply &reply);
int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h
index 7215b9a624..8c4072ec49 100644
--- a/mindspore/ccsrc/debug/tensor_load.h
+++ b/mindspore/ccsrc/debug/tensor_load.h
@@ -47,6 +47,9 @@ class TensorLoader {
}
tensor_list.push_back(tensor);
tensor_list_map.insert({tensor->GetName(), tensor});
+ auto node_name = tensor->GetName();
+ node_name = node_name.substr(0, node_name.find_first_of(":"));
+ node_tensor_map.insert({node_name, tensor});
return true;
}
std::vector> GetTensor() { return tensor_list; }
@@ -54,6 +57,17 @@ class TensorLoader {
uint32_t GetIterNum() { return iter_num; }
std::map> GetTensorMap() { return tensor_list_map; }
+
+ std::vector> GetNodeTensorMap(std::string node_name) {
+ std::vector> tensors;
+ for (auto itr = node_tensor_map.begin(); itr != node_tensor_map.end(); itr++) {
+ if (itr->first == node_name) {
+ tensors.push_back(itr->second);
+ }
+ }
+ return tensors;
+ }
+
void SearchTensors(const std::vector &search_list,
std::vector>> *result_list) {
for (auto i : search_list) {
@@ -70,6 +84,7 @@ class TensorLoader {
void EmptyTensor() {
std::lock_guard lg(lock_);
prev_tensor_list_map.clear();
+ node_tensor_map.clear();
tensor_list_map.swap(prev_tensor_list_map);
tensor_list.clear();
}
@@ -127,6 +142,7 @@ class TensorLoader {
private:
std::vector> tensor_list;
std::map> tensor_list_map;
+ std::multimap> node_tensor_map;
std::map> prev_tensor_list_map;
uint32_t iter_num;
std::mutex lock_;
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index 35fc90b7e4..c7fbda2dad 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -90,9 +90,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
tensor_data->SetTensor(out_tensor);
tensor_data->SetSlot(slot);
ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
-
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
-
return ret;
}
#endif
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index 664648351a..4f24b8e412 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -31,6 +31,9 @@
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
+#ifdef ENABLE_DEBUGGER
+#include "debug/debug_services.h"
+#endif
namespace mindspore {
namespace device {
@@ -221,10 +224,46 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
const std::vector &kernel_workspaces,
const std::vector &kernel_outputs, int exec_order, void *stream_ptr,
bool dump_enabled) {
- if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
+ // check if we should read the kernel data
+ bool read_data = false;
+ std::string kernel_name = kernel->fullname_with_scope();
+ if (debugger) {
+ debugger->SetCurNode(kernel_name);
+ if (dump_enabled) {
+ read_data = true;
+ } else if (debugger->debugger_enabled()) {
+ read_data = debugger->ReadNodeDataRequired();
+ }
+ }
+
+ if (!read_data) {
return;
}
- std::string kernel_name = kernel->fullname_with_scope();
+
+ // get inputs
+ if (!dump_enabled) {
+ auto input_size = AnfAlgo::GetInputTensorNum(kernel);
+ for (size_t j = 0; j < input_size; ++j) {
+ auto input_kernel = kernel->input(j + 1);
+ std::string input_kernel_name = input_kernel->fullname_with_scope();
+ auto addr = kernel_inputs[j];
+ auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
+ auto format = kOpFormat_DEFAULT;
+ auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type);
+ string input_tensor_name = input_kernel_name + ':' + "0";
+ std::vector int_shapes;
+ auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
+ (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+ [](size_t inner_item) { return SizeToInt(inner_item); });
+ auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
+ if (!ret) {
+ MS_LOG(ERROR) << "LoadMemToHost:"
+ << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
+ }
+ }
+ }
+
+ // get outputs
auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
for (size_t j = 0; j < output_size; ++j) {
auto addr = kernel_outputs[j];
@@ -242,11 +281,21 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
}
}
+
+ debugger->PostExecuteNode();
+}
+
+void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
+ if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
+ auto cur_step_num = debugger->step_num();
+ cur_step_num = cur_step_num + 1;
+ debugger->SetStepNum(cur_step_num);
+ }
}
void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
MS_EXCEPTION_IF_NULL(graph);
- if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
+ if (!(debugger && dump_enabled)) {
return;
}
const auto ¶meters = graph->inputs();
@@ -616,9 +665,13 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
#ifdef ENABLE_DEBUGGER
bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
+ if (!mock) {
+ UpdateStepNum(debugger, dump_enabled);
+ }
#endif
auto &kernels = graph->execution_order();
int exec_order = 1;
+
for (const auto &kernel : kernels) {
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
@@ -662,7 +715,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
}
if (!mock) {
#ifdef ENABLE_DEBUGGER
- // collect weights and bias
+ // collect weights and bias for dump mode
LoadParameters(graph, debugger, dump_enabled);
#endif
CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");