From 7499c72d5402674543fc8240e355a84b95c34da4 Mon Sep 17 00:00:00 2001 From: lichen_101010 Date: Mon, 20 Jul 2020 19:22:43 -0400 Subject: [PATCH] mindspore grpc implementation fix bugs for grpc implementation addressed peer review comments delete device_target code from Adel add CheckSingleWatchpoint function for node level debugger set the device target when sending metadata add current node name fix bugs for current node name fix run_level_ bug fix bugs for CheckSingleWatchpoint fix multi-outputs node issue fix num_step_ bug fix continue_to previous node issue fix run_level issue fix merge conflict smart kernel read, watch hit stop mid-step, fix step number, read input tensors cleanup the code and isolate UpdateStepNum function do cpplint, Cppcheck and clang-format check recover CMakeLists.txt mindspore grpc implementation fix bugs for grpc implementation addressed peer review comments delete device_target code from Adel add CheckSingleWatchpoint function for node level debugger set the device target when sending metadata add current node name fix bugs for current node name fix run_level_ bug fix bugs for CheckSingleWatchpoint fix multi-outputs node issue fix num_step_ bug fix continue_to previous node issue fix run_level issue fix merge conflict smart kernel read, watch hit stop mid-step, fix step number, read input tensors cleanup the code and isolate UpdateStepNum function do cpplint, Cppcheck and clang-format check recover CMakeLists.txt only update step_num in one place fix clang-format error fix CI errors part2 update graphengine version addressed comments --- mindspore/ccsrc/debug/debug_services.cc | 55 ++++++++ mindspore/ccsrc/debug/debug_services.h | 3 + .../ccsrc/debug/debugger/debug_grpc.proto | 16 ++- mindspore/ccsrc/debug/debugger/debugger.cc | 121 +++++++++++++++++- mindspore/ccsrc/debug/debugger/debugger.h | 18 +++ mindspore/ccsrc/debug/tensor_load.h | 16 +++ .../runtime/device/gpu/gpu_device_address.cc | 2 - 
.../runtime/device/gpu/gpu_kernel_runtime.cc | 61 ++++++++- 8 files changed, 283 insertions(+), 9 deletions(-) diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc index cc6c5c53ad..1e99168c1e 100644 --- a/mindspore/ccsrc/debug/debug_services.cc +++ b/mindspore/ccsrc/debug/debug_services.cc @@ -171,6 +171,61 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector } } +void DebugServices::CheckSingleWatchpoint(std::shared_ptr watchtensor, std::string *name, std::string *slot, + char **data_ptr, unsigned int *data_size, int *condition, + unsigned int *wacthpoint_id) { + std::lock_guard lg(lock_); + + std::string current_watchtensor_name; + current_watchtensor_name = watchtensor->GetName(); + mindspore::tensor::TensorPtr tensor_ptr = watchtensor->GetTensor(); + int tensor_data_type = tensor_ptr->data_type_c(); + watchpoint_t watchpoint_to_check; + + for (auto w_table_item : watchpoint_table) { + auto check_node_list = std::get<1>(w_table_item).check_node_list; + for (auto check_node : check_node_list) { + std::string w_name = std::get<0>(check_node); + bool w_type = std::get<1>(check_node); + // get current the full info including condition, id..., for current watchtensor + std::string current_node_name = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":")); + if ((w_type == true && (current_watchtensor_name.find(w_name) != string::npos || w_name == "*")) || + (w_type == false && current_node_name == w_name)) { + watchpoint_to_check = w_table_item.second; + // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan + if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) { + return; + } + break; + } + } + } + + float *start_addr = reinterpret_cast(tensor_ptr->data_c()); + unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float); + + for (unsigned int index = 0; index < num_elements; 
index++) { + float x = start_addr[index]; + if (((watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) && isinf(x)) || + (watchpoint_to_check.conditions.nan.enabled && isnan(x))) { + std::string name_no_slot = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":")); + *name = name_no_slot; + *slot = std::to_string(watchtensor->GetSlot()); + *data_ptr = reinterpret_cast(tensor_ptr->data_c()); + *data_size = tensor_ptr->data().nbytes(); + int condition_item = -1; + if (watchpoint_to_check.conditions.nan.enabled) { + condition_item = 0; + } else if (watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) { + condition_item = 1; + } + *condition = condition_item; + + *wacthpoint_id = watchpoint_to_check.id; + } + } +} + void DebugServices::ReadNodesTensors(std::vector name, std::vector *ret_name, std::vector *data_ptr, std::vector *data_size, std::vector *dtype, std::vector> *shape) { diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h index f447ed17d8..b664a9b9e9 100644 --- a/mindspore/ccsrc/debug/debug_services.h +++ b/mindspore/ccsrc/debug/debug_services.h @@ -78,6 +78,9 @@ class DebugServices { std::vector *data_size, std::vector *condition, std::vector *wacthpoint_id); + void CheckSingleWatchpoint(std::shared_ptr watchnode, std::string *name, std::string *slot, + char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id); + void ReadNodesTensors(std::vector name, std::vector *ret_name, std::vector *data_ptr, std::vector *data_size, std::vector *dtype, std::vector> *shape); diff --git a/mindspore/ccsrc/debug/debugger/debug_grpc.proto b/mindspore/ccsrc/debug/debugger/debug_grpc.proto index f742987a4e..27c93787b8 100644 --- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto +++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto @@ -31,6 +31,10 @@ service EventListener { message Metadata { string 
device_name = 1; int32 cur_step = 2; + // define the backend is 'GPU' or "Ascend" + string backend = 3; + // the full name of current node + string cur_node = 4; } message EventReply { @@ -44,12 +48,22 @@ message EventReply { oneof cmd { bool exit = 2; - int32 run_cmd = 3; + RunCMD run_cmd = 3; SetCMD set_cmd = 4; ViewCMD view_cmd = 5; } } +message RunCMD { + // step level or node level. "step" or "node" + string run_level = 1; + oneof cmd { + int32 run_steps = 2; + // the next node full name + string node_name = 3; + } +} + message SetCMD { repeated WatchNode watch_nodes = 1; WatchCondition watch_condition = 2; diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index b9e9238034..77e75a5f19 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -45,6 +45,9 @@ Debugger::Debugger() device_target_(""), num_step_(0), debugger_enabled_(false), + run_level_(""), + node_name_(""), + cur_name_(""), is_dataset_graph_(false), partial_memory_(false) {} @@ -164,10 +167,46 @@ void Debugger::PostExecute() { // access lock for public method std::lock_guard a_lock(access_lock_); // analyze tensor data and send the watchpoints been hit + if (run_level_ == "node") { + MS_LOG(INFO) << "Debugger is in node level mode "; + return; + } if (debugger_enabled_ && !is_dataset_graph_) { - num_step_++; MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_; - SendWatchpointsAndSuspend(CheckWatchpoints()); + CommandLoop(); + } +} + +bool Debugger::ReadNodeDataRequired() { + if (debugger_enabled_ && !is_dataset_graph_) { + auto watchpoint_table = debug_services_->GetWatchpointTable(); + auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); + // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data + if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == 
cur_name_))) { + return true; + } + } + return false; +} + +void Debugger::PostExecuteNode() { + // access lock for public method + std::lock_guard a_lock(access_lock_); + if (debugger_enabled_ && !is_dataset_graph_) { + auto watchpoint_table = debug_services_->GetWatchpointTable(); + auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); + // if kernel is watchpoint,and get hit. suspend. + if (is_watchpoint) { + auto hits = CheckSingleWatchpoint(cur_name_); + if (!hits.empty()) { + SendWatchpointsAndSuspend(hits); + } + } + // if kernel is not watchpoint and is next_to or continue_to node, suspend. + if (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) { + CommandLoop(); + } + return; } } @@ -232,6 +271,8 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) { Metadata metadata; metadata.set_device_name(device_name); metadata.set_cur_step(num_step_); + metadata.set_backend(device_target_); + metadata.set_cur_node(cur_name_); EventReply reply_metadata = grpc_client_->SendMetadata(metadata); if (reply_metadata.status() != reply_metadata.OK) { MS_LOG(ERROR) << "Error: SendMetadata failed"; @@ -249,8 +290,11 @@ void Debugger::CommandLoop() { // prepare metadata std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id()); Metadata metadata; + metadata.set_device_name(device_name); metadata.set_cur_step(num_step_); + metadata.set_backend(device_target_); + metadata.set_cur_node(cur_name_); // loop exit flag bool run = false; @@ -291,6 +335,16 @@ void Debugger::CommandLoop() { break; case DebuggerCommand::kRunCMD: MS_LOG(INFO) << "RunCMD"; + { + // print run cmd content + // get run_level and node_name + run_level_ = GetRunLevel(reply); + node_name_ = GetNodeName(reply); + + MS_LOG(INFO) << "run_level: " << run_level_; + MS_LOG(INFO) << "node_name_: " << node_name_; + } + // exit loop run = true; break; @@ -445,6 +499,35 @@ std::list Debugger::CheckWatchpoints() const { 
return hits; } +std::list Debugger::CheckSingleWatchpoint(std::string watchnode) const { + auto tensor_loader = debug_services_->tensor_loader(); + auto tensors = tensor_loader->GetNodeTensorMap(watchnode); + std::list hits; + for (std::vector>::iterator it = tensors.begin(); it != tensors.end(); ++it) { + auto cur_tensor = *it; + std::string name = ""; + std::string slot = ""; + char *data_ptr = nullptr; + unsigned int data_size = 0; + int condition = -1; + unsigned int watchpoint_id = -1; + WatchpointHit hit; + debug_services_->CheckSingleWatchpoint(cur_tensor, &name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id); + if (name != "") { + hit.set_id(watchpoint_id); + // here TensorProto act as a tensor indicator, not sending tensor content + TensorProto *tensor_item = hit.mutable_tensor(); + tensor_item->set_node_name(name); + tensor_item->set_slot(slot); + tensor_item->set_finished(true); + WatchCondition *condition_item = hit.mutable_watch_condition(); + condition_item->set_condition(debugger::WatchCondition_Condition(condition)); + hits.push_back(hit); + } + } + return hits; +} + void Debugger::SendWatchpointsAndSuspend(const std::list &points) { // send info about watchpoint if (!points.empty()) { @@ -491,6 +574,24 @@ ProtoVector GetWatchnodes(const EventReply &reply) { return reply.set_cmd().watch_nodes(); } +std::string GetRunLevel(const EventReply &reply) { + if (!reply.has_run_cmd()) { + MS_LOG(ERROR) << "Error: Not RunCMD, can not get RunLevel. Returning default value: " + ""; + return ""; + } + return reply.run_cmd().run_level(); +} + +std::string GetNodeName(const EventReply &reply) { + if (!reply.has_run_cmd()) { + MS_LOG(ERROR) << "Error: Not RunCMD, can not get NodeName. 
Returning default value: " + ""; + return ""; + } + return reply.run_cmd().node_name(); +} + WatchCondition GetWatchcondition(const EventReply &reply) { if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) { MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition()."; @@ -536,4 +637,20 @@ std::string GetTensorFullName(const TensorProto &tensor) { bool Debugger::partial_memory() { return partial_memory_; } +void Debugger::SetCurNode(std::string cur_name) { + // access lock for public method + std::lock_guard a_lock(access_lock_); + cur_name_ = cur_name; +} + +std::string Debugger::run_level() const { return run_level_; } + +void Debugger::SetStepNum(int32_t cur_num_step) { + // access lock for public method + std::lock_guard a_lock(access_lock_); + num_step_ = cur_num_step; +} + +int32_t Debugger::step_num() const { return num_step_; } + } // namespace mindspore diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index f72a3e038c..ea035708ea 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -69,6 +69,10 @@ class Debugger : public std::enable_shared_from_this { // don't need a graph_ptr because it is saved during pre_execute void PostExecute(); + bool ReadNodeDataRequired(); + + void PostExecuteNode(); + // suspend the execution after a debug_op void PostDebugOp(); @@ -78,6 +82,14 @@ class Debugger : public std::enable_shared_from_this { bool partial_memory(); + void SetCurNode(std::string cur_name); + + std::string run_level() const; + + void SetStepNum(int32_t cur_num_step); + + int32_t step_num() const; + private: // private constructor for singleton Debugger(); @@ -119,6 +131,7 @@ class Debugger : public std::enable_shared_from_this { // analyze tensors and check watchpoint conditions // return names of tensors and what condition they hit std::list CheckWatchpoints() const; + std::list 
CheckSingleWatchpoint(std::string watchnode) const; // send watchpoints that hit and enter command wait loop void SendWatchpointsAndSuspend(const std::list &points); @@ -131,6 +144,9 @@ class Debugger : public std::enable_shared_from_this { std::string device_target_; int32_t num_step_; bool debugger_enabled_; + std::string run_level_; + std::string node_name_; + std::string cur_name_; bool is_dataset_graph_; bool partial_memory_; std::mutex access_lock_; @@ -154,6 +170,8 @@ DebuggerCommand GetCommand(const EventReply &reply); // parse other data out of EventReply ProtoVector GetWatchnodes(const EventReply &reply); +std::string GetNodeName(const EventReply &reply); +std::string GetRunLevel(const EventReply &reply); WatchCondition GetWatchcondition(const EventReply &reply); int32_t GetWatchpointID(const EventReply &reply); bool GetWatchpointDelete(const EventReply &reply); diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index 7215b9a624..8c4072ec49 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -47,6 +47,9 @@ class TensorLoader { } tensor_list.push_back(tensor); tensor_list_map.insert({tensor->GetName(), tensor}); + auto node_name = tensor->GetName(); + node_name = node_name.substr(0, node_name.find_first_of(":")); + node_tensor_map.insert({node_name, tensor}); return true; } std::vector> GetTensor() { return tensor_list; } @@ -54,6 +57,17 @@ class TensorLoader { uint32_t GetIterNum() { return iter_num; } std::map> GetTensorMap() { return tensor_list_map; } + + std::vector> GetNodeTensorMap(std::string node_name) { + std::vector> tensors; + for (auto itr = node_tensor_map.begin(); itr != node_tensor_map.end(); itr++) { + if (itr->first == node_name) { + tensors.push_back(itr->second); + } + } + return tensors; + } + void SearchTensors(const std::vector &search_list, std::vector>> *result_list) { for (auto i : search_list) { @@ -70,6 +84,7 @@ class TensorLoader { void EmptyTensor() 
{ std::lock_guard lg(lock_); prev_tensor_list_map.clear(); + node_tensor_map.clear(); tensor_list_map.swap(prev_tensor_list_map); tensor_list.clear(); } @@ -127,6 +142,7 @@ class TensorLoader { private: std::vector> tensor_list; std::map> tensor_list_map; + std::multimap> node_tensor_map; std::map> prev_tensor_list_map; uint32_t iter_num; std::mutex lock_; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc index 35fc90b7e4..c7fbda2dad 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc @@ -90,9 +90,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi tensor_data->SetTensor(out_tensor); tensor_data->SetSlot(slot); ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev); - MS_LOG(INFO) << "E2E tensor name is " << tensor_name; - return ret; } #endif diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 664648351a..4f24b8e412 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -31,6 +31,9 @@ #include "runtime/device/gpu/gpu_memory_copy_manager.h" #include "common/trans.h" #include "ir/dtype.h" +#ifdef ENABLE_DEBUGGER +#include "debug/debug_services.h" +#endif namespace mindspore { namespace device { @@ -221,10 +224,46 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, const std::vector &kernel_workspaces, const std::vector &kernel_outputs, int exec_order, void *stream_ptr, bool dump_enabled) { - if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) { + // check if we should read the kernel data + bool read_data = false; + std::string kernel_name = kernel->fullname_with_scope(); + if (debugger) { + debugger->SetCurNode(kernel_name); + if (dump_enabled) { + read_data = true; + } else if 
(debugger->debugger_enabled()) { + read_data = debugger->ReadNodeDataRequired(); + } + } + + if (!read_data) { return; } - std::string kernel_name = kernel->fullname_with_scope(); + + // get inputs + if (!dump_enabled) { + auto input_size = AnfAlgo::GetInputTensorNum(kernel); + for (size_t j = 0; j < input_size; ++j) { + auto input_kernel = kernel->input(j + 1); + std::string input_kernel_name = input_kernel->fullname_with_scope(); + auto addr = kernel_inputs[j]; + auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX); + auto format = kOpFormat_DEFAULT; + auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); + string input_tensor_name = input_kernel_name + ':' + "0"; + std::vector int_shapes; + auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX); + (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), + [](size_t inner_item) { return SizeToInt(inner_item); }); + auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false); + if (!ret) { + MS_LOG(ERROR) << "LoadMemToHost:" + << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!"; + } + } + } + + // get outputs auto output_size = AnfAlgo::GetOutputTensorNum(kernel); for (size_t j = 0; j < output_size; ++j) { auto addr = kernel_outputs[j]; @@ -242,11 +281,21 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; } } + + debugger->PostExecuteNode(); +} + +void UpdateStepNum(Debugger *debugger, bool dump_enabled) { + if (debugger && (debugger->debugger_enabled() || dump_enabled)) { + auto cur_step_num = debugger->step_num(); + cur_step_num = cur_step_num + 1; + debugger->SetStepNum(cur_step_num); + } } void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) { MS_EXCEPTION_IF_NULL(graph); - if (!(debugger && 
(debugger->debugger_enabled() || dump_enabled))) { + if (!(debugger && dump_enabled)) { return; } const auto ¶meters = graph->inputs(); @@ -616,9 +665,13 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De #ifdef ENABLE_DEBUGGER bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); + if (!mock) { + UpdateStepNum(debugger, dump_enabled); + } #endif auto &kernels = graph->execution_order(); int exec_order = 1; + for (const auto &kernel : kernels) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); @@ -662,7 +715,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De } if (!mock) { #ifdef ENABLE_DEBUGGER - // collect weights and bias + // collect weights and bias for dump mode LoadParameters(graph, debugger, dump_enabled); #endif CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");