@@ -34,6 +34,15 @@
#include "debug/data_dump/e2e_dump.h"
#include "utils/config_manager.h"
#include "debug/env_config_parser.h"
#include "utils/comm_manager.h"
#include "runtime/framework/actor/actor_common.h"
#include "runtime/hardware/device_context_manager.h"
#include "debug/anf_ir_dump.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#else
#include "debug/debugger/proto_exporter_stub.h"
#endif
using debugger::Chunk;
using debugger::EventReply;
@@ -228,6 +237,9 @@ bool Debugger::CheckDebuggerDumpEnabled() const {
  // see if dump is enabled
  if (device_target_ == kGPUDevice) {
    return device::KernelRuntime::DumpDataEnabled();
  } else if (IsMindRTUsed()) {
    auto &dump_json_parser = DumpJsonParser::GetInstance();
    return dump_json_parser.e2e_dump_enabled();
  }
  return false;
}
@@ -289,8 +301,23 @@ void Debugger::Reset() {
  graph_ptr_list_.clear();
}
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  uint32_t graph_sum = graphs.size();
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    if (debugger_) {
      debugger_->PreExecute(graph, graph_sum);
    }
    DumpSetup(graph);
  }
}
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  CheckDatasetSinkMode();
  auto graph_id = graph_ptr->graph_id();
@@ -313,7 +340,6 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  if (!debugger_enabled_) {
    EnableDebugger();
  }
  if (debugger_enabled_) {
    if (graph_proto_list_.size()) {
      // only send compiled graphs once.
@@ -323,7 +349,9 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
      LoadParametersAndConst();
      // revert graph ptr to original value
      graph_ptr_ = dbg_graph_ptr;
      SendMultiGraphsAndSuspend(graph_proto_list_);
      graph_proto_list_.clear();
    } else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
      // stop only when receiving the first sub run graph for each step
@@ -351,6 +379,89 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
  // resets for the new graph
  suspended_at_last_kernel_ = 0;
}
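// Returns true only when e2e dump is enabled and the current iteration is one of the iterations
// selected in the dump json configuration.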
bool Debugger::DumpDataEnabledIteration() const {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (!dump_json_parser.e2e_dump_enabled()) {
    return false;
  }
  auto cur_iter = dump_json_parser.cur_dump_iter();
  if (dump_json_parser.IsDumpIter(cur_iter)) {
    return true;
  }
  return false;
}
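// Dump data for one kernel graph, using the rank id resolved from the device context; if no debugger
// backend is enabled, only advance the dump iteration counter.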
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  if (debugger_->DebuggerBackendEnabled()) {
    MS_EXCEPTION_IF_NULL(kernel_graph);
    E2eDump::DumpData(kernel_graph.get(), rank_id, debugger_.get());
  } else {
    DumpJsonParser::GetInstance().UpdateDumpIter();
  }
}
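// Prepare e2e dump for one kernel graph before its kernels execute; called from PreExecuteGraphDebugger
// for every graph of the step.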
void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  uint32_t rank_id = device_context->GetRankID();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  E2eDump::DumpSetup(kernel_graph.get(), rank_id);
  MS_LOG(INFO) << "Finish!";
}
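// When e2e dump is enabled, save the compile-time graph artifacts: the proto IR and readable IR under
// <dump_path>/rank_<id>/graphs, and the kernel execution order as a csv under <dump_path>/rank_<id>.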
void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  // This function is called by the new GPU runtime using MindRTBackend
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled()) {
    auto ms_context = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(ms_context);
    std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
    uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
    const auto &device_context =
      device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
    uint32_t rank_id = device_context->GetRankID();
    kernel_graph->set_root_graph_id(kernel_graph->graph_id());
    std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
    std::string target_dir = root_dir + "/graphs";
    std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
    DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
    DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
    DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
                      kernel_graph->execution_order());
  }
}
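// Called once per step after the actor set finishes: dump data for graphs whose iteration is selected for
// dumping (otherwise just advance the dump iteration) and run the debugger post-execution hook per graph (GPU only).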
void Debugger::PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
  // Only GPU is supported for MindRTBackend
  if (device_target_ != kGPUDevice) {
    return;
  }
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    bool dump_enabled = debugger_->DumpDataEnabledIteration();
    // use debugger for dump
    if (debugger_ && dump_enabled) {
      debugger_->Dump(graph);
    } else {
      DumpJsonParser::GetInstance().UpdateDumpIter();
    }
    if (debugger_) {
      debugger_->PostExecute();
    }
  }
}
void Debugger::PostExecute() {
  // access lock for public method
@@ -365,6 +476,7 @@ void Debugger::PostExecute() {
    num_step_++;
  }
  SendWatchpoints(CheckWatchpoints());
  // no need to suspend at each graph for GPU, suspension happens in PreExecute
  if (device_target_ != kGPUDevice) {
    CommandLoop();
@@ -388,7 +500,6 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  }
  return false;
}
void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
@@ -405,6 +516,7 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  if (!hits.empty()) {
    SendWatchpoints(hits);
    CommandLoop();
    hit_empty_flag = false;
  }
}
@@ -507,7 +619,6 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  return model.graph();
}
void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  if (SendMetadata(true)) {
    // send graph to MindInsight server
@@ -533,7 +644,9 @@ bool Debugger::SendMetadata(bool version_check) {
  MS_LOG(INFO) << "Is training done?" << training_done_;
  // set graph number to not_dataset_graph_sum_
  metadata.set_graph_num(not_dataset_graph_sum_);
  EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
  bool ret = false;
  if (reply_metadata.status() == reply_metadata.OK) {
    if (version_check) {
@@ -575,6 +688,7 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot
    auto graph_size = graph.ByteSize();
    if (graph_size > g_chunk_size) {
      auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);
      for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
        chunk.set_buffer(sub_graph_str[i]);
        chunked_graph_proto_list.push_back(chunk);
@@ -834,7 +948,6 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
  }
  return tensor_list;
}
void Debugger::Exit() {
  // clear resource before exit
  // debugger will notify main thread to exit because main thread can only exit at step boundary
@@ -1171,6 +1284,13 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
    return;
  }
  // When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
  if (IsMindRTUsed() && (device_target_ == kGPUDevice)) {
    if (!anf_node->isa<ValueNode>() &&
        !(anf_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
      return;
    }
  }
  // for parameters and value nodes, set its execution order to be 0;
  int exec_order = 0;
  std::string node_name = anf_node->fullname_with_scope();
@@ -1268,6 +1388,14 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
    ++num_step_;
  }
}
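// Step counter update for the MindRT GPU runtime, driven by DebugActor::DebugOnStepEnd instead of the kernel runtime.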
void Debugger::UpdateStepNumGPU() {
  // Update step number when DebugActor::DebugOnStepEnd is called
  if (device_target_ == kGPUDevice && (debugger_enabled_ || DumpDataEnabledIteration())) {
    // access lock for public method
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
  }
}
void Debugger::ClearCurrentData() {
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
@@ -73,6 +73,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // reset debugger
  void Reset();
  void PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
  // enable debugger
  // send graph and wait for command
  // do nothing if graph is set already
@@ -82,6 +83,16 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  // don't need a graph_ptr because it is saved during pre_execute
  void PostExecute();
  bool DumpDataEnabledIteration() const;
  void Dump(const KernelGraphPtr &kernel_graph) const;
  void DumpSetup(const KernelGraphPtr &kernel_graph) const;
  void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph);
  void PostExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
  bool ReadNodeDataRequired(const CNodePtr &kernel) const;
  void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);
@@ -132,6 +143,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void UpdateStepNum(const session::KernelGraph *graph);
  void UpdateStepNumGPU();
  void ClearCurrentData();
  void LoadGraphOutputs();
@@ -194,7 +207,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void ProcessKSetCMD(const EventReply &reply);
  // Process the KViewCMD
  void ProcessKViewCMD(const EventReply &reply);
  // set what nodes and conditions to watch
  void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                     const ProtoVector<WatchCondition_Parameter> &parameters);
@@ -228,6 +240,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index);
  // class members
  std::unique_ptr<GrpcClient> grpc_client_;
  std::unique_ptr<DebugServices> debug_services_;
  KernelGraphPtr graph_ptr_;
@@ -249,6 +262,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  std::map<uint32_t, std::string> overflow_bin_path_;
  // flag to keep track of the very first suspension of debugger
  bool initial_suspend_;
  std::list<GraphProto> graph_proto_list_;
  std::list<KernelGraphPtr> graph_ptr_list_;
@@ -261,9 +275,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
};
using DebuggerPtr = std::shared_ptr<Debugger>;
// get debugger ModelProto
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);
ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);
// for getting proto DataType from Type of Tensor
@@ -282,7 +296,6 @@ int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
bool GetMiVersionMatched(const EventReply &reply);
// get the full name of a tensor, which is the name used in TensorLoader
std::string GetTensorFullName(const TensorProto &tensor);
@@ -167,7 +167,7 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
}
void DeviceQueueDataSourceActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, data_kernel_, &launch_info_, device_context_, context, &GetAID());
}
void DeviceQueueDataSourceActor::OnDebugFinish(OpContext<DeviceTensor> *context) {
@@ -15,20 +15,134 @@
 */
#include "runtime/framework/actor/debug_actor.h"
#include <vector>
#include <memory>
#include <string>
#include "runtime/framework/actor/debug_aware_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#ifdef ENABLE_GPU
#include "debug/debugger/debugger.h"
#include "runtime/device/gpu/gpu_device_address.h"
using mindspore::kernel::AddressPtr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
#endif
namespace mindspore {
namespace runtime {
void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
                       const AID *from_aid) {
#ifdef ENABLE_GPU
static const size_t PARAMETER_OUTPUT_INDEX = 0;
std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  // define a vector containing the real output numbers
  std::vector<int> real_outputs;
  // P.BatchNorm is used for training and inference
  // can add the filter list for more operators here....
  if (node_name == "BatchNorm") {
    MS_LOG(INFO) << "Loading node named " << node_name;
    real_outputs.insert(real_outputs.end(), {0, 3, 4});
  } else {
    // by default, TensorLoader will load all outputs
    for (size_t j = 0; j < output_size; ++j) {
      real_outputs.push_back(j);
    }
  }
  return real_outputs;
}
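// Copy every input tensor of the kernel from device memory to the host tensor cache via
// GPUDeviceAddress::LoadMemToHost so the debugger / e2e dump can read it; inputs whose inferred
// type is kMetaTypeNone (e.g. Depend) are skipped.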
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get inputs
  auto kernel_inputs = launch_info_->inputs_;
  auto input_size = AnfAlgo::GetInputTensorNum(cnode);
  for (size_t j = 0; j < input_size; ++j) {
    auto input_kernel = cnode->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    auto addr = kernel_inputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string input_tensor_name = input_kernel_name + ':' + "0";
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
    auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order_, format, int_shapes, type, 0, true);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name:" << input_tensor_name << ", host_format:" << format << ".";
    }
  }
}
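// Same as LoadInputs but for the kernel's outputs; CheckRealOutput() filters out the outputs that are not
// real for ops such as BatchNorm before the data is copied to host.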
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) {
  // get outputs
  auto kernel_outputs = launch_info_->outputs_;
  auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
  auto node_name = AnfAlgo::GetCNodeName(cnode);
  std::string kernel_name = cnode->fullname_with_scope();
  std::vector<int> real_outputs = CheckRealOutput(node_name, output_size);
  for (int j : real_outputs) {
    auto addr = kernel_outputs[j];
    auto type = AnfAlgo::GetOutputInferDataType(cnode, j);
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto format = kOpFormat_DEFAULT;
    auto gpu_addr = std::make_unique<device::gpu::GPUDeviceAddress>(addr->addr, addr->size, format, type);
    string tensor_name = kernel_name + ':' + std::to_string(j);
    ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order_, format, int_shapes, type, j, false);
    if (!ret) {
      MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name:" << tensor_name << ", host_format:" << format << ".";
    }
  }
}
#endif
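// Runs synchronously right after a kernel is launched: decides from the dump config or the online debugger
// whether this kernel's tensors need to be read, loads its inputs/outputs to host, calls PostExecuteNode so
// watchpoints can be checked, and finally replies to the calling actor via OnDebugFinish.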
void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_,
                       const DeviceContext *device_context, OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  if (node->isa<CNode>()) {
    const auto &cnode = node->cast<CNodePtr>();
    auto debugger = Debugger::GetInstance();
    if (debugger) {
      std::string kernel_name = cnode->fullname_with_scope();
      debugger->SetCurNode(kernel_name);
      bool read_data = false;
      auto &dump_json_parser = DumpJsonParser::GetInstance();
      bool dump_enabled = debugger->DumpDataEnabledIteration();
      if (dump_enabled) {
        auto dump_mode = dump_json_parser.dump_mode();
        // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list
        if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) {
          read_data = true;
        }
      } else if (debugger->debugger_enabled()) {
        read_data = debugger->ReadNodeDataRequired(cnode);
      }
      if (read_data) {
        if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
          LoadInputs(cnode, launch_info_, exec_order_);
        }
        if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
          LoadOutputs(cnode, launch_info_, exec_order_);
        }
        // check if the node is last kernel
        bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
        debugger->PostExecuteNode(cnode, last_kernel);
      }
    }
    exec_order_ += 1;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
@@ -36,8 +150,16 @@ void DebugActor::Debug(const AnfNodePtr &node, const DeviceContext *device_conte
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
  // todo debug.
#ifdef ENABLE_GPU
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::UpdateStepNumGPU();
    debugger->Debugger::LoadParametersAndConst();
    // Reset exec_order for the next step
    exec_order_ = 0;
  }
#endif
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
@@ -24,6 +24,7 @@
namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;
using mindspore::kernel::KernelLaunchInfo;
// The debug actor is used to debug and dump kernel info; it gets the kernel's real-time execution info on the device,
// so it is synchronous and blocking.
@@ -33,12 +34,17 @@ class DebugActor : public ActorBase {
  ~DebugActor() override = default;
  // The debug of each node.
  void Debug(const AnfNodePtr &node, const DeviceContext *device_context, OpContext<DeviceTensor> *op_context,
             const AID *from_aid);
  void Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_, const DeviceContext *device_context,
             OpContext<DeviceTensor> *op_context, const AID *from_aid);
  // The debug on step end.
  void DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid);
 private:
  // class members
  uint32_t exec_order_ = 0;
};
}  // namespace runtime
}  // namespace mindspore
@@ -169,7 +169,7 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *context) {
}
void KernelActor::SendDebugReq(OpContext<DeviceTensor> *context) {
  Async(*debug_aid_, &DebugActor::Debug, kernel_, device_context_, context, &GetAID());
  Async(*debug_aid_, &DebugActor::Debug, kernel_, &launch_info_, device_context_, context, &GetAID());
}
void KernelActor::OnDebugFinish(OpContext<DeviceTensor> *context) {
@@ -24,6 +24,10 @@
#include "ir/tensor.h"
#include "backend/optimizer/common/helper.h"
#include "base/base_ref_utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#include "debug/data_dump/dump_json_parser.h"
namespace mindspore {
namespace runtime {
@@ -278,6 +282,9 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(device_context);
  auto &json_parser = DumpJsonParser::GetInstance();
  json_parser.Parse();
  // Execute optimization pass.
  auto outputs_before_optimizer = AnfAlgo::GetAllOutputWithIndex(graph->output());
  device_context->OptimizeGraph(graph);
@@ -297,13 +304,20 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
  }
  graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  debugger->DumpInGraphCompiler(graph);
#endif
  MS_EXCEPTION_IF_NULL(session_);
  session_->InitAllBucket(graph, device_context);
  session_->SetSummaryNodes(graph.get());
  SetSummaryNodesRefCount(graph.get());
#ifdef ENABLE_DEBUGGER
  if (debugger && debugger->DebuggerBackendEnabled()) {
    debugger->LoadGraphs(graph);
  }
#endif
  return graph->graph_id();
}
@@ -31,7 +31,9 @@
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/recorder_manager.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace runtime {
namespace {
@@ -371,6 +373,18 @@ void GraphScheduler::Initialize() {
    (void)actorMgr->Spawn(base_recorder_actor, true);
  }
#endif
  // Create and schedule debug actor.
#ifdef ENABLE_DEBUGGER
  auto debugger = mindspore::Debugger::GetInstance();
  if (debugger->DebuggerBackendEnabled()) {
    auto debug_actor = std::make_shared<DebugActor>();
    MS_EXCEPTION_IF_NULL(debug_actor);
    debug_aid_ = &(debug_actor->GetAID());
    auto base_debug_actor = static_cast<ActorReference>(debug_actor);
    base_debug_actor->set_thread_pool(thread_pool_);
    (void)actorMgr->Spawn(base_debug_actor, true);
  }
#endif
}
ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info, GraphExecutionStrategy strategy) {
@@ -37,6 +37,7 @@
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "debug/rdr/running_data_recorder.h"
#include "utils/comm_manager.h"
#include "debug/debugger/debugger.h"
namespace mindspore {
namespace device {
@@ -91,6 +92,12 @@ bool GPUDeviceContext::Initialize() {
    (*init_nccl_comm_funcptr)();
  }
  auto rank_id = GetRankID();
  auto &json_parser = DumpJsonParser::GetInstance();
  // Dump json config file if dump is enabled
  json_parser.CopyJsonToDir(rank_id);
  json_parser.CopyMSCfgJsonToDir(rank_id);
  initialized_ = true;
  return ret;
}
@@ -125,6 +132,12 @@ bool GPUDeviceContext::InitDevice() {
void GPUDeviceContext::Destroy() {
  // Release GPU buffer manager resource
  auto debugger = Debugger::GetInstance();
  if (debugger && debugger->debugger_enabled()) {
    debugger->SetTrainingDone(true);
    debugger->SendMetadata(false);
  }
  if (GpuBufferMgr::GetInstance().IsInit()) {
    if (!GpuBufferMgr::GetInstance().IsClosed() && !GpuBufferMgr::GetInstance().CloseNotify()) {
      MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
@@ -36,7 +36,9 @@
#ifdef ENABLE_GE
#include "utils/callbacks_ge.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace compile {
bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
@@ -577,10 +579,24 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
  const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
  MS_EXCEPTION_IF_NULL(actor_set);
  runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);
  // PreExecuteGraph
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger) {
    debugger->Debugger::PreExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
    MS_LOG(EXCEPTION) << "The actor run failed, actor name: " << actor_set->name_;
  }
  // PostExecuteGraph
#ifdef ENABLE_DEBUGGER
  if (debugger) {
    debugger->Debugger::PostExecuteGraphDebugger(graph_compiler_info.graphs_);
  }
#endif
  // Sync device stream.
  const auto &first_device_context = graph_compiler_info.device_contexts_[0];
  MS_EXCEPTION_IF_NULL(first_device_context);
@@ -644,6 +660,15 @@ void MindRTBackend::ConstructOutputs(const AnfNodePtr &output_node,
  }
}
#ifdef ENABLE_DEBUGGER
void MindRTBackend::SetDebugger() {
  auto debugger_ = Debugger::GetInstance();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  debugger_->Init(device_id_, ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET));
}
#endif
std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) {
  MS_EXCEPTION_IF_NULL(root_graph);
  MS_EXCEPTION_IF_NULL(graph_compiler_);
@@ -118,6 +118,9 @@ class MindRTBackend : public Backend {
  // Run Graph in the pyNative mode.
  void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
                const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs);
#ifdef ENABLE_DEBUGGER
  void SetDebugger() override;
#endif
 private:
  // The parameter func_graph is a graph, it can be either a root graph or a sub graph,