Merge pull request !7263 from Harshvardhan Gupta/add-dbg-runtime
@@ -20,8 +20,8 @@
 #include "runtime/device/ascend/ascend_stream_assign.h"
 #endif
 #ifdef ENABLE_DEBUGGER
-#include "debug/debugger/debugger.h"
 #include "debug/debug_services.h"
+#include "debug/debugger/debugger.h"
 #endif
 namespace mindspore {
@@ -82,9 +82,8 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
   auto debugger_ = mindspore::Debugger::GetInstance();
   if (debugger_->DebuggerBackendEnabled()) {
     DebugServices *debug_services = debugger_->debug_services();
-    auto watchpoint_table = debug_services->GetWatchpointTable();
     std::string current_kernel_name = kernel_curr->scope_full_name();
-    if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
+    if (debug_services->IsWatchPoint(current_kernel_name)) {
       return false;
     }
   }
@@ -602,7 +602,7 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
   tensor_loader->EmptyTensor();
   uint32_t iter_num = tensor_loader->GetIterNum();
   tensor_loader->set_iter_num(++iter_num);
-  (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
+  (void)runtime_instance->LoadData(kernel_graph.get());
   tensor_loader->EmptyPrevTensor();
 }
 #endif
@@ -221,11 +221,7 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
 void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-#ifdef ENABLE_DEBUGGER
-  if (!runtime_instance->Run(kernel_graph.get(), false, debugger_.get())) {
-#else
   if (!runtime_instance->Run(kernel_graph.get(), false)) {
-#endif
     MS_LOG(EXCEPTION) << "GPU execute graph failed!";
   }
 }
@@ -234,8 +234,7 @@ void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<
   }
 }
-bool DebugServices::IsWatchPoint(std::string kernel_name,
-                                 std::unordered_map<unsigned int, watchpoint_t> watchpoint_table) {
+bool DebugServices::IsWatchPoint(std::string kernel_name) {
   bool ret = false;
   for (auto w_table_item : watchpoint_table) {
     auto check_node_list = std::get<1>(w_table_item).check_node_list;
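Note on the hunk above: `IsWatchPoint` now reads the service's own watchpoint table instead of receiving a by-value `std::unordered_map` copy at every call site. A minimal, self-contained sketch of the pattern (not MindSpore code: `AddWatchpoint`, the member name `watchpoint_table_`, the exact-match rule, and the `main` scaffolding are illustrative assumptions):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct watchpoint_t {
  std::vector<std::string> check_node_list;  // node names watched by this point
};

class DebugServices {
 public:
  void AddWatchpoint(unsigned int id, watchpoint_t w) { watchpoint_table_[id] = std::move(w); }

  // New-style query: reads the member table, no map copy per call.
  bool IsWatchPoint(const std::string &kernel_name) const {
    for (const auto &item : watchpoint_table_) {
      for (const auto &node : item.second.check_node_list) {
        if (node == kernel_name) return true;  // real code may match scopes, not exact names
      }
    }
    return false;
  }

 private:
  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table_;
};

int main() {
  DebugServices svc;
  svc.AddWatchpoint(1, {{"Default/Conv2D-op1"}});
  std::cout << std::boolalpha << svc.IsWatchPoint("Default/Conv2D-op1") << '\n';  // true
}
```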
@@ -136,7 +136,7 @@ class DebugServices {
                         std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
                         std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape);
-  bool IsWatchPoint(std::string kernel_name, std::unordered_map<unsigned int, watchpoint_t> watchpoint_table);
+  bool IsWatchPoint(std::string kernel_name);
   TensorLoader *tensor_loader() const;
@@ -49,7 +49,7 @@ namespace mindspore {
 DebuggerPtr Debugger::debugger_ = nullptr;
 std::mutex Debugger::instance_lock_;
-static const size_t PRAMATER_OUTPUT_INDEX = 0;
+static const size_t PARAMETER_OUTPUT_INDEX = 0;
 static const size_t VALUE_NODE_OUTPUT_INDEX = 0;
 Debugger::Debugger()
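For context, `Debugger` is a process-wide singleton guarded by `instance_lock_` (the statics above). A self-contained analogue of that pattern combined with the guarded step counter this PR adds (`set_enabled` and the `main` driver are invented; the real `UpdateStepNum` also checks the device target and dump settings):

```cpp
#include <iostream>
#include <memory>
#include <mutex>

class Debugger {
 public:
  static std::shared_ptr<Debugger> GetInstance() {
    std::lock_guard<std::mutex> lock(instance_lock_);
    if (!instance_) instance_ = std::shared_ptr<Debugger>(new Debugger());
    return instance_;
  }
  void set_enabled(bool on) { enabled_ = on; }
  void UpdateStepNum() { if (enabled_) ++num_step_; }  // counts only when active
  int step_num() const { return num_step_; }

 private:
  Debugger() = default;  // private constructor for singleton
  static std::shared_ptr<Debugger> instance_;
  static std::mutex instance_lock_;
  bool enabled_ = false;
  int num_step_ = 0;
};
std::shared_ptr<Debugger> Debugger::instance_;
std::mutex Debugger::instance_lock_;

int main() {
  auto dbg = Debugger::GetInstance();
  dbg->UpdateStepNum();                  // ignored: debugger not enabled
  dbg->set_enabled(true);
  dbg->UpdateStepNum();                  // counted
  std::cout << dbg->step_num() << '\n';  // 1
}
```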
@@ -279,8 +279,7 @@ void Debugger::PostExecute() {
 bool Debugger::ReadNodeDataRequired() {
   if (debugger_enabled_ && !is_dataset_graph_) {
-    auto watchpoint_table = debug_services_->GetWatchpointTable();
-    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
+    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_);
     // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data
     if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) {
       return true;
@@ -296,8 +295,7 @@ void Debugger::PostExecuteNode() {
     return;
   }
   if (debugger_enabled_ && !is_dataset_graph_) {
-    auto watchpoint_table = debug_services_->GetWatchpointTable();
-    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
+    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_);
     // if the kernel is a watchpoint and it gets hit, suspend
     bool hit_empty_flag = true;
@@ -914,7 +912,7 @@ void Debugger::LoadParametersAndConst() {
   MS_LOG(INFO) << "Start to load Parameters!";
   const auto &parameters = graph_ptr_->inputs();
   for (auto &item : parameters) {
-    LoadSingleAnfnode(item, PRAMATER_OUTPUT_INDEX);
+    LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX);
   }
   // load value nodes
   // get all constant values from the graph
@@ -925,4 +923,50 @@ void Debugger::LoadParametersAndConst() {
   }
 }
+void Debugger::LoadGraphOutputs() {
+  if (!(debugger_enabled() && device_target_ == kAscendDevice)) return;
+  MS_EXCEPTION_IF_NULL(graph_ptr_);
+  const auto &apply_kernels = graph_ptr_->execution_order();
+  // for kernels, execution order starts from 1
+  int exec_order = 1;
+  for (const auto &node : apply_kernels) {
+    MS_EXCEPTION_IF_NULL(node);
+    auto node_name = AnfAlgo::GetCNodeName(node);
+    std::string kernel_name = node->fullname_with_scope();
+    auto output_size = AnfAlgo::GetOutputTensorNum(node);
+    if (partial_memory_) {
+      if (!debug_services_->IsWatchPoint(kernel_name)) {
+        continue;
+      }
+    }
+    for (size_t j = 0; j < output_size; ++j) {
+      auto addr = AnfAlgo::GetOutputAddr(node, j);
+      MS_EXCEPTION_IF_NULL(addr);
+      auto type = AnfAlgo::GetOutputInferDataType(node, j);
+      auto format = kOpFormat_DEFAULT;
+      string tensor_name = kernel_name + ':' + std::to_string(j);
+      ShapeVector int_shapes;
+      auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
+      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                           [](size_t inner_item) { return SizeToInt(inner_item); });
+      auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
+      if (!ret) {
+        MS_LOG(ERROR) << "LoadMemToHost:"
+                      << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+      }
+    }
+    exec_order = exec_order + 1;
+  }
+}
+void Debugger::UpdateStepNum() {
+  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
+    ++num_step_;
+}
+void Debugger::ClearCurrentData() {
+  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()))
+    debug_services_->tensor_loader()->EmptyCurrentTensor();
+}
 }  // namespace mindspore
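Two small conventions in the new `LoadGraphOutputs` are worth calling out: output tensors are keyed as `<kernel_full_name>:<output_index>`, and device shapes are converted element-wise from `size_t` to `int` before being handed to `LoadMemToHost`. A plain-C++ restatement (the kernel name, slot index, and `static_cast` standing in for `SizeToInt` are illustrative):

```cpp
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main() {
  std::string kernel_name = "Default/network/Conv2D-op42";  // hypothetical node
  size_t j = 0;                                             // output slot index
  std::string tensor_name = kernel_name + ':' + std::to_string(j);

  // Device shape arrives as size_t; the debugger-facing shape is int.
  std::vector<size_t> shape = {32, 3, 224, 224};
  std::vector<int> int_shapes;
  (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                       [](size_t item) { return static_cast<int>(item); });

  std::cout << tensor_name << " rank=" << int_shapes.size() << '\n';
}
```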
@@ -105,6 +105,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   void LoadParametersAndConst();
+  void UpdateStepNum();
+  void ClearCurrentData();
+  void LoadGraphOutputs();
 private:
   // private constructor for singleton
   Debugger();
@@ -263,6 +263,7 @@ bool AscendKernelRuntime::Init() {
   if (!ret) {
     return ret;
   }
+  SetDebugger();
   mem_manager_ = std::make_shared<AscendMemoryManager>();
   MS_EXCEPTION_IF_NULL(mem_manager_);
   mem_manager_->MallocDeviceMemory();
@@ -271,63 +272,16 @@ bool AscendKernelRuntime::Init() {
   return ret;
 }
-#ifdef ENABLE_DEBUGGER
-namespace {
-void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
-  MS_EXCEPTION_IF_NULL(graph);
-  // trans_flag: "true" means tensor values will be transferred to host format, otherwise not.
-  bool trans_flag = false;
-  const auto &apply_kernels = graph->execution_order();
-  // for kernels, execution order starts from 1
-  int exec_order = 1;
-  auto debugger_i = mindspore::Debugger::GetInstance();
-  DebugServices *debug_services = debugger_i->debug_services();
-  auto watchpoint_table = debug_services->GetWatchpointTable();
-  for (const auto &node : apply_kernels) {
-    MS_EXCEPTION_IF_NULL(node);
-    auto node_name = AnfAlgo::GetCNodeName(node);
-    std::string kernel_name = node->fullname_with_scope();
-    auto output_size = AnfAlgo::GetOutputTensorNum(node);
-    if (debugger_i->partial_memory()) {
-      if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
-        continue;
-      }
-    }
-    for (size_t j = 0; j < output_size; ++j) {
-      auto addr = AnfAlgo::GetOutputAddr(node, j);
-      auto type = AnfAlgo::GetOutputInferDataType(node, j);
-      auto format = kOpFormat_DEFAULT;
-      string tensor_name = kernel_name + ':' + std::to_string(j);
-      auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
-      MS_EXCEPTION_IF_NULL(ascend_addr);
-      ShapeVector int_shapes;
-      auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
-      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                           [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = ascend_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
-      if (!ret) {
-        MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
-                      << ", host_format:" << format << ".!";
-      }
-    }
-    exec_order = exec_order + 1;
-  }
-}
-}  // namespace
-#endif
-bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
+bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
 #ifdef ENABLE_DEBUGGER
-  debugger_ = debugger;
   MS_LOG(INFO) << "Start load step";
   uint32_t cur_iter = 0;
   MS_LOG(INFO) << "Cur iter is " << cur_iter;
   // load output
-  LoadOutput(graph, debugger);
+  debugger_->LoadGraphOutputs();
   // load parameters
-  if (debugger) debugger->LoadParametersAndConst();
+  debugger_->LoadParametersAndConst();
 #endif
   return true;
 }
@@ -550,7 +504,7 @@ void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) {
   }
 }
-bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
+bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) {
   bool ret = false;
 #if defined(_WIN32) || defined(_WIN64)
   auto start_time = std::chrono::steady_clock::now();
@@ -38,14 +38,14 @@ class AscendKernelRuntime : public KernelRuntime {
   AscendKernelRuntime() = default;
   ~AscendKernelRuntime() override;
   bool Init() override;
-  bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
+  bool LoadData(session::KernelGraph *graph) override;
   bool GenTask(const session::KernelGraph *graph);
   bool GenDynamicKernel(const session::KernelGraph *graph) override;
   bool RunDynamicKernelAsync(const session::KernelGraph *graph) override;
   bool LoadTask(const session::KernelGraph *graph);
   bool RunTask(const session::KernelGraph *graph);
   bool Load(session::KernelGraph *graph, bool is_task_sink) override;
-  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink) override;
   void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
                                  const std::unordered_set<ValueNodePtr> &value_nodes,
                                  const std::vector<CNodePtr> &execution_order) override;
@@ -324,7 +324,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
   resource_manager_.DecreaseSummaryRefCount(summary_outputs);
 }
-bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink, Debugger *debugger) {
+bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   resource_manager_.IncreaseAddressRefCount(kernel_graph);
@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
   ~CPUKernelRuntime() override = default;
   bool Init() override { return true; }
-  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink) override;
   void AssignKernelAddress(session::KernelGraph *kernel_graph);
   void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                            VectorRef *outputs);
@@ -73,6 +73,7 @@ bool GPUKernelRuntime::Init() {
     (*init_nccl_comm_funcptr)();
   }
   device_init_ = true;
+  SetDebugger();
   return ret;
 }
@@ -104,17 +105,15 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
   bool read_data = false;
   auto &dump_json_parser = DumpJsonParser::GetInstance();
   std::string kernel_name = kernel->fullname_with_scope();
-  if (debugger) {
-    debugger->SetCurNode(kernel_name);
-    if (dump_enabled) {
-      auto dump_mode = dump_json_parser.dump_mode();
-      // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list
-      if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) {
-        read_data = true;
-      }
-    } else if (debugger->debugger_enabled()) {
-      read_data = debugger->ReadNodeDataRequired();
+  debugger->SetCurNode(kernel_name);
+  if (dump_enabled) {
+    auto dump_mode = dump_json_parser.dump_mode();
+    // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list
+    if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) {
+      read_data = true;
+    }
+  } else if (debugger->debugger_enabled()) {
+    read_data = debugger->ReadNodeDataRequired();
   }
   if (!read_data) {
     return;
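The gating logic above is compact but easy to misread. Restated as a free function (illustrative only: `NeedReadData` and the `std::set` dump list are not the real `DumpJsonParser` API):

```cpp
#include <iostream>
#include <set>
#include <string>

// dump_mode 0: dump every kernel; dump_mode 1: only kernels in the dump list.
static bool NeedReadData(int dump_mode, const std::set<std::string> &dump_list,
                         const std::string &kernel_name) {
  if (dump_mode == 0) return true;
  if (dump_mode == 1) return dump_list.count(kernel_name) > 0;
  return false;
}

int main() {
  std::set<std::string> dump_list = {"Default/Conv2D-op1"};
  std::cout << std::boolalpha
            << NeedReadData(0, dump_list, "anything") << ' '     // true
            << NeedReadData(1, dump_list, "Default/Conv2D-op1")  // true
            << '\n';
}
```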
@@ -169,25 +168,8 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       }
     }
   }
   debugger->PostExecuteNode();
 }
-void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
-  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
-    auto cur_step_num = debugger->step_num();
-    cur_step_num = cur_step_num + 1;
-    debugger->SetStepNum(cur_step_num);
-  }
-}
-void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
-  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
-    DebugServices *debug_services = debugger->debug_services();
-    TensorLoader *tensor_loader = debug_services->tensor_loader();
-    tensor_loader->EmptyCurrentTensor();
-  }
-}
 }  // namespace
 DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -345,7 +327,7 @@ void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
   }
 }
-bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
+bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) {
   struct timeval start_time, end_time;
   (void)gettimeofday(&start_time, nullptr);
   bool ret = true;
@@ -368,7 +350,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debug
     mem_reuse_util_ = mem_reuse_iter->second;
     MS_EXCEPTION_IF_NULL(mem_reuse_util_);
-    ret = RunOneStep(graph, debugger);
+    ret = RunOneStep(graph);
   } else {
     py::gil_scoped_release gil_release;
     ret = LaunchKernel(graph);
@@ -381,28 +363,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debug
   return ret;
 }
-bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
+bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) {
   bool ret = true;
   auto graph_id = graph->graph_id();
   if (!is_first_step_map_[graph_id]) {
     // Normally run graph
-    ret = LaunchKernelDynamic(graph, debugger);
+    ret = LaunchKernelDynamic(graph);
   } else {
     // Mock run first step
-    ret = LaunchKernelDynamic(graph, debugger, true, false);
+    ret = LaunchKernelDynamic(graph, true, false);
     if (ret) {
       // Normally run graph
-      ret = LaunchKernelDynamic(graph, debugger);
+      ret = LaunchKernelDynamic(graph);
     } else {
       // Trigger memory swap
-      ret = SearchMemSwapScheme(graph, debugger);
+      ret = SearchMemSwapScheme(graph);
     }
     is_first_step_map_[graph_id] = false;
   }
   return ret;
 }
-bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
+bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
   MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
   bool ret = false;
   ClearKernelOldOutputAndWorkspace(graph);
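`RunOneStep` encodes a try-cheap-first strategy: on a graph's first step it mock-runs to validate memory, and only on failure pays for the swap-scheme search. A self-contained caricature of that control flow (all names and the forced mock failure are invented for illustration):

```cpp
#include <iostream>
#include <map>

// Toy launcher: the mock pass always reports OOM here so the fallback path
// is exercised; a real pass would actually try to allocate device memory.
static bool LaunchDynamic(bool mock) { return !mock; }
static bool SearchSwapScheme() { return LaunchDynamic(/*mock=*/false); }

static std::map<int, bool> is_first_step;  // graph_id -> first-step flag

static bool RunOneStep(int graph_id) {
  if (!is_first_step[graph_id]) return LaunchDynamic(/*mock=*/false);
  is_first_step[graph_id] = false;
  if (LaunchDynamic(/*mock=*/true)) return LaunchDynamic(/*mock=*/false);
  return SearchSwapScheme();  // mock run failed: trigger memory swap
}

int main() {
  is_first_step[0] = true;
  std::cout << std::boolalpha << RunOneStep(0) << '\n';  // true, via swap path
}
```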
@@ -416,7 +398,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, De
     if (!mem_swap_manager_->RetreatSwapInfo()) {
       return false;
     }
-    ret = LaunchKernelDynamic(graph, debugger, true, false);
+    ret = LaunchKernelDynamic(graph, true, false);
     if (!ret) {
       ClearKernelOldOutputAndWorkspace(graph);
     }
@@ -424,14 +406,14 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, De
   mem_swap_manager_->AssignHostMemory();
   // Time profiling
-  ret = LaunchKernelDynamic(graph, debugger, false, true);
+  ret = LaunchKernelDynamic(graph, false, true);
   if (!ret) {
     return ret;
   }
-  return RefineMemSwapScheme(graph, debugger);
+  return RefineMemSwapScheme(graph);
 }
-bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
+bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
   MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
   auto &kernels = graph->execution_order();
   for (const auto &kernel : kernels) {
@@ -444,7 +426,7 @@ bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, De
     bool ret = false;
     while (!ret) {
       mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
-      ret = LaunchKernelDynamic(graph, debugger, true, false);
+      ret = LaunchKernelDynamic(graph, true, false);
       if (!ret) {
         ClearKernelOldOutputAndWorkspace(graph);
         ClearSwapInfo(true);
@@ -583,8 +565,7 @@ void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *g
   }
 }
-bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock,
-                                           bool profiling) {
+bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bool mock, bool profiling) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(mem_reuse_util_);
   // Reset the reference count.
@@ -593,10 +574,9 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   AllocCommunicationOpDynamicRes(graph);
   AllocInplaceNodeMemory(graph);
-  debugger_ = debugger;
   bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
   if (!mock) {
-    UpdateStepNum(debugger, dump_enabled);
+    debugger_->UpdateStepNum();
   }
   auto &kernels = graph->execution_order();
   int exec_order = 1;
@@ -618,7 +598,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
       if (!ret) {
         if (!mock) {
           // invalidate current data collected by the debugger
-          ClearCurrentData(debugger, dump_enabled);
+          debugger_->ClearCurrentData();
         }
         return false;
       }
@@ -639,7 +619,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
         LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
       }
       // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
-      LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
+      LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
                      dump_enabled);
     }
     exec_order = exec_order + 1;
@@ -647,14 +627,14 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
       if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
         if (!mock) {
           // invalidate current data collected by the debugger
-          ClearCurrentData(debugger, dump_enabled);
+          debugger_->ClearCurrentData();
        }
        return false;
      }
   }
   if (!mock) {
     // collect weights and bias for dump mode
-    if (debugger) debugger->LoadParametersAndConst();
+    debugger_->LoadParametersAndConst();
     CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
   }
   ClearSwapInfo(mock);
@@ -42,7 +42,7 @@ class GPUKernelRuntime : public KernelRuntime {
                                 const std::unordered_set<ValueNodePtr> &value_nodes,
                                 const std::vector<CNodePtr> &execution_order) override;
   void AssignMemory(session::KernelGraph *graph) override;
-  bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override;
+  bool Run(session::KernelGraph *graph, bool is_task_sink) override;
   bool GenDynamicKernel(const session::KernelGraph *graph) override { return true; }
   bool RunDynamicKernelAsync(const session::KernelGraph *graph) override { return true; }
@@ -67,11 +67,10 @@ class GPUKernelRuntime : public KernelRuntime {
   void ClearKernelOutputAddress(const session::KernelGraph *graph);
   void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
   void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
-  bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
-  bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
-  bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
-  bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
-                           bool profiling = false);
+  bool RunOneStep(const session::KernelGraph *graph);
+  bool SearchMemSwapScheme(const session::KernelGraph *graph);
+  bool RefineMemSwapScheme(const session::KernelGraph *graph);
+  bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false);
   void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
                                      const AddressPtrList &workspace, const AddressPtrList &outputs);
   bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);
@@ -39,7 +39,7 @@ KernelRuntime::~KernelRuntime() {}
 bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; }
-bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) { return false; }
+bool KernelRuntime::LoadData(session::KernelGraph *graph) { return false; }
 bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) {
   MS_EXCEPTION_IF_NULL(kernel);
@@ -56,9 +56,9 @@ class KernelRuntime {
   void RunOpClearMemory(const session::KernelGraph *graph);
   static bool DumpDataEnabled();
   static bool DumpDataEnabledIteration();
-  virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
+  virtual bool LoadData(session::KernelGraph *graph);
   virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
-  virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
+  virtual bool Run(session::KernelGraph *graph, bool is_task_sink) = 0;
   virtual bool GenDynamicKernel(const session::KernelGraph *graph) = 0;
   virtual bool RunDynamicKernelAsync(const session::KernelGraph *graph) = 0;
   bool LaunchKernel(const session::KernelGraph *graph);
@@ -89,6 +89,13 @@ class KernelRuntime {
   uint32_t device_id() { return device_id_; }
   DeviceAddressPtr AssignSingleOpLaunchMemory(size_t size, const std::string &format, TypeId type);
+  // set debugger
+  void SetDebugger() {
+#if !defined(_WIN32) && !defined(_WIN64)
+    debugger_ = Debugger::GetInstance();
+#endif
+  }
 protected:
   virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                TypeId type_id) = 0;
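The hunk above and the member change below are the heart of the PR: every `KernelRuntime` now pulls a `std::shared_ptr<Debugger>` from the singleton in `SetDebugger()` instead of threading a raw `Debugger *` through `Run()`/`LoadData()`. A stripped-down, compilable sketch of that ownership model (class bodies reduced to the relevant members; everything else omitted):

```cpp
#include <memory>

class Debugger {
 public:
  static std::shared_ptr<Debugger> GetInstance() {
    static std::shared_ptr<Debugger> inst(new Debugger());
    return inst;
  }
 private:
  Debugger() = default;  // private constructor for singleton
};

class KernelRuntime {
 public:
  // Called from each backend's Init(); replaces the old debugger parameter.
  void SetDebugger() {
#if !defined(_WIN32) && !defined(_WIN64)
    debugger_ = Debugger::GetInstance();
#endif
  }
 protected:
#if !defined(_WIN32) && !defined(_WIN64)
  std::shared_ptr<Debugger> debugger_;  // shared with the Debugger singleton
#endif
};

int main() {
  KernelRuntime rt;
  rt.SetDebugger();  // rt now shares ownership of the process-wide debugger
}
```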
@@ -122,8 +129,8 @@ class KernelRuntime {
 protected:
   uint32_t device_id_{0};
-#ifdef ENABLE_DEBUGGER
-  Debugger *debugger_;
+#if !defined(_WIN32) && !defined(_WIN64)
+  std::shared_ptr<Debugger> debugger_;
 #endif
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};