Merge pull request !7263 from Harshvardhan Gupta/add-dbg-runtimetags/v1.1.0
| @@ -20,8 +20,8 @@ | |||||
| #include "runtime/device/ascend/ascend_stream_assign.h" | #include "runtime/device/ascend/ascend_stream_assign.h" | ||||
| #endif | #endif | ||||
| #ifdef ENABLE_DEBUGGER | #ifdef ENABLE_DEBUGGER | ||||
| #include "debug/debugger/debugger.h" | |||||
| #include "debug/debug_services.h" | #include "debug/debug_services.h" | ||||
| #include "debug/debugger/debugger.h" | |||||
| #endif | #endif | ||||
| namespace mindspore { | namespace mindspore { | ||||
| @@ -82,9 +82,8 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr | |||||
| auto debugger_ = mindspore::Debugger::GetInstance(); | auto debugger_ = mindspore::Debugger::GetInstance(); | ||||
| if (debugger_->DebuggerBackendEnabled()) { | if (debugger_->DebuggerBackendEnabled()) { | ||||
| DebugServices *debug_services = debugger_->debug_services(); | DebugServices *debug_services = debugger_->debug_services(); | ||||
| auto watchpoint_table = debug_services->GetWatchpointTable(); | |||||
| std::string current_kernel_name = kernel_curr->scope_full_name(); | std::string current_kernel_name = kernel_curr->scope_full_name(); | ||||
| if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) { | |||||
| if (debug_services->IsWatchPoint(current_kernel_name)) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| } | } | ||||
| @@ -602,7 +602,7 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) | |||||
| tensor_loader->EmptyTensor(); | tensor_loader->EmptyTensor(); | ||||
| uint32_t iter_num = tensor_loader->GetIterNum(); | uint32_t iter_num = tensor_loader->GetIterNum(); | ||||
| tensor_loader->set_iter_num(++iter_num); | tensor_loader->set_iter_num(++iter_num); | ||||
| (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get()); | |||||
| (void)runtime_instance->LoadData(kernel_graph.get()); | |||||
| tensor_loader->EmptyPrevTensor(); | tensor_loader->EmptyPrevTensor(); | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -221,11 +221,7 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, | |||||
| void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const { | void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const { | ||||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); | auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); | ||||
| MS_EXCEPTION_IF_NULL(runtime_instance); | MS_EXCEPTION_IF_NULL(runtime_instance); | ||||
| #ifdef ENABLE_DEBUGGER | |||||
| if (!runtime_instance->Run(kernel_graph.get(), false, debugger_.get())) { | |||||
| #else | |||||
| if (!runtime_instance->Run(kernel_graph.get(), false)) { | if (!runtime_instance->Run(kernel_graph.get(), false)) { | ||||
| #endif | |||||
| MS_LOG(EXCEPTION) << "GPU execute graph failed!"; | MS_LOG(EXCEPTION) << "GPU execute graph failed!"; | ||||
| } | } | ||||
| } | } | ||||
| @@ -234,8 +234,7 @@ void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector< | |||||
| } | } | ||||
| } | } | ||||
| bool DebugServices::IsWatchPoint(std::string kernel_name, | |||||
| std::unordered_map<unsigned int, watchpoint_t> watchpoint_table) { | |||||
| bool DebugServices::IsWatchPoint(std::string kernel_name) { | |||||
| bool ret = false; | bool ret = false; | ||||
| for (auto w_table_item : watchpoint_table) { | for (auto w_table_item : watchpoint_table) { | ||||
| auto check_node_list = std::get<1>(w_table_item).check_node_list; | auto check_node_list = std::get<1>(w_table_item).check_node_list; | ||||
| @@ -136,7 +136,7 @@ class DebugServices { | |||||
| std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size, | std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size, | ||||
| std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape); | std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape); | ||||
| bool IsWatchPoint(std::string kernel_name, std::unordered_map<unsigned int, watchpoint_t> watchpoint_table); | |||||
| bool IsWatchPoint(std::string kernel_name); | |||||
| TensorLoader *tensor_loader() const; | TensorLoader *tensor_loader() const; | ||||
| @@ -49,7 +49,7 @@ namespace mindspore { | |||||
| DebuggerPtr Debugger::debugger_ = nullptr; | DebuggerPtr Debugger::debugger_ = nullptr; | ||||
| std::mutex Debugger::instance_lock_; | std::mutex Debugger::instance_lock_; | ||||
| static const size_t PRAMATER_OUTPUT_INDEX = 0; | |||||
| static const size_t PARAMETER_OUTPUT_INDEX = 0; | |||||
| static const size_t VALUE_NODE_OUTPUT_INDEX = 0; | static const size_t VALUE_NODE_OUTPUT_INDEX = 0; | ||||
| Debugger::Debugger() | Debugger::Debugger() | ||||
| @@ -279,8 +279,7 @@ void Debugger::PostExecute() { | |||||
| bool Debugger::ReadNodeDataRequired() { | bool Debugger::ReadNodeDataRequired() { | ||||
| if (debugger_enabled_ && !is_dataset_graph_) { | if (debugger_enabled_ && !is_dataset_graph_) { | ||||
| auto watchpoint_table = debug_services_->GetWatchpointTable(); | |||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); | |||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_); | |||||
| // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data | // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data | ||||
| if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) { | if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) { | ||||
| return true; | return true; | ||||
| @@ -296,8 +295,7 @@ void Debugger::PostExecuteNode() { | |||||
| return; | return; | ||||
| } | } | ||||
| if (debugger_enabled_ && !is_dataset_graph_) { | if (debugger_enabled_ && !is_dataset_graph_) { | ||||
| auto watchpoint_table = debug_services_->GetWatchpointTable(); | |||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); | |||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_); | |||||
| // if kernel is watchpoint,and get hit. suspend. | // if kernel is watchpoint,and get hit. suspend. | ||||
| bool hit_empty_flag = true; | bool hit_empty_flag = true; | ||||
| @@ -914,7 +912,7 @@ void Debugger::LoadParametersAndConst() { | |||||
| MS_LOG(INFO) << "Start to load Parameters!"; | MS_LOG(INFO) << "Start to load Parameters!"; | ||||
| const auto &parameters = graph_ptr_->inputs(); | const auto &parameters = graph_ptr_->inputs(); | ||||
| for (auto &item : parameters) { | for (auto &item : parameters) { | ||||
| LoadSingleAnfnode(item, PRAMATER_OUTPUT_INDEX); | |||||
| LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX); | |||||
| } | } | ||||
| // load value nodes | // load value nodes | ||||
| // get all constant avlues from the graph | // get all constant avlues from the graph | ||||
| @@ -925,4 +923,50 @@ void Debugger::LoadParametersAndConst() { | |||||
| } | } | ||||
| } | } | ||||
| void Debugger::LoadGraphOutputs() { | |||||
| if (!(debugger_enabled() && device_target_ == kAscendDevice)) return; | |||||
| MS_EXCEPTION_IF_NULL(graph_ptr_); | |||||
| const auto &apply_kernels = graph_ptr_->execution_order(); | |||||
| // for kernels, execution order starts from 1 | |||||
| int exec_order = 1; | |||||
| for (const auto &node : apply_kernels) { | |||||
| MS_EXCEPTION_IF_NULL(node); | |||||
| auto node_name = AnfAlgo::GetCNodeName(node); | |||||
| std::string kernel_name = node->fullname_with_scope(); | |||||
| auto output_size = AnfAlgo::GetOutputTensorNum(node); | |||||
| if (partial_memory_) { | |||||
| if (!debug_services_->IsWatchPoint(kernel_name)) { | |||||
| continue; | |||||
| } | |||||
| } | |||||
| for (size_t j = 0; j < output_size; ++j) { | |||||
| auto addr = AnfAlgo::GetOutputAddr(node, j); | |||||
| MS_EXCEPTION_IF_NULL(addr); | |||||
| auto type = AnfAlgo::GetOutputInferDataType(node, j); | |||||
| auto format = kOpFormat_DEFAULT; | |||||
| string tensor_name = kernel_name + ':' + std::to_string(j); | |||||
| ShapeVector int_shapes; | |||||
| auto shape = AnfAlgo::GetOutputDeviceShape(node, j); | |||||
| (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), | |||||
| [](size_t inner_item) { return SizeToInt(inner_item); }); | |||||
| auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false); | |||||
| if (!ret) { | |||||
| MS_LOG(ERROR) << "LoadMemToHost:" | |||||
| << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; | |||||
| } | |||||
| } | |||||
| exec_order = exec_order + 1; | |||||
| } | |||||
| } | |||||
| void Debugger::UpdateStepNum() { | |||||
| if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration())) | |||||
| ++num_step_; | |||||
| } | |||||
| void Debugger::ClearCurrentData() { | |||||
| if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration())) | |||||
| debug_services_->tensor_loader()->EmptyCurrentTensor(); | |||||
| } | |||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -105,6 +105,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||||
| void LoadParametersAndConst(); | void LoadParametersAndConst(); | ||||
| void UpdateStepNum(); | |||||
| void ClearCurrentData(); | |||||
| void LoadGraphOutputs(); | |||||
| private: | private: | ||||
| // private constructor for singleton | // private constructor for singleton | ||||
| Debugger(); | Debugger(); | ||||
| @@ -263,6 +263,7 @@ bool AscendKernelRuntime::Init() { | |||||
| if (!ret) { | if (!ret) { | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| SetDebugger(); | |||||
| mem_manager_ = std::make_shared<AscendMemoryManager>(); | mem_manager_ = std::make_shared<AscendMemoryManager>(); | ||||
| MS_EXCEPTION_IF_NULL(mem_manager_); | MS_EXCEPTION_IF_NULL(mem_manager_); | ||||
| mem_manager_->MallocDeviceMemory(); | mem_manager_->MallocDeviceMemory(); | ||||
| @@ -271,63 +272,16 @@ bool AscendKernelRuntime::Init() { | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| #ifdef ENABLE_DEBUGGER | |||||
| namespace { | |||||
| void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) { | |||||
| MS_EXCEPTION_IF_NULL(graph); | |||||
| // trans_flag: "true" means tensor values will be transfered to host format, otherwise not. | |||||
| bool trans_flag = false; | |||||
| const auto &apply_kernels = graph->execution_order(); | |||||
| // for kernels, execution order starts from 1 | |||||
| int exec_order = 1; | |||||
| auto debugger_i = mindspore::Debugger::GetInstance(); | |||||
| DebugServices *debug_services = debugger_i->debug_services(); | |||||
| auto watchpoint_table = debug_services->GetWatchpointTable(); | |||||
| for (const auto &node : apply_kernels) { | |||||
| MS_EXCEPTION_IF_NULL(node); | |||||
| auto node_name = AnfAlgo::GetCNodeName(node); | |||||
| std::string kernel_name = node->fullname_with_scope(); | |||||
| auto output_size = AnfAlgo::GetOutputTensorNum(node); | |||||
| if (debugger_i->partial_memory()) { | |||||
| if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) { | |||||
| continue; | |||||
| } | |||||
| } | |||||
| for (size_t j = 0; j < output_size; ++j) { | |||||
| auto addr = AnfAlgo::GetOutputAddr(node, j); | |||||
| auto type = AnfAlgo::GetOutputInferDataType(node, j); | |||||
| auto format = kOpFormat_DEFAULT; | |||||
| string tensor_name = kernel_name + ':' + std::to_string(j); | |||||
| auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr); | |||||
| MS_EXCEPTION_IF_NULL(ascend_addr); | |||||
| ShapeVector int_shapes; | |||||
| auto shape = AnfAlgo::GetOutputDeviceShape(node, j); | |||||
| (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), | |||||
| [](size_t inner_item) { return SizeToInt(inner_item); }); | |||||
| auto ret = ascend_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false); | |||||
| if (!ret) { | |||||
| MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name | |||||
| << ", host_format:" << format << ".!"; | |||||
| } | |||||
| } | |||||
| exec_order = exec_order + 1; | |||||
| } | |||||
| } | |||||
| } // namespace | |||||
| #endif | |||||
| bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) { | |||||
| bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) { | |||||
| MS_EXCEPTION_IF_NULL(graph); | MS_EXCEPTION_IF_NULL(graph); | ||||
| #ifdef ENABLE_DEBUGGER | #ifdef ENABLE_DEBUGGER | ||||
| debugger_ = debugger; | |||||
| MS_LOG(INFO) << "Start load step"; | MS_LOG(INFO) << "Start load step"; | ||||
| uint32_t cur_iter = 0; | uint32_t cur_iter = 0; | ||||
| MS_LOG(INFO) << "Cur iter is " << cur_iter; | MS_LOG(INFO) << "Cur iter is " << cur_iter; | ||||
| // load output | // load output | ||||
| LoadOutput(graph, debugger); | |||||
| debugger_->LoadGraphOutputs(); | |||||
| // load parameters | // load parameters | ||||
| if (debugger) debugger->LoadParametersAndConst(); | |||||
| debugger_->LoadParametersAndConst(); | |||||
| #endif | #endif | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -550,7 +504,7 @@ void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) { | |||||
| } | } | ||||
| } | } | ||||
| bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) { | |||||
| bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) { | |||||
| bool ret = false; | bool ret = false; | ||||
| #if defined(_WIN32) || defined(_WIN64) | #if defined(_WIN32) || defined(_WIN64) | ||||
| auto start_time = std::chrono::steady_clock::now(); | auto start_time = std::chrono::steady_clock::now(); | ||||
| @@ -38,14 +38,14 @@ class AscendKernelRuntime : public KernelRuntime { | |||||
| AscendKernelRuntime() = default; | AscendKernelRuntime() = default; | ||||
| ~AscendKernelRuntime() override; | ~AscendKernelRuntime() override; | ||||
| bool Init() override; | bool Init() override; | ||||
| bool LoadData(session::KernelGraph *graph, Debugger *debugger) override; | |||||
| bool LoadData(session::KernelGraph *graph) override; | |||||
| bool GenTask(const session::KernelGraph *graph); | bool GenTask(const session::KernelGraph *graph); | ||||
| bool GenDynamicKernel(const session::KernelGraph *graph) override; | bool GenDynamicKernel(const session::KernelGraph *graph) override; | ||||
| bool RunDynamicKernelAsync(const session::KernelGraph *graph) override; | bool RunDynamicKernelAsync(const session::KernelGraph *graph) override; | ||||
| bool LoadTask(const session::KernelGraph *graph); | bool LoadTask(const session::KernelGraph *graph); | ||||
| bool RunTask(const session::KernelGraph *graph); | bool RunTask(const session::KernelGraph *graph); | ||||
| bool Load(session::KernelGraph *graph, bool is_task_sink) override; | bool Load(session::KernelGraph *graph, bool is_task_sink) override; | ||||
| bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override; | |||||
| bool Run(session::KernelGraph *graph, bool is_task_sink) override; | |||||
| void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs, | void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs, | ||||
| const std::unordered_set<ValueNodePtr> &value_nodes, | const std::unordered_set<ValueNodePtr> &value_nodes, | ||||
| const std::vector<CNodePtr> &execution_order) override; | const std::vector<CNodePtr> &execution_order) override; | ||||
| @@ -324,7 +324,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput | |||||
| resource_manager_.DecreaseSummaryRefCount(summary_outputs); | resource_manager_.DecreaseSummaryRefCount(summary_outputs); | ||||
| } | } | ||||
| bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink, Debugger *debugger) { | |||||
| bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink) { | |||||
| MS_EXCEPTION_IF_NULL(kernel_graph); | MS_EXCEPTION_IF_NULL(kernel_graph); | ||||
| resource_manager_.IncreaseAddressRefCount(kernel_graph); | resource_manager_.IncreaseAddressRefCount(kernel_graph); | ||||
| @@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime { | |||||
| ~CPUKernelRuntime() override = default; | ~CPUKernelRuntime() override = default; | ||||
| bool Init() override { return true; } | bool Init() override { return true; } | ||||
| bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override; | |||||
| bool Run(session::KernelGraph *graph, bool is_task_sink) override; | |||||
| void AssignKernelAddress(session::KernelGraph *kernel_graph); | void AssignKernelAddress(session::KernelGraph *kernel_graph); | ||||
| void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs, | void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs, | ||||
| VectorRef *outputs); | VectorRef *outputs); | ||||
| @@ -73,6 +73,7 @@ bool GPUKernelRuntime::Init() { | |||||
| (*init_nccl_comm_funcptr)(); | (*init_nccl_comm_funcptr)(); | ||||
| } | } | ||||
| device_init_ = true; | device_init_ = true; | ||||
| SetDebugger(); | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -104,17 +105,15 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, | |||||
| bool read_data = false; | bool read_data = false; | ||||
| auto &dump_json_parser = DumpJsonParser::GetInstance(); | auto &dump_json_parser = DumpJsonParser::GetInstance(); | ||||
| std::string kernel_name = kernel->fullname_with_scope(); | std::string kernel_name = kernel->fullname_with_scope(); | ||||
| if (debugger) { | |||||
| debugger->SetCurNode(kernel_name); | |||||
| if (dump_enabled) { | |||||
| auto dump_mode = dump_json_parser.dump_mode(); | |||||
| // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list | |||||
| if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) { | |||||
| read_data = true; | |||||
| } | |||||
| } else if (debugger->debugger_enabled()) { | |||||
| read_data = debugger->ReadNodeDataRequired(); | |||||
| debugger->SetCurNode(kernel_name); | |||||
| if (dump_enabled) { | |||||
| auto dump_mode = dump_json_parser.dump_mode(); | |||||
| // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list | |||||
| if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) { | |||||
| read_data = true; | |||||
| } | } | ||||
| } else if (debugger->debugger_enabled()) { | |||||
| read_data = debugger->ReadNodeDataRequired(); | |||||
| } | } | ||||
| if (!read_data) { | if (!read_data) { | ||||
| return; | return; | ||||
| @@ -169,25 +168,8 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| debugger->PostExecuteNode(); | debugger->PostExecuteNode(); | ||||
| } | } | ||||
| void UpdateStepNum(Debugger *debugger, bool dump_enabled) { | |||||
| if (debugger && (debugger->debugger_enabled() || dump_enabled)) { | |||||
| auto cur_step_num = debugger->step_num(); | |||||
| cur_step_num = cur_step_num + 1; | |||||
| debugger->SetStepNum(cur_step_num); | |||||
| } | |||||
| } | |||||
| void ClearCurrentData(Debugger *debugger, bool dump_enabled) { | |||||
| if (debugger && (debugger->debugger_enabled() || dump_enabled)) { | |||||
| DebugServices *debug_services = debugger->debug_services(); | |||||
| TensorLoader *tensor_loader = debug_services->tensor_loader(); | |||||
| tensor_loader->EmptyCurrentTensor(); | |||||
| } | |||||
| } | |||||
| } // namespace | } // namespace | ||||
| DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | ||||
| @@ -345,7 +327,7 @@ void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) { | |||||
| } | } | ||||
| } | } | ||||
| bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) { | |||||
| bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) { | |||||
| struct timeval start_time, end_time; | struct timeval start_time, end_time; | ||||
| (void)gettimeofday(&start_time, nullptr); | (void)gettimeofday(&start_time, nullptr); | ||||
| bool ret = true; | bool ret = true; | ||||
| @@ -368,7 +350,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debug | |||||
| mem_reuse_util_ = mem_reuse_iter->second; | mem_reuse_util_ = mem_reuse_iter->second; | ||||
| MS_EXCEPTION_IF_NULL(mem_reuse_util_); | MS_EXCEPTION_IF_NULL(mem_reuse_util_); | ||||
| ret = RunOneStep(graph, debugger); | |||||
| ret = RunOneStep(graph); | |||||
| } else { | } else { | ||||
| py::gil_scoped_release gil_release; | py::gil_scoped_release gil_release; | ||||
| ret = LaunchKernel(graph); | ret = LaunchKernel(graph); | ||||
| @@ -381,28 +363,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debug | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) { | |||||
| bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) { | |||||
| bool ret = true; | bool ret = true; | ||||
| auto graph_id = graph->graph_id(); | auto graph_id = graph->graph_id(); | ||||
| if (!is_first_step_map_[graph_id]) { | if (!is_first_step_map_[graph_id]) { | ||||
| // Normally run graph | // Normally run graph | ||||
| ret = LaunchKernelDynamic(graph, debugger); | |||||
| ret = LaunchKernelDynamic(graph); | |||||
| } else { | } else { | ||||
| // Mock run first step | // Mock run first step | ||||
| ret = LaunchKernelDynamic(graph, debugger, true, false); | |||||
| ret = LaunchKernelDynamic(graph, true, false); | |||||
| if (ret) { | if (ret) { | ||||
| // Normally run graph | // Normally run graph | ||||
| ret = LaunchKernelDynamic(graph, debugger); | |||||
| ret = LaunchKernelDynamic(graph); | |||||
| } else { | } else { | ||||
| // Trigger memory swap | // Trigger memory swap | ||||
| ret = SearchMemSwapScheme(graph, debugger); | |||||
| ret = SearchMemSwapScheme(graph); | |||||
| } | } | ||||
| is_first_step_map_[graph_id] = false; | is_first_step_map_[graph_id] = false; | ||||
| } | } | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) { | |||||
| bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) { | |||||
| MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment."; | MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment."; | ||||
| bool ret = false; | bool ret = false; | ||||
| ClearKernelOldOutputAndWorkspace(graph); | ClearKernelOldOutputAndWorkspace(graph); | ||||
| @@ -416,7 +398,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, De | |||||
| if (!mem_swap_manager_->RetreatSwapInfo()) { | if (!mem_swap_manager_->RetreatSwapInfo()) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| ret = LaunchKernelDynamic(graph, debugger, true, false); | |||||
| ret = LaunchKernelDynamic(graph, true, false); | |||||
| if (!ret) { | if (!ret) { | ||||
| ClearKernelOldOutputAndWorkspace(graph); | ClearKernelOldOutputAndWorkspace(graph); | ||||
| } | } | ||||
| @@ -424,14 +406,14 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, De | |||||
| mem_swap_manager_->AssignHostMemory(); | mem_swap_manager_->AssignHostMemory(); | ||||
| // Time profiling | // Time profiling | ||||
| ret = LaunchKernelDynamic(graph, debugger, false, true); | |||||
| ret = LaunchKernelDynamic(graph, false, true); | |||||
| if (!ret) { | if (!ret) { | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| return RefineMemSwapScheme(graph, debugger); | |||||
| return RefineMemSwapScheme(graph); | |||||
| } | } | ||||
| bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) { | |||||
| bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) { | |||||
| MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment."; | MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment."; | ||||
| auto &kernels = graph->execution_order(); | auto &kernels = graph->execution_order(); | ||||
| for (const auto &kernel : kernels) { | for (const auto &kernel : kernels) { | ||||
| @@ -444,7 +426,7 @@ bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, De | |||||
| bool ret = false; | bool ret = false; | ||||
| while (!ret) { | while (!ret) { | ||||
| mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx); | mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx); | ||||
| ret = LaunchKernelDynamic(graph, debugger, true, false); | |||||
| ret = LaunchKernelDynamic(graph, true, false); | |||||
| if (!ret) { | if (!ret) { | ||||
| ClearKernelOldOutputAndWorkspace(graph); | ClearKernelOldOutputAndWorkspace(graph); | ||||
| ClearSwapInfo(true); | ClearSwapInfo(true); | ||||
| @@ -583,8 +565,7 @@ void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *g | |||||
| } | } | ||||
| } | } | ||||
| bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock, | |||||
| bool profiling) { | |||||
| bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bool mock, bool profiling) { | |||||
| MS_EXCEPTION_IF_NULL(graph); | MS_EXCEPTION_IF_NULL(graph); | ||||
| MS_EXCEPTION_IF_NULL(mem_reuse_util_); | MS_EXCEPTION_IF_NULL(mem_reuse_util_); | ||||
| // Reset the reference count. | // Reset the reference count. | ||||
| @@ -593,10 +574,9 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De | |||||
| AllocCommunicationOpDynamicRes(graph); | AllocCommunicationOpDynamicRes(graph); | ||||
| AllocInplaceNodeMemory(graph); | AllocInplaceNodeMemory(graph); | ||||
| debugger_ = debugger; | |||||
| bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); | bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); | ||||
| if (!mock) { | if (!mock) { | ||||
| UpdateStepNum(debugger, dump_enabled); | |||||
| debugger_->UpdateStepNum(); | |||||
| } | } | ||||
| auto &kernels = graph->execution_order(); | auto &kernels = graph->execution_order(); | ||||
| int exec_order = 1; | int exec_order = 1; | ||||
| @@ -618,7 +598,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De | |||||
| if (!ret) { | if (!ret) { | ||||
| if (!mock) { | if (!mock) { | ||||
| // invalidate current data collected by the debugger | // invalidate current data collected by the debugger | ||||
| ClearCurrentData(debugger, dump_enabled); | |||||
| debugger_->ClearCurrentData(); | |||||
| } | } | ||||
| return false; | return false; | ||||
| } | } | ||||
| @@ -639,7 +619,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De | |||||
| LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); | LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); | ||||
| } | } | ||||
| // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) | // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) | ||||
| LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, | |||||
| LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, | |||||
| dump_enabled); | dump_enabled); | ||||
| } | } | ||||
| exec_order = exec_order + 1; | exec_order = exec_order + 1; | ||||
| @@ -647,14 +627,14 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De | |||||
| if (!UpdateMemorySwapTask(kernel, mock, profiling)) { | if (!UpdateMemorySwapTask(kernel, mock, profiling)) { | ||||
| if (!mock) { | if (!mock) { | ||||
| // invalidate current data collected by the debugger | // invalidate current data collected by the debugger | ||||
| ClearCurrentData(debugger, dump_enabled); | |||||
| debugger_->ClearCurrentData(); | |||||
| } | } | ||||
| return false; | return false; | ||||
| } | } | ||||
| } | } | ||||
| if (!mock) { | if (!mock) { | ||||
| // collect weights and bias for dump mode | // collect weights and bias for dump mode | ||||
| if (debugger) debugger->LoadParametersAndConst(); | |||||
| debugger_->LoadParametersAndConst(); | |||||
| CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); | CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); | ||||
| } | } | ||||
| ClearSwapInfo(mock); | ClearSwapInfo(mock); | ||||
| @@ -42,7 +42,7 @@ class GPUKernelRuntime : public KernelRuntime { | |||||
| const std::unordered_set<ValueNodePtr> &value_nodes, | const std::unordered_set<ValueNodePtr> &value_nodes, | ||||
| const std::vector<CNodePtr> &execution_order) override; | const std::vector<CNodePtr> &execution_order) override; | ||||
| void AssignMemory(session::KernelGraph *graph) override; | void AssignMemory(session::KernelGraph *graph) override; | ||||
| bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override; | |||||
| bool Run(session::KernelGraph *graph, bool is_task_sink) override; | |||||
| bool GenDynamicKernel(const session::KernelGraph *graph) override { return true; } | bool GenDynamicKernel(const session::KernelGraph *graph) override { return true; } | ||||
| bool RunDynamicKernelAsync(const session::KernelGraph *graph) override { return true; } | bool RunDynamicKernelAsync(const session::KernelGraph *graph) override { return true; } | ||||
| @@ -67,11 +67,10 @@ class GPUKernelRuntime : public KernelRuntime { | |||||
| void ClearKernelOutputAddress(const session::KernelGraph *graph); | void ClearKernelOutputAddress(const session::KernelGraph *graph); | ||||
| void ClearKernelWorkspaceAddress(const session::KernelGraph *graph); | void ClearKernelWorkspaceAddress(const session::KernelGraph *graph); | ||||
| void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph); | void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph); | ||||
| bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr); | |||||
| bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); | |||||
| bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); | |||||
| bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false, | |||||
| bool profiling = false); | |||||
| bool RunOneStep(const session::KernelGraph *graph); | |||||
| bool SearchMemSwapScheme(const session::KernelGraph *graph); | |||||
| bool RefineMemSwapScheme(const session::KernelGraph *graph); | |||||
| bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false); | |||||
| void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs, | void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs, | ||||
| const AddressPtrList &workspace, const AddressPtrList &outputs); | const AddressPtrList &workspace, const AddressPtrList &outputs); | ||||
| bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock); | bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock); | ||||
| @@ -39,7 +39,7 @@ KernelRuntime::~KernelRuntime() {} | |||||
| bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; } | bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; } | ||||
| bool KernelRuntime::LoadData(session::KernelGraph *graph, Debugger *debugger) { return false; } | |||||
| bool KernelRuntime::LoadData(session::KernelGraph *graph) { return false; } | |||||
| bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) { | bool KernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) { | ||||
| MS_EXCEPTION_IF_NULL(kernel); | MS_EXCEPTION_IF_NULL(kernel); | ||||
| @@ -56,9 +56,9 @@ class KernelRuntime { | |||||
| void RunOpClearMemory(const session::KernelGraph *graph); | void RunOpClearMemory(const session::KernelGraph *graph); | ||||
| static bool DumpDataEnabled(); | static bool DumpDataEnabled(); | ||||
| static bool DumpDataEnabledIteration(); | static bool DumpDataEnabledIteration(); | ||||
| virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger); | |||||
| virtual bool LoadData(session::KernelGraph *graph); | |||||
| virtual bool Load(session::KernelGraph *graph, bool is_task_sink); | virtual bool Load(session::KernelGraph *graph, bool is_task_sink); | ||||
| virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0; | |||||
| virtual bool Run(session::KernelGraph *graph, bool is_task_sink) = 0; | |||||
| virtual bool GenDynamicKernel(const session::KernelGraph *graph) = 0; | virtual bool GenDynamicKernel(const session::KernelGraph *graph) = 0; | ||||
| virtual bool RunDynamicKernelAsync(const session::KernelGraph *graph) = 0; | virtual bool RunDynamicKernelAsync(const session::KernelGraph *graph) = 0; | ||||
| bool LaunchKernel(const session::KernelGraph *graph); | bool LaunchKernel(const session::KernelGraph *graph); | ||||
| @@ -89,6 +89,13 @@ class KernelRuntime { | |||||
| uint32_t device_id() { return device_id_; } | uint32_t device_id() { return device_id_; } | ||||
| DeviceAddressPtr AssignSingleOpLaunchMemory(size_t size, const std::string &format, TypeId type); | DeviceAddressPtr AssignSingleOpLaunchMemory(size_t size, const std::string &format, TypeId type); | ||||
| // set debugger | |||||
| void SetDebugger() { | |||||
| #if !defined(_WIN32) && !defined(_WIN64) | |||||
| debugger_ = Debugger::GetInstance(); | |||||
| #endif | |||||
| } | |||||
| protected: | protected: | ||||
| virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | ||||
| TypeId type_id) = 0; | TypeId type_id) = 0; | ||||
| @@ -122,8 +129,8 @@ class KernelRuntime { | |||||
| protected: | protected: | ||||
| uint32_t device_id_{0}; | uint32_t device_id_{0}; | ||||
| #ifdef ENABLE_DEBUGGER | |||||
| Debugger *debugger_; | |||||
| #if !defined(_WIN32) && !defined(_WIN64) | |||||
| std::shared_ptr<Debugger> debugger_; | |||||
| #endif | #endif | ||||
| void *stream_ = nullptr; | void *stream_ = nullptr; | ||||
| std::shared_ptr<MemoryManager> mem_manager_{nullptr}; | std::shared_ptr<MemoryManager> mem_manager_{nullptr}; | ||||