@@ -171,7 +171,7 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
   device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
   // build kernel
   BuildKernel(root_graph);
-  if (debugger_) {
+  if (debugger_ && debugger_->partial_memory()) {
     debugger_->PreExecute(root_graph);
   }
   SetSummaryNodes(root_graph.get());
@@ -248,7 +248,7 @@ void AscendSession::BuildGraph(GraphId graph_id) {
   BuildKernel(graph);
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
-  if (debugger_) {
+  if (debugger_ && debugger_->partial_memory()) {
     debugger_->PreExecute(graph);
   }
   if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
@@ -312,6 +312,9 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
   }
   // load input data from user input
   LoadInputData(kernel_graph, inputs);
+  if (debugger_) {
+    debugger_->PreExecute(kernel_graph);
+  }
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
@@ -278,9 +278,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
 void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
   auto &kernel_graph = graphs_[graph_id];
-  PreIterationDbg(kernel_graph);
   // Load input data from user input
   LoadInputData(kernel_graph, inputs);
+  PreIterationDbg(kernel_graph);
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
   // Initialize parameter server
   InitPSParamAndOptim(kernel_graph, inputs);
@@ -22,7 +22,6 @@
 #include <utility>
 #include <memory>
 #include <map>
 #include "backend/session/session_context.h"
 #include "backend/session/kernel_graph.h"
 #include "backend/session/anf_runtime_algorithm.h"
@@ -30,6 +30,7 @@
 #include "pipeline/jit/pipeline.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "runtime/device/kernel_runtime_manager.h"
+#include "runtime/device/kernel_runtime.h"
 
 using debugger::EventReply;
 using debugger::GraphProto;
@@ -47,6 +48,7 @@ namespace mindspore {
 DebuggerPtr Debugger::debugger_ = nullptr;
 std::mutex Debugger::instance_lock_;
+static const size_t PRAMATER_OUTPUT_INDEX = 0;
 
 Debugger::Debugger()
     : grpc_client_(nullptr),
@@ -62,7 +64,26 @@ Debugger::Debugger()
       is_dataset_graph_(false),
       partial_memory_(false),
       last_overflow_bin_(0),
-      overflow_bin_path_("") {}
+      overflow_bin_path_("") {
+  if (CheckDebuggerEnabled()) {
+    // configure partial memory reuse
+    partial_memory_ = CheckDebuggerPartialMemoryEnabled();
+    // switch memory reuse on or off
+    auto context_ptr = MsContext::GetInstance();
+    MS_EXCEPTION_IF_NULL(context_ptr);
+    context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
+    // print some message about memory reuse to user
+    if (partial_memory_) {
+      MS_LOG(WARNING)
+        << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
+           "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
+    } else {
+      MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
+                      "usage for large models.";
+    }
+  }
+}
 
 void Debugger::Init(const uint32_t device_id, const std::string device_target) {
   // access lock for public method
@@ -133,27 +154,6 @@ void Debugger::EnableDebugger() {
     MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
     port = "50051";
   }
-  // configure partial memory reuse
-  const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
-  if (env_partial_mem_str != nullptr) {
-    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
-    if (std::strcmp(env_partial_mem_str, "1") == 0) {
-      partial_memory_ = true;
-    }
-  }
-  // switch memory reuse on or off
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  context_ptr->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, partial_memory_);
-  // print some message about memory reuse to user
-  if (partial_memory_) {
-    MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
-                       "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
-  } else {
-    MS_LOG(INFO) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
-                    "usage for large models.";
-  }
 #ifdef ENABLE_D
   // set operation overflow info
   overflow_bin_path_ = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
@@ -195,9 +195,7 @@ void Debugger::EnableDebugger() {
 bool Debugger::CheckDebuggerDumpEnabled() {
   // see if dump is enabled
   if (device_target_ == kGPUDevice) {
-    auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-    MS_EXCEPTION_IF_NULL(runtime_instance);
-    return runtime_instance->DumpDataEnabled();
+    return device::KernelRuntime::DumpDataEnabled();
   }
   return false;
 }
@@ -213,6 +211,17 @@ bool Debugger::CheckDebuggerEnabled() {
   return false;
 }
 
+bool Debugger::CheckDebuggerPartialMemoryEnabled() {
+  const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
+  if (env_partial_mem_str != nullptr) {
+    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
+    if (std::strcmp(env_partial_mem_str, "1") == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool Debugger::DebuggerBackendEnabled() { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
 
 void Debugger::Reset() {
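Note: the partial-memory switch consumed by the new constructor body above is a plain environment-variable flag, read once and treated as on only when the variable equals "1". A minimal standalone sketch of that pattern follows (ReadBoolEnvFlag and the main driver are illustrative names, not MindSpore API):

  // Standalone sketch of an env-var boolean flag, mirroring
  // CheckDebuggerPartialMemoryEnabled() above. Names are illustrative only.
  #include <cstdlib>
  #include <cstring>
  #include <iostream>

  // Returns true only when the variable exists and equals "1".
  static bool ReadBoolEnvFlag(const char *name) {
    const char *value = std::getenv(name);
    if (value == nullptr) {
      return false;
    }
    return std::strcmp(value, "1") == 0;
  }

  int main() {
    // e.g. export MS_DEBUGGER_PARTIAL_MEM=1 before running.
    bool partial_memory = ReadBoolEnvFlag("MS_DEBUGGER_PARTIAL_MEM");
    std::cout << "partial memory reuse: " << (partial_memory ? "on" : "off") << '\n';
    return 0;
  }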
@@ -324,6 +333,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
     // only try to enable debugger if it is not a dataset graph
     EnableDebugger();
     if (debugger_enabled_) {
+      LoadParameters();
       // get graph proto and send to mindinsight
       SendGraphAndSuspend(GetGraphProto());
     }
@@ -839,4 +849,34 @@ bool Debugger::CheckPort(const char *port) {
   return true;
 }
 
+void Debugger::LoadParameters() {
+  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
+  if (!(num_step_ == 0 || device_target_ == kAscendDevice ||
+        (device_target_ == kGPUDevice && device::KernelRuntime::DumpDataEnabledIteration())))
+    return;
+  MS_EXCEPTION_IF_NULL(graph_ptr_);
+  const auto &parameters = graph_ptr_->inputs();
+  // for parameters, set its execution order to be 0;
+  int exec_order = 0;
+  for (auto &item : parameters) {
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    std::string parameter_name = item->fullname_with_scope();
+    auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
+    auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
+    auto format = kOpFormat_DEFAULT;
+    string tensor_name = parameter_name + ':' + "0";
+    ShapeVector int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, true);
+    if (!ret) {
+      MS_LOG(ERROR) << "LoadMemToHost:"
+                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
+    }
+  }
+}
 } // namespace mindspore
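Note: the new Debugger::LoadParameters() reuses the shape-narrowing idiom that appears throughout these files, converting a size_t device shape into int entries via std::transform into a back_inserter. A minimal standalone sketch, with SizeToInt approximated by a plain static_cast (an assumption about its behaviour):

  // Standalone sketch of the size_t-to-int shape conversion used above.
  #include <algorithm>
  #include <cstddef>
  #include <iostream>
  #include <iterator>
  #include <vector>

  int main() {
    std::vector<size_t> device_shape = {1, 224, 224, 3};
    std::vector<int> int_shapes;
    // Same idiom as the diff: transform into a back_inserter so the
    // destination grows as elements are converted.
    (void)std::transform(device_shape.begin(), device_shape.end(), std::back_inserter(int_shapes),
                         [](size_t item) { return static_cast<int>(item); });
    for (int dim : int_shapes) {
      std::cout << dim << ' ';
    }
    std::cout << '\n';
    return 0;
  }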
@@ -103,6 +103,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   void SendMetadata();
 
+  void LoadParameters();
+
  private:
   // private constructor for singleton
   Debugger();
@@ -118,6 +120,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // check if debugger enabled
   bool CheckDebuggerEnabled();
 
+  bool CheckDebuggerPartialMemoryEnabled();
+
   // check and save graph pointer
   void CheckGraphPtr(const KernelGraphPtr &graph_ptr);
@@ -663,39 +663,25 @@ bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &file
 }
 #ifdef ENABLE_DEBUGGER
-bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tensor_name, int execution_order,
+bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order,
                                         const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
-                                        size_t slot, Debugger *debugger, bool keep_prev) const {
+                                        size_t slot, bool keep_prev) const {
   bool ret = false;
-  DebugServices *debug_services = debugger->debug_services();
-  MS_EXCEPTION_IF_NULL(debug_services);
-  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
   MS_EXCEPTION_IF_NULL(tensor_loader);
   // TensorData is freed up in AscendSession class
   auto tensor_data = std::make_shared<mindspore::TensorData>();
   tensor_data->SetName(tensor_name);
   tensor_data->SetExecutionOrder(execution_order);
   tensor_data->SetSlot(slot);
-  if (trans_flag) {
-    MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
-    mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
-    size_t host_size = out_tensor->data().nbytes();
-    ret = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
-    if (!ret) {
-      MS_LOG(ERROR) << "Copy device mem to host failed";
-      return ret;
-    }
-    tensor_data->SetTensor(out_tensor);
-  } else {
-    mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
-    size_t host_size = out_tensor->data().nbytes();
-    auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
-    if (ret_rt_memcpy != RT_ERROR_NONE) {
-      MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
-    }
-    MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
-    tensor_data->SetTensor(out_tensor);
+  mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
+  size_t host_size = out_tensor->data().nbytes();
+  auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
+  if (ret_rt_memcpy != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
   }
+  MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
+  tensor_data->SetTensor(out_tensor);
   ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
   return ret;
 }
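Note: the rework above drops the Debugger * parameter from LoadMemToHost and reaches the debugger through Debugger::GetInstance() inside the callee. A minimal standalone sketch of that refactor shape, using a simplified mutex-guarded singleton with made-up Logger/LoadMemToHost names (the real Debugger singleton differs in detail):

  // Standalone sketch: removing a pass-through pointer parameter in favour of
  // singleton access, as in the LoadMemToHost change above. Illustrative names.
  #include <iostream>
  #include <memory>
  #include <mutex>
  #include <string>

  class Logger {
   public:
    static std::shared_ptr<Logger> GetInstance() {
      std::lock_guard<std::mutex> lock(instance_lock_);
      if (instance_ == nullptr) {
        instance_ = std::shared_ptr<Logger>(new Logger());
      }
      return instance_;
    }
    void Record(const std::string &tensor_name) { std::cout << "loaded " << tensor_name << '\n'; }

   private:
    Logger() = default;
    static std::shared_ptr<Logger> instance_;
    static std::mutex instance_lock_;
  };
  std::shared_ptr<Logger> Logger::instance_ = nullptr;
  std::mutex Logger::instance_lock_;

  // Before: LoadMemToHost(..., Logger *logger, ...); every caller threaded the
  // pointer through. After: the callee fetches the singleton itself.
  bool LoadMemToHost(const std::string &tensor_name) {
    Logger::GetInstance()->Record(tensor_name);
    return true;
  }

  int main() {
    LoadMemToHost("conv1.weight:0");
    return 0;
  }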
@@ -45,9 +45,8 @@ class AscendDeviceAddress : public DeviceAddress {
   bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
                      const ShapeVector &host_shape, TypeId host_type) const override;
 #ifdef ENABLE_DEBUGGER
-  bool LoadMemToHost(bool dump_mode, const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
-                     bool keep_prev) const;
+  bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                     const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
 #endif
 
  private:
@@ -254,15 +254,10 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
       auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
       MS_EXCEPTION_IF_NULL(ascend_addr);
       ShapeVector int_shapes;
-      if (trans_flag) {
-        int_shapes = trans::GetRuntimePaddingShape(node, j);
-      } else {
-        auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
-        (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                             [](size_t inner_item) { return SizeToInt(inner_item); });
-      }
-      auto ret =
-        ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+      auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
+      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                           [](size_t inner_item) { return SizeToInt(inner_item); });
+      auto ret = ascend_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
                       << ", host_format:" << format << ".!";
@@ -272,40 +267,6 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   }
 }
 
-void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) {
-  MS_EXCEPTION_IF_NULL(graph);
-  // trans_flag: "true" means tensor values will be transfered to host format, otherwise not.
-  bool trans_flag = false;
-  const auto &parameters = graph->inputs();
-  // for parameters, set its execution order to be 0;
-  int exec_order = 0;
-  for (auto &item : parameters) {
-    if (!item->isa<Parameter>()) {
-      continue;
-    }
-    std::string parameter_name = item->fullname_with_scope();
-    auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
-    auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
-    auto format = kOpFormat_DEFAULT;
-    string tensor_name = parameter_name + ':' + "0";
-    auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
-    MS_EXCEPTION_IF_NULL(ascend_addr);
-    ShapeVector int_shapes;
-    if (trans_flag) {
-      int_shapes = trans::GetRuntimePaddingShape(item, PRAMATER_OUTPUT_INDEX);
-    } else {
-      auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
-      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                           [](size_t inner_item) { return SizeToInt(inner_item); });
-    }
-    auto ret =
-      ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
-    if (!ret) {
-      MS_LOG(ERROR) << "LoadMemToHost Failed: flag:" << trans_flag << ", path:" << tensor_name
-                    << ", host_format:" << format << ".!";
-    }
-  }
-}
 } // namespace
 #endif
@@ -319,7 +280,7 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debug
   // load output
   LoadOutput(graph, debugger);
   // load parameters
-  LoadParameters(graph, debugger);
+  if (debugger) debugger->LoadParameters();
 #endif
   return true;
 }
@@ -70,6 +70,12 @@ class DeviceAddress : public mindspore::DeviceSync {
                      const ShapeVector &host_shape, TypeId host_type) const {
     return true;
   }
+#ifdef ENABLE_DEBUGGER
+  virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
+                             const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const {
+    return true;
+  }
+#endif
 
  protected:
   const void *ptr() const { return ptr_; }
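Note: giving DeviceAddress a default virtual LoadMemToHost that simply returns true lets generic code call it through the base class while only the Ascend and GPU addresses override it. A minimal standalone sketch of that default-virtual pattern (DeviceBuffer and GpuBuffer are made-up names):

  // Standalone sketch of a base-class default virtual that only some
  // backends override, mirroring DeviceAddress::LoadMemToHost above.
  #include <iostream>
  #include <memory>
  #include <string>

  class DeviceBuffer {
   public:
    virtual ~DeviceBuffer() = default;
    // Default is a no-op "success", so generic code may call it unconditionally.
    virtual bool LoadMemToHost(const std::string &tensor_name) const { return true; }
  };

  class GpuBuffer : public DeviceBuffer {
   public:
    bool LoadMemToHost(const std::string &tensor_name) const override {
      std::cout << "copy " << tensor_name << " to host\n";
      return true;
    }
  };

  int main() {
    std::unique_ptr<DeviceBuffer> plain = std::make_unique<DeviceBuffer>();
    std::unique_ptr<DeviceBuffer> gpu = std::make_unique<GpuBuffer>();
    // Same call site works for both; only the GPU buffer actually copies.
    plain->LoadMemToHost("param:0");
    gpu->LoadMemToHost("param:0");
    return 0;
  }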
@@ -80,14 +80,14 @@ GPUDeviceAddress::~GPUDeviceAddress() {
 }
 #ifdef ENABLE_DEBUGGER
 bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
+                                     const ShapeVector &host_shape, TypeId host_type, size_t slot,
                                      bool keep_prev) const {
   bool ret = false;
   if (size_ == 0) {
     return true;
   }
-  DebugServices *debug_services = debugger->debug_services();
-  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  TensorLoader *tensor_loader = Debugger::GetInstance()->debug_services()->tensor_loader();
 
   mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
   size_t host_size = out_tensor->data().nbytes();
@@ -44,8 +44,7 @@ class GPUDeviceAddress : public DeviceAddress {
 #ifdef ENABLE_DEBUGGER
   bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                     const ShapeVector &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
-                     bool keep_prev) const;
+                     const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev) const override;
 #endif
 
  private:
  DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};
@@ -111,7 +111,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
+      auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost:"
                       << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@@ -130,7 +130,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
       auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j);
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
-      auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
+      auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost:"
                       << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@@ -148,36 +148,6 @@ void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
   }
 }
 
-void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
-  MS_EXCEPTION_IF_NULL(graph);
-  if (!(debugger && dump_enabled)) {
-    return;
-  }
-  const auto &parameters = graph->inputs();
-  // for parameters, set its execution order to be 0;
-  int exec_order = 0;
-  for (auto &item : parameters) {
-    if (!item->isa<Parameter>()) {
-      continue;
-    }
-    std::string parameter_name = item->fullname_with_scope();
-    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
-    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
-    auto format = kOpFormat_DEFAULT;
-    string tensor_name = parameter_name + ':' + "0";
-    auto gpu_addr = dynamic_cast<const mindspore::device::gpu::GPUDeviceAddress *>(addr);
-    ShapeVector int_shapes;
-    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
-    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
-                         [](size_t inner_item) { return SizeToInt(inner_item); });
-    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
-    if (!ret) {
-      MS_LOG(ERROR) << "LoadMemToHost:"
-                    << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
-    }
-  }
-}
 void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
   if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
     DebugServices *debug_services = debugger->debug_services();
@@ -601,7 +571,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
   }
   if (!mock) {
     // collect weights and bias for dump mode
-    LoadParameters(graph, debugger, dump_enabled);
+    if (debugger) debugger->LoadParameters();
     CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
   }
   ClearSwapInfo(mock);
@@ -53,8 +53,8 @@ class KernelRuntime {
   void RunOpAssignMemory(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
                          session::KernelGraph *graph);
   void RunOpClearMemory(const session::KernelGraph *graph);
-  bool DumpDataEnabled();
-  bool DumpDataEnabledIteration();
+  static bool DumpDataEnabled();
+  static bool DumpDataEnabledIteration();
   virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
   virtual bool Load(session::KernelGraph *graph, bool is_task_sink);
   virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
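Note: making DumpDataEnabled and DumpDataEnabledIteration static means callers such as Debugger::CheckDebuggerDumpEnabled() no longer need to fetch a KernelRuntime instance from the manager before asking the question. A minimal standalone sketch of the calling convention this enables (Runtime and its members are illustrative only):

  // Standalone sketch: a query that does not depend on per-instance state can
  // become static, so callers no longer need to locate an instance first.
  #include <iostream>

  class Runtime {
   public:
    // After the change: callable as Runtime::DumpDataEnabled().
    static bool DumpDataEnabled() {
      // In the real code this would consult dump configuration; here it is fixed.
      return false;
    }
    // Instance state is still used by other members.
    void Run() { std::cout << "running, dump=" << DumpDataEnabled() << '\n'; }
  };

  int main() {
    // No instance lookup needed for the query itself.
    if (!Runtime::DumpDataEnabled()) {
      std::cout << "dump disabled\n";
    }
    Runtime runtime;
    runtime.Run();
    return 0;
  }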