Other Contributors: Adel Shafiei, John Tzanakakis — tags/v1.1.0
| @@ -160,6 +160,11 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { | |||
| HardwareOptimize(NOT_NULL(root_graph), NOT_NULL(&memo)); | |||
| memo.clear(); | |||
| // load graphs to debugger. | |||
| if (debugger_) { | |||
| LoadGraphsToDbg(NOT_NULL(root_graph), NOT_NULL(&memo)); | |||
| } | |||
| memo.clear(); | |||
| UpdateRefOutputMap(NOT_NULL(root_graph), NOT_NULL(&memo)); | |||
| memo.clear(); | |||
| @@ -191,7 +196,7 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { | |||
| // build kernel | |||
| BuildKernel(root_graph); | |||
| if (debugger_ && debugger_->partial_memory()) { | |||
| debugger_->PreExecute(root_graph); | |||
| debugger_->PreExecute(root_graph, graph_sum_); | |||
| } | |||
| SetSummaryNodes(root_graph.get()); | |||
| // Alloc memory for child graph's inputs | |||
| @@ -271,7 +276,7 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) { | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| if (debugger_ && debugger_->partial_memory()) { | |||
| debugger_->PreExecute(graph); | |||
| debugger_->PreExecute(graph, graph_sum_); | |||
| } | |||
| if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) { | |||
| MS_LOG(INFO) << "Precompile only, stop in build kernel step"; | |||
| @@ -329,7 +334,7 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens | |||
| // load input data from user input | |||
| LoadInputData(kernel_graph, inputs); | |||
| if (debugger_) { | |||
| debugger_->PreExecute(kernel_graph); | |||
| debugger_->PreExecute(kernel_graph, graph_sum_); | |||
| } | |||
| #if ENABLE_CPU && ENABLE_D | |||
| // Initialize parameter server | |||
| @@ -962,6 +967,23 @@ void AscendSession::HardwareOptimize(NotNull<KernelGraphPtr> graph, | |||
| MS_LOG(INFO) << "Finish doing HardwareOptimize in graph: " << graph->graph_id(); | |||
| } | |||
| // Hands `graph` and, recursively, every graph in its child_graph_order() to the debugger. | |||
| // `memo` holds the graphs that were already visited so each one is loaded only once. | |||
| void AscendSession::LoadGraphsToDbg(NotNull<KernelGraphPtr> graph, | |||
| NotNull<std::set<KernelGraphPtr> *> const memo) const { | |||
| // insertion fails when the graph is already in memo, i.e. it has been handled before | |||
| const bool first_visit = memo->insert(graph.get()).second; | |||
| if (!first_visit) { | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Start to do LoadGraphsToDbg in graph: " << graph->graph_id(); | |||
| debugger_->LoadGraphs(graph); | |||
| MS_LOG(INFO) << "graph_sum_: " << graph_sum_; | |||
| // descend into the child graphs in their recorded order | |||
| for (const auto &weak_child : graph->child_graph_order()) { | |||
| LoadGraphsToDbg(NOT_NULL(weak_child.lock()), memo); | |||
| } | |||
| MS_LOG(INFO) << "Finish doing LoadGraphsToDbg in graph: " << graph->graph_id(); | |||
| } | |||
| void AscendSession::AssignStaticMemory(NotNull<KernelGraphPtr> graph, | |||
| NotNull<std::set<KernelGraphPtr> *> const memo) const { | |||
| if (memo->find(graph) != memo->end()) { | |||
| @@ -125,6 +125,7 @@ class AscendSession : public SessionBasic { | |||
| size_t *const raise_precision_count, size_t *const reduce_precision_count) const; | |||
| void IrFusionPass(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo); | |||
| void HardwareOptimize(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const; | |||
| void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const; | |||
| void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const; | |||
| void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const; | |||
| @@ -333,12 +333,21 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr | |||
| } | |||
| // Alloc memory, including static memory and dynamic memory | |||
| AllocateMemory(graph.get()); | |||
| #ifdef ENABLE_DEBUGGER | |||
| if (debugger_) { | |||
| debugger_->LoadGraphs(graph); | |||
| } | |||
| #endif | |||
| MS_LOG(INFO) << "CompileGraph graph_id: " << graph_id; | |||
| return graph_id; | |||
| } | |||
| void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, | |||
| VectorRef *outputs) { | |||
| auto &kernel_graph = graphs_[graph_id]; | |||
| MS_LOG(INFO) << "RunGraph graph_id: " << graph_id; | |||
| // Load input data from user input | |||
| LoadInputData(kernel_graph, inputs); | |||
| PreIterationDbg(kernel_graph); | |||
| @@ -414,7 +423,7 @@ bool GPUSession::DumpDataEnabledIteration() const { | |||
| void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const { | |||
| if (debugger_) { | |||
| debugger_->PreExecute(kernel_graph); | |||
| debugger_->PreExecute(kernel_graph, graph_sum_); | |||
| } | |||
| PreLoadTensor(kernel_graph); | |||
| } | |||
| @@ -26,6 +26,7 @@ service EventListener { | |||
| rpc SendGraph (stream Chunk) returns (EventReply) {}; | |||
| rpc SendTensors (stream TensorProto) returns (EventReply) {}; | |||
| rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {}; | |||
| rpc SendMultiGraphs (stream Chunk) returns (EventReply) {}; | |||
| } | |||
| message Metadata { | |||
| @@ -36,11 +37,14 @@ message Metadata { | |||
| // the full name of current node | |||
| string cur_node = 4; | |||
| // check if training is done. | |||
| bool training_done = 5; | |||
| bool training_done = 5; | |||
| // the number of total graphs | |||
| int32 graph_num = 6; | |||
| } | |||
| message Chunk { | |||
| bytes buffer = 1; | |||
| bytes buffer = 1; | |||
| bool finished = 2; | |||
| } | |||
| message EventReply { | |||
| @@ -34,6 +34,7 @@ | |||
| #include "debug/data_dump/e2e_dump_util.h" | |||
| #include "utils/config_manager.h" | |||
| using debugger::Chunk; | |||
| using debugger::EventReply; | |||
| using debugger::GraphProto; | |||
| using debugger::ModelProto; | |||
| @@ -69,7 +70,8 @@ Debugger::Debugger() | |||
| partial_memory_(false), | |||
| last_overflow_bin_(0), | |||
| overflow_bin_path_(""), | |||
| initial_suspend_(true) { | |||
| initial_suspend_(true), | |||
| not_dataset_graph_sum_(0) { | |||
| if (CheckDebuggerEnabled()) { | |||
| // configure partial memory reuse | |||
| partial_memory_ = CheckDebuggerPartialMemoryEnabled(); | |||
| @@ -259,12 +261,47 @@ void Debugger::Reset() { | |||
| stream_task_to_opname_.clear(); | |||
| } | |||
| void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) { | |||
| void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) { | |||
| // access lock for public method | |||
| std::lock_guard<std::mutex> a_lock(access_lock_); | |||
| CheckDatasetSinkMode(); | |||
| if (debugger_->DebuggerBackendEnabled()) { | |||
| // check and save graph_ptr, suspend if graph is new | |||
| auto graph_id = graph_ptr->graph_id(); | |||
| // collect run graph ids to update step number in multigraph case | |||
| if (!rungraph_id_list_.size()) { | |||
| rungraph_id_list_.push_back(graph_id); | |||
| } else { | |||
| if (std::find(rungraph_id_list_.begin(), rungraph_id_list_.end(), graph_id) == rungraph_id_list_.end()) { | |||
| rungraph_id_list_.push_back(graph_id); | |||
| } | |||
| } | |||
| // check and save graph_ptr, suspend if graph is new | |||
| MS_LOG(INFO) << "total number graph: " << graph_sum; | |||
| // multiple graphs | |||
| if (graph_sum > 1) { | |||
| // at least one graph is not a dataset graph | |||
| if (not_dataset_graph_sum_ > 0) { | |||
| // only try to enable debugger if they are not all dataset graphs | |||
| if (!debugger_enabled_) { | |||
| EnableDebugger(); | |||
| } | |||
| if (debugger_enabled_) { | |||
| if (graph_proto_list_.size()) { | |||
| // only send compiled graphs once. | |||
| SendMultiGraphsAndSuspend(graph_proto_list_, graph_sum); | |||
| graph_proto_list_.clear(); | |||
| } else if (graph_id == rungraph_id_list_.front()) { | |||
| // stop only when receive the first sub run graph for each step | |||
| CommandLoop(); | |||
| } | |||
| } | |||
| } | |||
| } else if (graph_proto_list_.size() == 1) { | |||
| // In single graph case, reset graph_ptr_ to be nullptr for the initial step | |||
| if (num_step_ == 0) { | |||
| graph_ptr_ = nullptr; | |||
| } | |||
| CheckGraphPtr(graph_ptr); | |||
| } | |||
| } | |||
| @@ -346,20 +383,38 @@ void Debugger::SetStreamTaskToOpnameMap(const std::map<std::pair<uint32_t, uint3 | |||
| stream_task_to_opname_ = mapping; | |||
| } | |||
| void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) { | |||
| void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) { | |||
| if (graph_ptr_ != graph_ptr) { | |||
| MS_LOG(INFO) << "Debugger got new graph: " << graph_ptr->graph_id(); | |||
| MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id(); | |||
| // save new graph_ptr | |||
| graph_ptr_ = graph_ptr; | |||
| // check if it is dataset graph | |||
| CheckDatasetGraph(); | |||
| if (!is_dataset_graph_) { | |||
| // get proto for new graph_ptr | |||
| auto graph_proto = GetGraphProto(graph_ptr); | |||
| // add new graph proto to graph_proto_list_ | |||
| graph_proto_list_.push_back(graph_proto); | |||
| not_dataset_graph_sum_++; | |||
| } | |||
| // reset is_dataset_graph to be false | |||
| is_dataset_graph_ = false; | |||
| } | |||
| } | |||
| // In single graph cases, check single graph ptr | |||
| void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) { | |||
| if (graph_ptr_ != graph_ptr) { | |||
| MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id(); | |||
| // save new graph_ptr | |||
| graph_ptr_ = graph_ptr; | |||
| if (!is_dataset_graph_) { | |||
| // only try to enable debugger if it is not a dataset graph | |||
| EnableDebugger(); | |||
| if (debugger_enabled_) { | |||
| LoadParametersAndConst(); | |||
| // get graph proto and send to mindinsight | |||
| SendGraphAndSuspend(GetGraphProto()); | |||
| auto graph_proto = graph_proto_list_.front(); | |||
| SendGraphAndSuspend(graph_proto); | |||
| } | |||
| } | |||
| } | |||
| @@ -386,7 +441,7 @@ void Debugger::CheckDatasetGraph() { | |||
| is_dataset_graph_ = false; | |||
| } | |||
| GraphProto Debugger::GetGraphProto() const { | |||
| GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const { | |||
| // convert kernel graph to debugger modelproto | |||
| ModelProto model = GetDebuggerFuncGraphProto(graph_ptr_); | |||
| return model.graph(); | |||
| @@ -413,12 +468,49 @@ void Debugger::SendMetadata() { | |||
| metadata.set_cur_node(cur_name_); | |||
| metadata.set_training_done(training_done_); | |||
| MS_LOG(INFO) << "Is training done?" << training_done_; | |||
| // set graph number to not_dataset_graph_sum_ | |||
| metadata.set_graph_num(not_dataset_graph_sum_); | |||
| EventReply reply_metadata = grpc_client_->SendMetadata(metadata); | |||
| if (reply_metadata.status() != reply_metadata.OK) { | |||
| MS_LOG(ERROR) << "Error: SendMetadata failed"; | |||
| } | |||
| } | |||
| // Sends the metadata, then streams every graph in graph_proto_list to the MindInsight server | |||
| // and finally enters the command loop. | |||
| // A graph whose serialized size exceeds CHUNK_SIZE is split into several chunks; exactly one chunk | |||
| // per graph (the last one) carries finished == true, so the receiver can tell where a graph ends. | |||
| // graph_sum is accepted for interface compatibility and is not used in this function. | |||
| void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list, uint32_t graph_sum) { | |||
| SendMetadata(); | |||
| // send multiple graphs to MindInsight server | |||
| // split graph into chunks if one graph is larger than chunk size | |||
| std::list<Chunk> chunked_graph_proto_list; | |||
| Chunk chunk; | |||
| // iterate by reference: copying a whole GraphProto per iteration is unnecessary | |||
| for (const auto &graph : graph_proto_list) { | |||
| std::string str = graph.SerializeAsString(); | |||
| auto graph_size = graph.ByteSize(); | |||
| if (graph_size > CHUNK_SIZE) { | |||
| auto sub_graph_str = grpc_client_->ChunkString(str, graph_size); | |||
| for (size_t i = 0; i < sub_graph_str.size(); i++) { | |||
| chunk.set_buffer(sub_graph_str[i]); | |||
| // set the flag BEFORE queuing the chunk, and queue each piece exactly once; | |||
| // only the last piece of a graph is marked as finished | |||
| chunk.set_finished(i + 1 == sub_graph_str.size()); | |||
| chunked_graph_proto_list.push_back(chunk); | |||
| } | |||
| } else { | |||
| chunk.set_buffer(str); | |||
| chunk.set_finished(true); | |||
| chunked_graph_proto_list.push_back(chunk); | |||
| } | |||
| } | |||
| EventReply reply = grpc_client_->SendMultiGraphs(chunked_graph_proto_list); | |||
| if (reply.status() != reply.OK) { | |||
| MS_LOG(ERROR) << "Error: SendMultiGraphs failed"; | |||
| } | |||
| // enter command loop, wait and process commands | |||
| CommandLoop(); | |||
| } | |||
| void Debugger::CommandLoop() { | |||
| // prepare metadata | |||
| std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id()); | |||
| @@ -923,6 +1015,8 @@ bool Debugger::CheckPort(const char *port) { | |||
| return true; | |||
| } | |||
| // Returns the id stored at the front of rungraph_id_list_. | |||
| // When no run graph id has been recorded yet the list is empty and front() would be undefined | |||
| // behaviour, so the maximum uint32_t value is returned as a "no graph" sentinel instead. | |||
| uint32_t Debugger::GetFirstRunGraphId() { | |||
| if (rungraph_id_list_.empty()) { | |||
| return static_cast<uint32_t>(-1); | |||
| } | |||
| return rungraph_id_list_.front(); | |||
| } | |||
| void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) { | |||
| @@ -996,6 +1090,13 @@ void Debugger::LoadGraphOutputs() { | |||
| } | |||
| } | |||
| for (size_t j = 0; j < output_size; ++j) { | |||
| auto kernel_info = static_cast<device::KernelInfo *>(node->kernel_info()); | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| auto addr_test = kernel_info->GetOutputAddr(j); | |||
| if (addr_test == nullptr) { | |||
| MS_LOG(INFO) << "Cannot find output addr for slot " << j << " for " << kernel_name; | |||
| continue; | |||
| } | |||
| auto addr = AnfAlgo::GetOutputAddr(node, j); | |||
| MS_EXCEPTION_IF_NULL(addr); | |||
| auto type = AnfAlgo::GetOutputInferDataType(node, j); | |||
| @@ -1015,9 +1116,14 @@ void Debugger::LoadGraphOutputs() { | |||
| } | |||
| } | |||
| void Debugger::UpdateStepNum() { | |||
| if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration())) | |||
| // Increments num_step_ when `graph` is the first run graph (to support multigraph). | |||
| // Nothing happens unless the device target is GPU and either the debugger is enabled or | |||
| // DumpDataEnabledIteration() reports true. | |||
| void Debugger::UpdateStepNum(const session::KernelGraph *graph) { | |||
| if (device_target_ != kGPUDevice) { | |||
| return; | |||
| } | |||
| if (!debugger_enabled_ && !device::KernelRuntime::DumpDataEnabledIteration()) { | |||
| return; | |||
| } | |||
| // only the first run graph advances the step counter | |||
| if (graph->graph_id() != debugger_->GetFirstRunGraphId()) { | |||
| return; | |||
| } | |||
| // access lock for public method | |||
| std::lock_guard<std::mutex> a_lock(access_lock_); | |||
| ++num_step_; | |||
| } | |||
| void Debugger::ClearCurrentData() { | |||
| @@ -68,7 +68,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| // enable debugger | |||
| // send graph and wait for command | |||
| // do nothing if graph is set already | |||
| void PreExecute(const KernelGraphPtr &graph_ptr); | |||
| void PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum = 1); | |||
| // analyze tensors and wait for command | |||
| // don't need a graph_ptr because it is saved during pre_execute | |||
| @@ -106,7 +106,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| void LoadParametersAndConst(); | |||
| void UpdateStepNum(); | |||
| void UpdateStepNum(const session::KernelGraph *graph); | |||
| void ClearCurrentData(); | |||
| @@ -114,6 +114,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| void CheckDatasetSinkMode(); | |||
| void LoadGraphs(const KernelGraphPtr &graph_ptr); | |||
| uint32_t GetFirstRunGraphId(); | |||
| private: | |||
| // private constructor for singleton | |||
| Debugger(); | |||
| @@ -138,11 +142,13 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| void CheckDatasetGraph(); | |||
| // serialize graph and get proto | |||
| GraphProto GetGraphProto() const; | |||
| GraphProto GetGraphProto(const KernelGraphPtr &graph_ptr) const; | |||
| // send graph and enter command wait loop | |||
| void SendGraphAndSuspend(const GraphProto &graph_proto); | |||
| void SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list, uint32_t graph_sum); | |||
| // wait for command and process command | |||
| // send command request and process reply in a loop | |||
| // break if RunCMD | |||
| @@ -197,9 +203,13 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| std::string overflow_bin_path_; | |||
| // flag to keep track of the very first suspension of debugger | |||
| bool initial_suspend_; | |||
| std::list<GraphProto> graph_proto_list_; | |||
| // singleton | |||
| static std::mutex instance_lock_; | |||
| static std::shared_ptr<Debugger> debugger_; | |||
| uint32_t not_dataset_graph_sum_; | |||
| std::list<uint32_t> rungraph_id_list_; | |||
| }; | |||
| using DebuggerPtr = std::shared_ptr<Debugger>; | |||
| @@ -69,7 +69,7 @@ EventReply GrpcClient::SendMetadata(const Metadata &metadata) { | |||
| return reply; | |||
| } | |||
| std::vector<std::string> ChunkString(std::string str, int graph_size) { | |||
| std::vector<std::string> GrpcClient::ChunkString(std::string str, int graph_size) { | |||
| std::vector<std::string> buf; | |||
| int size_iter = 0; | |||
| while (size_iter < graph_size) { | |||
| @@ -118,6 +118,28 @@ EventReply GrpcClient::SendGraph(const GraphProto &graph) { | |||
| return reply; | |||
| } | |||
| // Streams `chunks` to the server through the SendMultiGraphs RPC and returns the server reply. | |||
| // If the RPC does not finish with an OK status, the failure is logged and the reply status is | |||
| // set to EventReply_Status_FAILED. | |||
| EventReply GrpcClient::SendMultiGraphs(const std::list<Chunk> &chunks) { | |||
| EventReply reply; | |||
| grpc::ClientContext context; | |||
| std::unique_ptr<grpc::ClientWriter<Chunk> > writer(stub_->SendMultiGraphs(&context, &reply)); | |||
| // write the chunks in order and stop at the first write that fails | |||
| auto chunk_it = chunks.cbegin(); | |||
| while (chunk_it != chunks.cend() && writer->Write(*chunk_it)) { | |||
| // short pause after each successful write | |||
| std::this_thread::sleep_for(std::chrono::milliseconds(1)); | |||
| ++chunk_it; | |||
| } | |||
| writer->WritesDone(); | |||
| const grpc::Status status = writer->Finish(); | |||
| if (status.ok()) { | |||
| return reply; | |||
| } | |||
| MS_LOG(ERROR) << "RPC failed: SendMultigraphs"; | |||
| MS_LOG(ERROR) << status.error_code() << ": " << status.error_message(); | |||
| reply.set_status(EventReply_Status_FAILED); | |||
| return reply; | |||
| } | |||
| EventReply GrpcClient::SendTensors(const std::list<TensorProto> &tensors) { | |||
| EventReply reply; | |||
| grpc::ClientContext context; | |||
| @@ -19,9 +19,11 @@ | |||
| #include <grpcpp/grpcpp.h> | |||
| #include <string> | |||
| #include <list> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "proto/debug_grpc.grpc.pb.h" | |||
| using debugger::Chunk; | |||
| using debugger::EventListener; | |||
| using debugger::EventReply; | |||
| using debugger::GraphProto; | |||
| @@ -52,8 +54,12 @@ class GrpcClient { | |||
| EventReply SendTensors(const std::list<TensorProto> &tensors); | |||
| EventReply SendMultiGraphs(const std::list<Chunk> &chunks); | |||
| EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints); | |||
| std::vector<std::string> ChunkString(std::string str, int graph_size); | |||
| private: | |||
| std::unique_ptr<EventListener::Stub> stub_; | |||
| }; | |||
| @@ -354,6 +354,8 @@ void DebuggerProtoExporter::ExportFuncGraph(const FuncGraphPtr &func_graph, debu | |||
| // set graph name | |||
| graph_proto->set_name(func_graph->ToString()); | |||
| MS_LOG(INFO) << "graph names: " << func_graph->ToString(); | |||
| ExportParameters(func_graph, graph_proto); | |||
| ExportCNodes(func_graph, graph_proto, &const_map); | |||
| @@ -433,6 +435,7 @@ void DebuggerProtoExporter::ExportCNode(const FuncGraphPtr &func_graph, const CN | |||
| // add full_name for debugger | |||
| node_proto->set_full_name(node->fullname_with_scope()); | |||
| MS_LOG(INFO) << "full_name: " << node->fullname_with_scope(); | |||
| // process OP inputs | |||
| for (size_t i = 1; i < inputs.size(); ++i) { | |||
| @@ -577,8 +577,8 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo | |||
| AllocInplaceNodeMemory(graph); | |||
| bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); | |||
| if (!mock) { | |||
| debugger_->UpdateStepNum(); | |||
| if (!mock && debugger_) { | |||
| debugger_->UpdateStepNum(graph); | |||
| } | |||
| auto &kernels = graph->execution_order(); | |||
| int exec_order = 1; | |||