From 7c9ecff30bb002d7fcfeec0fd93cd2f7874b5ca3 Mon Sep 17 00:00:00 2001 From: Parastoo Ashtari Date: Mon, 13 Sep 2021 19:59:03 -0400 Subject: [PATCH] support multi network dump in GPU --- mindspore/ccsrc/debug/data_dump/e2e_dump.cc | 10 +++++++++- mindspore/ccsrc/debug/data_dump/e2e_dump.h | 3 +++ mindspore/ccsrc/debug/debugger/debugger.cc | 12 +++++++----- mindspore/ccsrc/debug/debugger/debugger.h | 2 ++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index 11e11eea94..399f0eadac 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -279,7 +279,9 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m if (IsDeviceTargetGPU()) { if (starting_graph_id == INT32_MAX) { starting_graph_id = graph_id; - } else if (starting_graph_id == graph_id) { + } else if (starting_graph_id == graph_id && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) { + // Update dump iter for mindrt runtime is done using UpdateIterGPUDump(). + // Update dump iter for GPU old runtime.
dump_json_parser.UpdateDumpIter(); } return; @@ -307,6 +309,12 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) { } } +void E2eDump::UpdateIterGPUDump() { + if (starting_graph_id != INT32_MAX) { + DumpJsonParser::GetInstance().UpdateDumpIter(); + } +} + void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) { MS_EXCEPTION_IF_NULL(graph); bool success = false; diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h index ca5b4daa67..05a1ae5622 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h @@ -36,6 +36,9 @@ class E2eDump { E2eDump() = default; ~E2eDump() = default; static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id); + + static void UpdateIterGPUDump(); + static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr); static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger); diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index e216d5458b..6b0c63bc96 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -274,6 +274,7 @@ void Debugger::Reset() { debug_services_ = nullptr; graph_proto_list_.clear(); graph_ptr_list_.clear(); + graph_ptr_step_vec_.clear(); MS_LOG(INFO) << "Release Debugger resource."; } @@ -282,6 +283,9 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs if (device_target_ != kGPUDevice) { return; } + E2eDump::UpdateIterGPUDump(); + // Store graphs that are run in one step.
+ graph_ptr_step_vec_ = graphs; for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { const auto &graph = graphs[graph_index]; if (debugger_) { @@ -430,15 +434,13 @@ void Debugger::PostExecuteGraphDebugger() { return; } // LoadParametersAndConst for all the graphs - if (debugger_) { - for (auto graph : graph_ptr_list_) { - debugger_->LoadParametersAndConst(graph); - } + for (auto graph : graph_ptr_step_vec_) { + debugger_->LoadParametersAndConst(graph); } // debug used for dump if (debugger_ && debugger_->CheckDebuggerDumpEnabled()) { // Dump Parameters and consts - for (auto graph : graph_ptr_list_) { + for (auto graph : graph_ptr_step_vec_) { debugger_->Dump(graph); if (!debugger_->debugger_enabled()) { debugger_->ClearCurrentData(); diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 7e5bf3efe7..1ac0c5e1a6 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -265,6 +265,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> { std::list<GraphProto> graph_proto_list_; std::list<KernelGraphPtr> graph_ptr_list_; + // The vector of graph pointers that have been run in the current step. + std::vector<KernelGraphPtr> graph_ptr_step_vec_; // singleton static std::mutex instance_lock_;