Browse Source

fix async dump for sink_mode=True

tags/v1.3.0
John Tzanakakis 4 years ago
parent
commit
11b9f17a71
4 changed files with 43 additions and 8 deletions
  1. +1
    -0
      mindspore/ccsrc/debug/data_dump/dump_json_parser.h
  2. +21
    -2
      mindspore/ccsrc/debug/data_dump/e2e_dump.cc
  3. +1
    -0
      mindspore/ccsrc/debug/data_dump/e2e_dump.h
  4. +20
    -6
      mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc

+ 1
- 0
mindspore/ccsrc/debug/data_dump/dump_json_parser.h View File

@@ -49,6 +49,7 @@ class DumpJsonParser {
// Read-only accessors: each one returns the current value of the
// correspondingly named private member.

// Returns e2e_dump_enabled_.
bool e2e_dump_enabled() const { return e2e_dump_enabled_; }
// Returns dump_mode_.
uint32_t dump_mode() const { return dump_mode_; }
// Returns a copy of path_.
std::string path() const { return path_; }
// Returns a copy of iteration_, i.e. the iteration setting in string form.
std::string iteration_string() const { return iteration_; }
// Returns a copy of net_name_.
std::string net_name() const { return net_name_; }
// Returns input_output_.
uint32_t input_output() const { return input_output_; }
// Returns op_debug_mode_.
uint32_t op_debug_mode() const { return op_debug_mode_; }


+ 21
- 2
mindspore/ccsrc/debug/data_dump/e2e_dump.cc View File

@@ -26,6 +26,7 @@
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/config_manager.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#include "debug/tensor_load.h"
@@ -241,6 +242,7 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
uint32_t cur_iter = dump_json_parser.cur_dump_iter();
uint32_t graph_id = graph->graph_id();
bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph));

if (dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled()) {
if (starting_graph_id == INT32_MAX) {
@@ -250,7 +252,9 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) {
}
}

if (dump_json_parser.async_dump_enabled() && dump_json_parser.IsDumpIter(cur_iter)) {
MS_LOG(INFO) << "sink_mode = " << sink_mode;

if (dump_json_parser.async_dump_enabled() && dump_json_parser.IsDumpIter(cur_iter) && !sink_mode) {
auto zero_dir_dump_path =
dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";

@@ -291,6 +295,9 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
bool success = false;
auto &dump_json_parser = DumpJsonParser::GetInstance();
uint32_t graph_id = graph->graph_id();
bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph));

MS_LOG(INFO) << "sink_mode = " << sink_mode;

if (dump_json_parser.GetIterDumpFlag()) {
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
@@ -301,7 +308,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
DumpOutput(graph, dump_path, debugger);
DumpParametersAndConst(graph, dump_path, debugger);
success = true;
} else if (dump_json_parser.async_dump_enabled()) {
} else if (dump_json_parser.async_dump_enabled() && !sink_mode) {
uint32_t current_iter = dump_json_parser.cur_dump_iter();

auto zero_dir_dump_path =
@@ -348,4 +355,16 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons

return success;
}

// Reports whether `graph` is a dataset graph: true as soon as a kernel in its
// execution order is named "GetNext" or "InitDataSetQueue", false when no such
// kernel exists (including an empty execution order).
bool E2eDump::isDatasetGraph(const session::KernelGraph *graph) {
  bool has_dataset_op = false;
  for (const auto &kernel : graph->execution_order()) {
    const auto op_name = AnfAlgo::GetCNodeName(kernel);
    has_dataset_op = (op_name == "GetNext") || (op_name == "InitDataSetQueue");
    if (has_dataset_op) {
      // First match settles the answer; no need to inspect remaining kernels.
      break;
    }
  }
  return has_dataset_op;
}
} // namespace mindspore

+ 1
- 0
mindspore/ccsrc/debug/data_dump/e2e_dump.h View File

@@ -36,6 +36,7 @@ class E2eDump {
~E2eDump() = default;
static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id);
static bool DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
// Returns true when the graph's execution order contains a GetNext or
// InitDataSetQueue node, false otherwise.
static bool isDatasetGraph(const session::KernelGraph *graph);
// Dump data when task error.
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger);


+ 20
- 6
mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc View File

@@ -19,6 +19,7 @@
#include <memory>
#include <string>
#include <algorithm>
#include <limits>
#include "utility"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/mem.h"
@@ -135,6 +136,12 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
MS_EXCEPTION_IF_NULL(context_ptr);
MS_EXCEPTION_IF_NULL(kernel_graph_);
auto dump_path = DumpJsonParser::GetInstance().path();
const auto &input_ctrl_tensors = kernel_graph_->input_ctrl_tensors();
constexpr size_t kLoopSinkCtrlTensorNum = 3; // cur step, cur epoch, steps per epoch
bool data_sink_mode = input_ctrl_tensors != nullptr && input_ctrl_tensors->size() >= kLoopSinkCtrlTensorNum;
std::string net_name = (data_sink_mode ? DumpJsonParser::GetInstance().net_name() : "_");
std::string iteration = (data_sink_mode ? DumpJsonParser::GetInstance().iteration_string() : "0");

if (dump_path.empty()) {
MS_LOG(EXCEPTION) << "Dump path invalid";
}
@@ -149,15 +156,22 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
}
}
dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(rank_id) + "/");
MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path;
dump_info->set_model_name("_");
dump_info->set_dump_step("0");
MS_LOG(INFO) << "[DataDump] dump_path: " << dump_path;

dump_info->set_model_name(net_name);
MS_LOG(INFO) << "[DataDump] model_name: " << net_name;

MS_LOG(INFO) << "[DataDump] iteration_pre: " << iteration;
if (iteration == "all") {
iteration = "0-" + std::to_string(ULONG_MAX);
}
MS_LOG(INFO) << "[DataDump] iteration_post: " << iteration;
dump_info->set_dump_step(iteration);

dump_info->set_model_id(graph_id);
dump_info->set_flag(kAicpuLoadFlag);

const auto &input_ctrl_tensors = kernel_graph_->input_ctrl_tensors();
constexpr size_t kLoopSinkCtrlTensorNum = 3; // cur step, cur epoch, steps per epoch
if (input_ctrl_tensors == nullptr || input_ctrl_tensors->size() < kLoopSinkCtrlTensorNum) {
if (!data_sink_mode) {
MS_LOG(INFO) << "[DataDump] Not data sink mode, input_ctrl_tensor";
return;
}


Loading…
Cancel
Save