Browse Source

fix iter 0 and iter 1 being dumped in dir 0, make op_debug_mode optional for sync mode, read input files for offline debugger

tags/v1.3.0
John Tzanakakis 4 years ago
parent
commit
ac1847ffac
9 changed files with 94 additions and 42 deletions
  1. +13
    -4
      mindspore/ccsrc/backend/session/gpu_session.cc
  2. +2
    -0
      mindspore/ccsrc/backend/session/gpu_session.h
  3. +17
    -11
      mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
  4. +0
    -1
      mindspore/ccsrc/debug/data_dump/dump_json_parser.h
  5. +12
    -10
      mindspore/ccsrc/debug/data_dump/e2e_dump.cc
  6. +40
    -11
      mindspore/ccsrc/debug/debug_services.cc
  7. +3
    -0
      mindspore/ccsrc/debug/debug_services.h
  8. +3
    -3
      mindspore/ccsrc/debug/debugger/debugger.h
  9. +4
    -2
      mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc

+ 13
- 4
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -445,6 +445,9 @@ void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_grap
if (debugger_) {
debugger_->PreExecute(kernel_graph, graph_sum_);
}

DumpSetup(kernel_graph);

#if ENABLE_CPU && ENABLE_GPU
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);
@@ -459,13 +462,12 @@ void GPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_gra
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY)) {
Summary(kernel_graph.get());
}
bool dump_enabled = DumpDataEnabledIteration();
// debug used for dump
if (debugger_ && dump_enabled) {
if (debugger_ && debugger_->CheckDebuggerDumpEnabled()) {
Dump(kernel_graph);
} else {
DumpJsonParser::GetInstance().UpdateDumpIter();
}

if (debugger_) {
debugger_->PostExecute();
}
@@ -600,6 +602,13 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
}
}

// Runs the dump setup step for `kernel_graph` by delegating to E2eDump::DumpSetup with this
// session's rank_id_. Start/finish are logged at INFO level around the call.
// The graph pointer is checked with MS_EXCEPTION_IF_NULL before it is dereferenced
// (presumably raising on null; confirm against the macro definition).
void GPUSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_LOG(INFO) << "Start!";
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // E2eDump works on the raw graph pointer; ownership stays with the shared_ptr held by the caller.
  const auto *graph = kernel_graph.get();
  E2eDump::DumpSetup(graph, rank_id_);
  MS_LOG(INFO) << "Finish!";
}

void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
if (debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);


+ 2
- 0
mindspore/ccsrc/backend/session/gpu_session.h View File

@@ -86,6 +86,8 @@ class GPUSession : public SessionBasic {

void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;

void DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const;

bool DumpDataEnabledIteration() const;

GraphId CompileGraphImpl(KernelGraphPtr kernel_graph);


+ 17
- 11
mindspore/ccsrc/debug/data_dump/dump_json_parser.cc View File

@@ -206,16 +206,6 @@ bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, s
}

void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
auto dump_mode = CheckJsonKeyExist(*common_dump_settings, kDumpMode);
auto path = CheckJsonKeyExist(*common_dump_settings, kPath);
auto net_name = CheckJsonKeyExist(*common_dump_settings, kNetName);
auto iteration = CheckJsonKeyExist(*common_dump_settings, kIteration);
auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);
auto op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);

// async_dump is enabled by default, if e2e dump is enabled it will override this
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
@@ -228,6 +218,20 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
}
}

auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
auto dump_mode = CheckJsonKeyExist(*common_dump_settings, kDumpMode);
auto path = CheckJsonKeyExist(*common_dump_settings, kPath);
auto net_name = CheckJsonKeyExist(*common_dump_settings, kNetName);
auto iteration = CheckJsonKeyExist(*common_dump_settings, kIteration);
auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);

nlohmann::detail::iter_impl<const nlohmann::json> op_debug_mode;
if (async_dump_enabled_) {
op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
}

ParseDumpMode(*dump_mode);
ParseDumpPath(*path);
ParseNetName(*net_name);
@@ -235,7 +239,9 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
ParseInputOutput(*input_output);
ParseKernels(*kernels);
ParseSupportDevice(*support_device);
ParseOpDebugMode(*op_debug_mode);
if (async_dump_enabled_) {
ParseOpDebugMode(*op_debug_mode);
}
}

void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {


+ 0
- 1
mindspore/ccsrc/debug/data_dump/dump_json_parser.h View File

@@ -60,7 +60,6 @@ class DumpJsonParser {
bool OutputNeedDump() const;
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);
bool AsyncDumpEnabled() const { return async_dump_enabled_; }

private:
DumpJsonParser() = default;


+ 12
- 10
mindspore/ccsrc/debug/data_dump/e2e_dump.cc View File

@@ -240,7 +240,17 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
uint32_t cur_iter = dump_json_parser.cur_dump_iter();
if (dump_json_parser.AsyncDumpEnabled() && dump_json_parser.IsDumpIter(cur_iter)) {
uint32_t graph_id = graph->graph_id();

if (dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled()) {
if (starting_graph_id == INT32_MAX) {
starting_graph_id = graph_id;
} else if (starting_graph_id == graph_id) {
dump_json_parser.UpdateDumpIter();
}
}

if (dump_json_parser.async_dump_enabled() && dump_json_parser.IsDumpIter(cur_iter)) {
auto zero_dir_dump_path =
dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";

@@ -291,7 +301,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
DumpOutput(graph, dump_path, debugger);
DumpParametersAndConst(graph, dump_path, debugger);
success = true;
} else if (dump_json_parser.AsyncDumpEnabled()) {
} else if (dump_json_parser.async_dump_enabled()) {
uint32_t current_iter = dump_json_parser.cur_dump_iter();

auto zero_dir_dump_path =
@@ -336,14 +346,6 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
success = true;
}

if (starting_graph_id == INT32_MAX) {
starting_graph_id = graph_id;
} else {
if (starting_graph_id == graph_id) {
dump_json_parser.UpdateDumpIter();
}
}

return success;
}
} // namespace mindspore

+ 40
- 11
mindspore/ccsrc/debug/debug_services.cc View File

@@ -634,6 +634,22 @@ void DebugServices::AddToTensorData(const std::string &backend_name, const std::
result_list->push_back(tensor_data);
}

// Builds the file-name prefix(es) used to match dump files for one kernel/slot pair.
//
// When is_sync_mode is false, only *prefix_dump_file_name is written: it receives the
// dump-style kernel name unchanged, and the input/output prefixes are left untouched.
//
// When is_sync_mode is true, only the input/output prefixes are written. The text after the
// last "--" in the kernel name is used (the whole name if "--" does not occur), followed by
// ".input.<slot>" and ".output.<slot>" respectively; *prefix_dump_file_name is left untouched.
//
// All pointer arguments are dereferenced without a null check; callers must pass valid strings.
void DebugServices::SetPrefixToCheck(std::string *prefix_dump_file_name, std::string *prefix_dump_file_name_input,
                                     std::string *prefix_dump_file_name_output, std::string *dump_style_kernel_name,
                                     size_t slot) {
  if (!is_sync_mode) {
    *prefix_dump_file_name = *dump_style_kernel_name;
    return;
  }

  const std::string &full_name = *dump_style_kernel_name;
  const std::size_t scope_end = full_name.rfind("--");
  // Take a copy of the trailing name part before writing any output, so the result is
  // still correct if an output pointer aliases dump_style_kernel_name.
  const std::string name_part = (scope_end == std::string::npos) ? full_name : full_name.substr(scope_end + 2);
  const std::string slot_suffix = std::to_string(slot);

  *prefix_dump_file_name_input = name_part + ".input." + slot_suffix;
  *prefix_dump_file_name_output = name_part + ".output." + slot_suffix;
}

void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id,
@@ -665,12 +681,10 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
}

std::string prefix_dump_file_name;
if (is_sync_mode) {
prefix_dump_file_name = dump_style_kernel_name.substr(dump_style_kernel_name.rfind("--") + 2);
prefix_dump_file_name += ".output." + std::to_string(slot[i]);
} else {
prefix_dump_file_name = dump_style_kernel_name;
}
std::string prefix_dump_file_name_input;
std::string prefix_dump_file_name_output;
SetPrefixToCheck(&prefix_dump_file_name, &prefix_dump_file_name_input, &prefix_dump_file_name_output,
&dump_style_kernel_name, slot[i]);

std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
std::to_string(root_graph_id[i]) + "/" + std::to_string(iteration[i]);
@@ -701,7 +715,11 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
std::string stripped_file_name = start_string + end_string;
std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);

std::size_t found = stripped_file_name.rfind(prefix_dump_file_name_output, 0);
if (found == std::string::npos) {
found = stripped_file_name.rfind(prefix_dump_file_name_input, 0);
}

if (found != 0) {
continue;
@@ -810,14 +828,25 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
for (auto node : wp_nodes) {
std::string orig_name = node;
std::string dump_style_name = node;
std::string dump_style_name_input;
std::string dump_style_name_output;
ReplaceSrcFileName(is_sync_mode, &dump_style_name);

if (is_sync_mode) {
dump_style_name = dump_style_name.substr(dump_style_name.rfind("--") + 2);
dump_style_name.append(".output.");
}
std::string dump_style_name_part = dump_style_name;
std::size_t last_scope_marker = dump_style_name.rfind("--");
if (last_scope_marker != std::string::npos) {
dump_style_name_part = dump_style_name.substr(last_scope_marker + 2);
}

dump_style_name_input = dump_style_name_part + ".input.";
proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name_input));

proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
dump_style_name_output = dump_style_name_part + ".output.";
proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name_output));
} else {
proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
}
}

if (!is_sync_mode) {


+ 3
- 0
mindspore/ccsrc/debug/debug_services.h View File

@@ -226,6 +226,9 @@ class DebugServices {
const std::string &type_name, const std::vector<int64_t> &shape, std::vector<char> *buffer,
std::vector<std::shared_ptr<TensorData>> *result_list);

void SetPrefixToCheck(std::string *prefix_dump_file_name, std::string *prefix_dump_file_name_input,
std::string *prefix_dump_file_name_output, std::string *dump_style_kernel_name, size_t slot);

void ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id, const std::vector<std::string> &async_file_pool,


+ 3
- 3
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -148,6 +148,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

bool TensorExistsInCurrent(const std::string &tensor_name);

// check if dump using debugger backend is enabled
bool CheckDebuggerDumpEnabled() const;

private:
// private constructor for singleton
Debugger();
@@ -159,9 +162,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

void SetOpOverflowBinPath(uint32_t graph_id);

// check if dump using debugger backend is enabled
bool CheckDebuggerDumpEnabled() const;

// check if debugger enabled
bool CheckDebuggerEnabled() const;



+ 4
- 2
mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc View File

@@ -218,8 +218,10 @@ std::string GetTensorFullName(tensor_info_t info) {
if (info.is_parameter) {
// scopes in node name are separated by '/'
// use the name without scope if truncate is true
std::size_t found = node_name.find_last_of("/");
node_name = node_name.substr(found + 1);
auto found = node_name.find_last_of("/");
if (found != std::string::npos) {
node_name = node_name.substr(found + 1);
}
}
return node_name + ":" + std::to_string(info.slot);
}


Loading…
Cancel
Save