| @@ -49,6 +49,7 @@ class DumpJsonParser { | |||
| bool e2e_dump_enabled() const { return e2e_dump_enabled_; } | |||
| uint32_t dump_mode() const { return dump_mode_; } | |||
| std::string path() const { return path_; } | |||
| std::string iteration_string() const { return iteration_; } | |||
| std::string net_name() const { return net_name_; } | |||
| uint32_t input_output() const { return input_output_; } | |||
| uint32_t op_debug_mode() const { return op_debug_mode_; } | |||
| @@ -26,6 +26,7 @@ | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "utils/config_manager.h" | |||
| #ifdef ENABLE_DEBUGGER | |||
| #include "debug/debug_services.h" | |||
| #include "debug/tensor_load.h" | |||
| @@ -241,6 +242,7 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) { | |||
| auto &dump_json_parser = DumpJsonParser::GetInstance(); | |||
| uint32_t cur_iter = dump_json_parser.cur_dump_iter(); | |||
| uint32_t graph_id = graph->graph_id(); | |||
| bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph)); | |||
| if (dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled()) { | |||
| if (starting_graph_id == INT32_MAX) { | |||
| @@ -250,7 +252,9 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t rank_id) { | |||
| } | |||
| } | |||
| if (dump_json_parser.async_dump_enabled() && dump_json_parser.IsDumpIter(cur_iter)) { | |||
| MS_LOG(INFO) << "sink_mode = " << sink_mode; | |||
| if (dump_json_parser.async_dump_enabled() && dump_json_parser.IsDumpIter(cur_iter) && !sink_mode) { | |||
| auto zero_dir_dump_path = | |||
| dump_json_parser.path() + "/rank_" + std::to_string(rank_id) + "/_/" + std::to_string(graph->graph_id()) + "/0"; | |||
| @@ -291,6 +295,9 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons | |||
| bool success = false; | |||
| auto &dump_json_parser = DumpJsonParser::GetInstance(); | |||
| uint32_t graph_id = graph->graph_id(); | |||
| bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph)); | |||
| MS_LOG(INFO) << "sink_mode = " << sink_mode; | |||
| if (dump_json_parser.GetIterDumpFlag()) { | |||
| MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter(); | |||
| @@ -301,7 +308,7 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons | |||
| DumpOutput(graph, dump_path, debugger); | |||
| DumpParametersAndConst(graph, dump_path, debugger); | |||
| success = true; | |||
| } else if (dump_json_parser.async_dump_enabled()) { | |||
| } else if (dump_json_parser.async_dump_enabled() && !sink_mode) { | |||
| uint32_t current_iter = dump_json_parser.cur_dump_iter(); | |||
| auto zero_dir_dump_path = | |||
| @@ -348,4 +355,16 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons | |||
| return success; | |||
| } | |||
| bool E2eDump::isDatasetGraph(const session::KernelGraph *graph) { | |||
| // check if there is GetNext or InitDataSetQueue node | |||
| const auto &nodes = graph->execution_order(); | |||
| for (const auto &node : nodes) { | |||
| auto node_name = AnfAlgo::GetCNodeName(node); | |||
| if (node_name == "GetNext" || node_name == "InitDataSetQueue") { | |||
| return true; | |||
| } | |||
| } | |||
| return false; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -36,6 +36,7 @@ class E2eDump { | |||
| ~E2eDump() = default; | |||
| static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id); | |||
| static bool DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr); | |||
| static bool isDatasetGraph(const session::KernelGraph *graph); | |||
| // Dump data when task error. | |||
| static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, | |||
| std::string *kernel_name, const Debugger *debugger); | |||
| @@ -19,6 +19,7 @@ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include "utility" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/mem.h" | |||
| @@ -135,6 +136,12 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| MS_EXCEPTION_IF_NULL(kernel_graph_); | |||
| auto dump_path = DumpJsonParser::GetInstance().path(); | |||
| const auto &input_ctrl_tensors = kernel_graph_->input_ctrl_tensors(); | |||
| constexpr size_t kLoopSinkCtrlTensorNum = 3; // cur step, cur epoch, steps per epoch | |||
| bool data_sink_mode = input_ctrl_tensors != nullptr && input_ctrl_tensors->size() >= kLoopSinkCtrlTensorNum; | |||
| std::string net_name = (data_sink_mode ? DumpJsonParser::GetInstance().net_name() : "_"); | |||
| std::string iteration = (data_sink_mode ? DumpJsonParser::GetInstance().iteration_string() : "0"); | |||
| if (dump_path.empty()) { | |||
| MS_LOG(EXCEPTION) << "Dump path invalid"; | |||
| } | |||
| @@ -149,15 +156,22 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf | |||
| } | |||
| } | |||
| dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(rank_id) + "/"); | |||
| MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path; | |||
| dump_info->set_model_name("_"); | |||
| dump_info->set_dump_step("0"); | |||
| MS_LOG(INFO) << "[DataDump] dump_path: " << dump_path; | |||
| dump_info->set_model_name(net_name); | |||
| MS_LOG(INFO) << "[DataDump] model_name: " << net_name; | |||
| MS_LOG(INFO) << "[DataDump] iteration_pre: " << iteration; | |||
| if (iteration == "all") { | |||
| iteration = "0-" + std::to_string(ULONG_MAX); | |||
| } | |||
| MS_LOG(INFO) << "[DataDump] iteration_post: " << iteration; | |||
| dump_info->set_dump_step(iteration); | |||
| dump_info->set_model_id(graph_id); | |||
| dump_info->set_flag(kAicpuLoadFlag); | |||
| const auto &input_ctrl_tensors = kernel_graph_->input_ctrl_tensors(); | |||
| constexpr size_t kLoopSinkCtrlTensorNum = 3; // cur step, cur epoch, steps per epoch | |||
| if (input_ctrl_tensors == nullptr || input_ctrl_tensors->size() < kLoopSinkCtrlTensorNum) { | |||
| if (!data_sink_mode) { | |||
| MS_LOG(INFO) << "[DataDump] Not data sink mode, input_ctrl_tensor"; | |||
| return; | |||
| } | |||