diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index 7d79c8c68f..6ef4471bd8 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -364,6 +364,7 @@ void DumpJsonParser::PrintUnusedKernel() { std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const { std::string bin_path; bin_path.append(path_); + bin_path.append("/"); bin_path.append("device_"); bin_path.append(std::to_string(device_id)); bin_path.append("/"); diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index e23fce7ebe..a0c6672dc6 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -69,7 +69,6 @@ Debugger::Debugger() is_dataset_graph_(false), partial_memory_(false), last_overflow_bin_(0), - overflow_bin_path_(""), initial_suspend_(true), not_dataset_graph_sum_(0), version_("") { @@ -161,41 +160,43 @@ void Debugger::EnableDebugger() { } MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051"; } + // initialize grpc client + grpc_client_ = std::make_unique(host, port); + } + debug_services_ = std::make_unique(); +} +void Debugger::SetOpOverflowBinPath(uint32_t graph_id) { #ifdef ENABLE_D - // set operation overflow info - overflow_bin_path_ = DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_); - // new overflow dump files will have a timestamp greater than last_overflow_bin_ - last_overflow_bin_ = 0; - DIR *d; - d = opendir(overflow_bin_path_.c_str()); - if (d != nullptr) { - struct dirent *dir; - while ((dir = readdir(d)) != NULL) { - if (dir->d_type == DT_REG) { - std::string file_path = overflow_bin_path_; - file_path.append(dir->d_name); - std::size_t found = file_path.find_last_of("."); - if (found == std::string::npos) { - continue; - } - std::string overflow_time = file_path.substr(found + 1); - if (stod(overflow_time) <= last_overflow_bin_) { - MS_LOG(INFO) << "Old op overflow bin folder" << file_path; - continue; - } - last_overflow_bin_ = stod(overflow_time); + // set operation overflow info + overflow_bin_path_.insert(std::pair( + graph_id, DumpJsonParser::GetInstance().GetOpOverflowBinPath(graph_id, device_id_))); + // new overflow dump files will have a timestamp greater than last_overflow_bin_ + auto overflow_bin_path = overflow_bin_path_.find(graph_id)->second; + DIR *d; + d = opendir(overflow_bin_path.c_str()); + if (d != nullptr) { + struct dirent *dir; + while ((dir = readdir(d)) != NULL) { + if (dir->d_type == DT_REG) { + std::string file_path = overflow_bin_path; + file_path.append(dir->d_name); + std::size_t found = file_path.find_last_of("."); + if (found == std::string::npos) { + continue; + } + std::string overflow_time = file_path.substr(found + 1); + if (stod(overflow_time) <= last_overflow_bin_) { + MS_LOG(INFO) << "Old op overflow bin folder" << file_path; + continue; } + last_overflow_bin_ = stod(overflow_time); } - MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_; - closedir(d); } -#endif - - // initialize grpc client - grpc_client_ = std::make_unique(host, port); + MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_; + closedir(d); } - debug_services_ = std::make_unique(); +#endif } void Debugger::CheckDatasetSinkMode() { @@ -256,7 +257,7 @@ void Debugger::Reset() { grpc_client_ = nullptr; debug_services_ = nullptr; last_overflow_bin_ = 0; - overflow_bin_path_ = ""; + overflow_bin_path_.clear(); stream_task_to_opname_.clear(); } @@ -390,6 +391,9 @@ void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) { // add new graph proto to graph_proto_list_ graph_proto_list_.push_back(graph_proto); graph_ptr_list_.push_back(graph_ptr); +#ifdef ENABLE_D + SetOpOverflowBinPath(graph_ptr->graph_id()); +#endif not_dataset_graph_sum_++; } // reset is_dataset_graph to be false @@ -991,52 +995,55 @@ uint64_t BytestoInt64(const std::vector &buffer) { std::vector Debugger::CheckOpOverflow() { std::vector bin_list; std::vector op_names; - DIR *d; - struct dirent *dir = nullptr; - d = opendir(overflow_bin_path_.c_str()); - if (d != nullptr) { - while ((dir = readdir(d)) != NULL) { - if (dir->d_type == DT_REG) { - std::string file_path = overflow_bin_path_; - file_path.append(dir->d_name); - std::string file_name = dir->d_name; - std::size_t found = file_name.find_last_of("."); - if (found == std::string::npos) { - continue; - } - std::string overflow_time = file_name.substr(found + 1); - if (stod(overflow_time) <= last_overflow_bin_) { - MS_LOG(INFO) << "File already processed " << file_name; - continue; - } - bin_list.push_back(stod(overflow_time)); - std::fstream infile; - infile.open(file_path.c_str(), std::ios::binary | std::ios::in); - if (!infile.is_open()) { - MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name; - continue; - } - infile.seekg(313, std::ios::beg); - std::vector buffer; - buffer.resize(BUF_SIZ); - infile.read(buffer.data(), BUF_SIZ); - uint64_t stream_id = BytestoInt64(std::vector(buffer.begin() + 8, buffer.end())); - uint64_t task_id = BytestoInt64(std::vector(buffer.begin() + 16, buffer.end())); - MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << "."; - auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id)); - if (op != debugger_->stream_task_to_opname_.end()) { - MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl; - op_names.push_back(op->second); - } else { - MS_LOG(INFO) << "No overflow is detected " << std::endl; + for (const auto &[graph_id, overflow_bin_path] : overflow_bin_path_) { + DIR *d; + d = opendir(overflow_bin_path.c_str()); + MS_LOG(INFO) << "processing bin file path " << overflow_bin_path << ", graph id " << graph_id; + if (d != nullptr) { + struct dirent *dir = nullptr; + while ((dir = readdir(d)) != NULL) { + if (dir->d_type == DT_REG) { + std::string file_path = overflow_bin_path; + file_path.append(dir->d_name); + std::string file_name = dir->d_name; + std::size_t found = file_name.find_last_of("."); + if (found == std::string::npos) { + continue; + } + std::string overflow_time = file_name.substr(found + 1); + if (stod(overflow_time) <= last_overflow_bin_) { + MS_LOG(INFO) << "File already processed " << file_name; + continue; + } + bin_list.push_back(stod(overflow_time)); + std::fstream infile; + infile.open(file_path.c_str(), std::ios::binary | std::ios::in); + if (!infile.is_open()) { + MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name; + continue; + } + infile.seekg(313, std::ios::beg); + std::vector buffer; + buffer.resize(BUF_SIZ); + infile.read(buffer.data(), BUF_SIZ); + uint64_t stream_id = BytestoInt64(std::vector(buffer.begin() + 8, buffer.end())); + uint64_t task_id = BytestoInt64(std::vector(buffer.begin() + 16, buffer.end())); + MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << "."; + auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id)); + if (op != debugger_->stream_task_to_opname_.end()) { + MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl; + op_names.push_back(op->second); + } else { + MS_LOG(INFO) << "No overflow is detected " << std::endl; + } + infile.close(); } - infile.close(); } + } else { + MS_LOG(INFO) << "OverFlow bin directory does not exist!"; } - } else { - MS_LOG(INFO) << "OverFlow bin directory does not exist!"; + closedir(d); } - closedir(d); if (!op_names.empty()) { MS_LOG(ERROR) << "These operation overflows are detected " << op_names; diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index ee9196e5df..3c6e48c541 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -156,6 +156,8 @@ class Debugger : public std::enable_shared_from_this { // read env variable for grpc client void EnableDebugger(); + void SetOpOverflowBinPath(uint32_t graph_id); + // check if dump using debugger backend is enabled bool CheckDebuggerDumpEnabled(); @@ -232,7 +234,7 @@ class Debugger : public std::enable_shared_from_this { std::mutex access_lock_; std::map, std::string> stream_task_to_opname_; double last_overflow_bin_; - std::string overflow_bin_path_; + std::map overflow_bin_path_; // flag to keep track of the very first suspension of debugger bool initial_suspend_; std::list graph_proto_list_;