/**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "debug/debug_services.h"
// NOTE(review): the system header names were stripped by extraction; the list below was
// restored from the symbols this file uses (opendir/readdir, std::async, std::accumulate,
// std::ostream_iterator, string streams, std::map, std::chrono) — TODO confirm against VCS.
#include <dirent.h>
#include <algorithm>
#include <chrono>
#include <fstream>
#include <future>
#include <iostream>
#include <iterator>
#include <map>
#include <numeric>
#include <sstream>
#include <unordered_set>
#include <utility>
#include "pybind11/embed.h"
#ifdef ONLINE_DBG_MODE
#include "debug/common.h"
#include "debug/debugger/debugger.h"
#include "debug/anf_ir_utils.h"
#include "backend/session/anf_runtime_algorithm.h"
#endif
#include "debug/debugger/tensor_summary.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif

// Default constructor: create the tensor cache shared by the debug services.
DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }

// Copy constructor: copies the watchpoint/dump configuration and shares the tensor loader.
DebugServices::DebugServices(const DebugServices &other) {
  wp_id_cache_ = other.wp_id_cache_;
  net_name_ = other.net_name_;
  dump_dir_ = other.dump_dir_;
  is_sync_mode_ = other.is_sync_mode_;
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table_ = other.watchpoint_table_;
}

// Copy assignment: only the tensor loader and the watchpoint table are taken over.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}

// Register a watchpoint: its condition, parameters and the nodes it watches
// (optionally restricted per device / per graph). Thread-safe via lock_.
void DebugServices::AddWatchpoint(
  unsigned int id, unsigned int watch_condition, float parameter,
  const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  std::lock_guard<std::mutex> lg(lock_);
  watchpoint_t watchpoint_item;
  watchpoint_item.id = id;
  watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
watchpoint_item.condition.parameter = parameter; watchpoint_item.check_node_list = check_node_list; if (check_node_device_list != nullptr) { watchpoint_item.check_node_device_list = *check_node_device_list; } if (check_node_graph_list != nullptr) { watchpoint_item.check_node_graph_list = *check_node_graph_list; } watchpoint_item.parameter_list = parameter_list; watchpoint_table_[id] = watchpoint_item; } void DebugServices::RemoveWatchpoint(unsigned int id) { std::lock_guard lg(lock_); watchpoint_table_.erase(id); } std::unique_ptr GetSummaryPtr(const std::shared_ptr &tensor, void *const previous_tensor_ptr, uint32_t num_elements, int tensor_dtype) { switch (tensor_dtype) { case DbgDataType::DT_UINT8: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_INT8: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_UINT16: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_INT16: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_UINT32: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_INT32: case DbgDataType::DT_BASE_INT: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_UINT64: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_INT64: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_FLOAT16: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_FLOAT32: case DbgDataType::DT_BASE_FLOAT: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } case DbgDataType::DT_FLOAT64: { return std::make_unique>(tensor->GetDataPtr(), 
previous_tensor_ptr, num_elements); } case DbgDataType::DT_BOOL: { return std::make_unique>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements); } default: MS_LOG(INFO) << "Unsupported tensor type"; // return a null pointer return std::unique_ptr>{}; } } DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr &tensor) { if (tensor == nullptr) { MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics."; TensorStat empty_tensor_stat_data; return empty_tensor_stat_data; } std::unique_ptr base_summary_ptr; void *previous_tensor_ptr = nullptr; base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), tensor->GetType()); if (base_summary_ptr == nullptr) { MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics."; TensorStat empty_tensor_stat_data; return empty_tensor_stat_data; } base_summary_ptr->TensorStatistics(tensor->GetType()); TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(), base_summary_ptr->max_value(), base_summary_ptr->min_value(), base_summary_ptr->avg_value(), base_summary_ptr->count(), base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(), base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(), base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count()); return tensor_stat_data; } #ifdef OFFLINE_DBG_MODE void *DebugServices::GetPrevTensor(const std::shared_ptr &tensor, bool previous_iter_tensor_needed) { void *previous_tensor_ptr = nullptr; std::shared_ptr tensor_prev; if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) { // read data in offline mode std::vector file_paths; if (!is_sync_mode_) { ConvertReadTensors(std::vector{tensor->GetName()}, std::vector{tensor->GetSlot()}, std::vector{tensor->GetDeviceId()}, std::vector{tensor->GetIteration() - 1}, std::vector{tensor->GetRootGraphId()}, &file_paths); } std::vector> result_list_prev; 
ReadDumpedTensor(std::vector{tensor->GetName()}, std::vector{tensor->GetSlot()}, std::vector{tensor->GetDeviceId()}, std::vector{tensor->GetIteration() - 1}, std::vector{tensor->GetRootGraphId()}, std::vector{tensor->GetIsOutput()}, file_paths, &result_list_prev); tensor_prev = result_list_prev[0]; if (!tensor_prev->GetByteSize()) { tensor_prev.reset(); } else { previous_tensor_ptr = tensor_prev->GetDataPtr(); } } return previous_tensor_ptr; } #endif void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck, const std::string &tensor_name, const std::string &tensor_name_no_slot, bool *previous_iter_tensor_needed, std::string *const qualified_tensor_name, std::vector *const watchpoints_to_check) { for (auto w_table_item : watchpoint_table_) { auto wp = std::get<1>(w_table_item); // check ONLY init conditions on initial suspended state. // skip other conditions on initial suspended state if (init_dbg_suspend && (wp.condition.type != INIT)) { continue; } // skip init condition if not init suspend if ((wp.condition.type == INIT) && !init_dbg_suspend) { continue; } // check change conditions only on step end. if (wp.change_condition() && !step_end) { continue; } // if recheck, ignore the cache results and reanalyze everything. 
// if not a recheck, check only unanalyzed tensors if (!recheck) { wp_lock_.lock(); bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id); wp_lock_.unlock(); if (wp_cache_hit) { continue; } } std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot); if (!found.empty()) { *qualified_tensor_name = found; watchpoints_to_check->push_back(w_table_item.second); #ifdef OFFLINE_DBG_MODE if (wp.change_condition()) { *previous_iter_tensor_needed = true; } #endif } } } void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name) { // add analyzed tensor to cache if (!recheck) { wp_lock_.lock(); wp_id_cache_[tensor_name].insert(id); wp_lock_.unlock(); } } void DebugServices::CheckWatchpointsForTensor( partitioned_names *chunk_names, partitioned_names *chunk_slots, partitioned_numbers *chunk_conditions, partitioned_id *const chunk_watchpoint_id, partitioned_parameters *chunk_parameters, partitioned_error_code *chunk_error_codes, const std::vector &op_overflows, const std::vector &async_file_pool, partitioned_numbers *chunk_exec_orders, std::vector> *tensor_list, int begin, int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck, partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id, std::vector *chunk_tensor_byte_size, std::vector *device_id, std::vector *root_graph_id) { for (int i = begin; i < end; i++) { auto &tensor = (*tensor_list)[i]; #ifdef OFFLINE_DBG_MODE // read data in offline mode std::vector> result_list; ReadDumpedTensor(std::vector{tensor->GetName()}, std::vector{tensor->GetSlot()}, std::vector{tensor->GetDeviceId()}, std::vector{tensor->GetIteration()}, std::vector{tensor->GetRootGraphId()}, std::vector{tensor->GetIsOutput()}, async_file_pool, &result_list); tensor = result_list[0]; if (!tensor->GetByteSize()) { tensor.reset(); continue; } #endif const auto tensor_name = tensor->GetName(); const auto tensor_name_no_slot = 
tensor_name.substr(0, tensor_name.find_first_of(':')); const auto tensor_slot = std::to_string(tensor->GetSlot()); // no elements to analyze if (tensor->GetByteSize() == 0) { continue; } (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize(); int tensor_dtype = tensor->GetType(); std::vector watchpoints_to_check; std::string qualified_tensor_name; bool previous_iter_tensor_needed = false; // Add do nothing line in case offline debug is off, prevent unused var warning (void)previous_iter_tensor_needed; AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot, &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check); // no wp set on current tensor if (watchpoints_to_check.empty()) { continue; } uint32_t num_elements = tensor->GetNumElements(); #ifdef OFFLINE_DBG_MODE void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed); #else void *previous_tensor_ptr = tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr; #endif std::unique_ptr base_summary_ptr; if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) { base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype); if (base_summary_ptr != nullptr) { base_summary_ptr->SummarizeTensor(watchpoints_to_check); } } for (auto &wp : watchpoints_to_check) { bool is_hit = false; int error_code = 0; std::vector parameter_list = {}; if (wp.condition.type == IS_OVERFLOW) { is_hit = CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration()); } else if (base_summary_ptr != nullptr) { auto item = base_summary_ptr->IsWatchpointHit(wp); is_hit = std::get(item); error_code = std::get(item); parameter_list = std::get(item); } AddAnalyzedTensorToCache(recheck, wp.id, tensor_name); if (is_hit || error_code) { 
(*chunk_exec_orders)[chunk_id].push_back(tensor->GetExecutionOrder()); (*chunk_names)[chunk_id].push_back(qualified_tensor_name); (*chunk_slots)[chunk_id].push_back(tensor_slot); (*chunk_conditions)[chunk_id].push_back(wp.condition.type); (*chunk_watchpoint_id)[chunk_id].push_back(wp.id); if (device_id != nullptr) { (*chunk_device_id)[chunk_id].push_back(tensor->GetDeviceId()); } if (root_graph_id != nullptr) { (*chunk_root_graph_id)[chunk_id].push_back(tensor->GetRootGraphId()); } (*chunk_parameters)[chunk_id].push_back(parameter_list); (*chunk_error_codes)[chunk_id].push_back(error_code); } } #ifdef OFFLINE_DBG_MODE // in offline mode remove the need for the data tensor.reset(); #endif } } void DebugServices::CheckWatchpoints(std::vector *const name, std::vector *const slot, std::vector *const condition, std::vector *const watchpoint_id, std::vector> *const parameters, std::vector *const error_codes, const std::vector &op_overflows, const std::vector &async_file_pool, std::vector> *tensor_list, const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector *device_id, std::vector *root_graph_id) { std::lock_guard lg(lock_); auto t1 = std::chrono::high_resolution_clock::now(); if (watchpoint_table_.empty()) return; // vector to store execution order of tensors hit std::vector exec_order; int tensor_list_size = tensor_list->size(); uint64_t tensor_list_byte_size = 0; MS_LOG(INFO) << "tensor list size: " << tensor_list_size; if (tensor_list_size == 0) return; // default value for number of threads const int default_thread_num = 32; int max_thread_num = default_thread_num; if (max_thread_num > tensor_list_size) { max_thread_num = tensor_list_size; } MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num; int chunk_size = tensor_list_size / max_thread_num; int remainder = tensor_list_size % max_thread_num; partitioned_numbers chunk_exec_orders(max_thread_num); partitioned_names chunk_names(max_thread_num); partitioned_names 
chunk_slots(max_thread_num); partitioned_numbers chunk_conditions(max_thread_num); partitioned_id chunk_watchpoint_id(max_thread_num); partitioned_parameters chunk_parameters(max_thread_num); partitioned_error_code chunk_error_codes(max_thread_num); partitioned_id chunk_device_id(max_thread_num); partitioned_id chunk_root_graph_id(max_thread_num); std::vector chunk_tensor_byte_size(max_thread_num, 0); std::vector> tensor_future_vec; int begin = 0; int end = begin; for (int i = 0; i < max_thread_num; i++) { end += chunk_size; if (remainder > 0) { end++; remainder--; } tensor_future_vec.push_back( std::async(std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots, &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool, &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id, &chunk_root_graph_id, &chunk_tensor_byte_size, device_id, root_graph_id)); begin = end; } for (unsigned int i = 0; i < tensor_future_vec.size(); i++) { tensor_future_vec[i].wait(); tensor_future_vec[i].get(); for (unsigned int j = 0; j < chunk_exec_orders[i].size(); j++) { std::vector::iterator iter = std::lower_bound(exec_order.begin(), exec_order.end(), chunk_exec_orders[i][j]); // if the execution order is repeated,inserts the new one before the others with same execution order. 
int position = iter - exec_order.begin(); exec_order.insert(iter, chunk_exec_orders[i][j]); name->insert(name->begin() + position, chunk_names[i][j]); slot->insert(slot->begin() + position, chunk_slots[i][j]); condition->insert(condition->begin() + position, chunk_conditions[i][j]); watchpoint_id->insert(watchpoint_id->begin() + position, chunk_watchpoint_id[i][j]); if (device_id != nullptr) { device_id->insert(device_id->begin() + position, chunk_device_id[i][j]); } if (root_graph_id != nullptr) { root_graph_id->insert(root_graph_id->begin() + position, chunk_root_graph_id[i][j]); } parameters->insert(parameters->begin() + position, chunk_parameters[i][j]); error_codes->insert(error_codes->begin() + position, chunk_error_codes[i][j]); } // free the memory for used vectors std::vector().swap(chunk_exec_orders[i]); std::vector().swap(chunk_names[i]); std::vector().swap(chunk_slots[i]); std::vector().swap(chunk_conditions[i]); std::vector().swap(chunk_watchpoint_id[i]); std::vector>().swap(chunk_parameters[i]); std::vector().swap(chunk_error_codes[i]); std::vector().swap(chunk_device_id[i]); std::vector().swap(chunk_root_graph_id[i]); tensor_list_byte_size += chunk_tensor_byte_size[i]; } auto t2 = std::chrono::high_resolution_clock::now(); std::chrono::duration ms_double = t2 - t1; MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB"; MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s"; } #ifdef OFFLINE_DBG_MODE void DebugServices::ReadTensorFromNpy(const std::string &file_name, std::string *tensor_type, std::size_t *size, std::vector *shape, std::vector **data_buffer) { std::ifstream infile; std::string file_path = file_name; MS_LOG(INFO) << "Reading in file: " << file_path; infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in); if (!infile.is_open()) { MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno << " ErrInfo:" << 
strerror(errno); return; } uint64_t file_size = infile.tellg(); infile.seekg(0, std::ios::beg); auto buffer = std::make_unique>(file_size); if (!infile.read(buffer->data(), file_size)) { MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path; return; } const int substr_len = 2; const int header_len_offset = 8; const int header_offset = 9; const int type_offset = 10; uint16_t header_len = *reinterpret_cast(buffer->data() + header_len_offset); std::string header(buffer->data() + header_offset, header_len); std::size_t type_i = header.find("descr") + type_offset; if (header.length() < type_i + substr_len) { MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length(); return; } *tensor_type = header.substr(type_i, substr_len); std::size_t shape_i_open = header.find("("); std::size_t shape_i_close = header.find(")"); std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1); std::string intermediate; std::stringstream check_shape(shape_str); MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]"; while (getline(check_shape, intermediate, ',')) { shape->push_back(std::stoi(intermediate)); } std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1])); std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies()); std::size_t data_size = data_len * word_size; infile.seekg(header_len + type_offset); *data_buffer = new std::vector(data_size); if (data_buffer == nullptr || !infile.read((*data_buffer)->data(), data_size)) { MS_LOG(ERROR) << "Unable to get tensor data from npy"; } *size = data_size; } void DebugServices::ConvertToHostFormat(const std::map> &dir_to_files_map, std::vector *result_list) { std::string file_format = "npy"; for (auto const &d : dir_to_files_map) { std::vector files_to_convert_in_dir; std::string dump_key = d.first; for (auto const &file_name : d.second) { bool already_converted = false; // Remove scope from the file_name 
for matching files converted by mindinsight tool. std::size_t found_first_dot = file_name.find("."); std::size_t found_last_underscore = file_name.find_last_of("_"); std::string file_name_without_scope = file_name; if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) { file_name_without_scope = file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot); } for (std::string &file_found : *result_list) { if (file_found.find(file_name_without_scope) != std::string::npos) { already_converted = true; } } if (!already_converted) { files_to_convert_in_dir.push_back(dump_key + "/" + file_name); } } std::ostringstream input_file_o; const char *const delim = " "; std::copy(files_to_convert_in_dir.begin(), files_to_convert_in_dir.end(), std::ostream_iterator(input_file_o, delim)); std::string input_files = input_file_o.str(); MS_LOG(INFO) << "Ops to convert: " << input_files; if (input_files != "") { // Look for the installation path to the conver_async package. If not found, throw exception and terminate the // later task. 
try { auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async"); std::string convert_pkg_path = pkg.attr("__file__").cast(); MS_LOG(INFO) << "The file for converting async dump data is in " << convert_pkg_path; std::string convert_command = "python " + convert_pkg_path + " -out " + dump_key + " -t " + file_format + " -d " + dump_key + " -f NCHW -l " + input_files; (void)(system(convert_command.c_str()) + 1); } catch (pybind11::error_already_set &e) { MS_LOG(EXCEPTION) << "Can't find package mindspore.offline_debug.convert_async"; } std::string abspath = RealPath(dump_key); DIR *d_handle = opendir(abspath.c_str()); if (d_handle == nullptr) { MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat."; return; } struct dirent *dir = nullptr; while ((dir = readdir(d_handle)) != NULL) { if (dir->d_type == DT_REG) { std::string candidate = dir->d_name; for (const std::string &file_to_find : files_to_convert_in_dir) { std::string file_n = file_to_find.substr(file_to_find.find_last_of("\\/") + 1); if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) { // we found a converted file for this op std::string found_file = dump_key + "/" + candidate; if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) { result_list->push_back(found_file); } } } } } (void)closedir(d_handle); } } } void GetNodeNameWithoutScope(std::string *dump_style_name) { if (dump_style_name == nullptr) { return; } std::string node_name_without_scope = *dump_style_name; std::size_t last_scope_marker; std::string delim = "/"; last_scope_marker = node_name_without_scope.rfind(delim); if (last_scope_marker != std::string::npos) { node_name_without_scope = node_name_without_scope.substr(last_scope_marker + delim.size()); } *dump_style_name = node_name_without_scope; } void ReplaceSrcFileName(std::string *dump_style_name) { if (dump_style_name == nullptr) { return; } const std::string strsrc = "/"; 
std::string strdst = "_"; std::string::size_type pos = 0; std::string::size_type srclen = strsrc.size(); std::string::size_type dstlen = strdst.size(); while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) { dump_style_name->replace(pos, srclen, strdst); pos += dstlen; } } void DebugServices::ConvertReadTensors(std::vector backend_name, std::vector slot, std::vector device_id, std::vector iteration, std::vector root_graph_id, std::vector *result_list) { std::string file_format = "npy"; std::map> dir_to_files_map; for (unsigned int i = 0; i < backend_name.size(); i++) { // form prefix of the tensor file to read from graph pb node name std::string dump_style_kernel_name = backend_name[i]; // remove slot from name std::size_t found_colon = dump_style_kernel_name.find_last_of(":"); dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon); std::string prefix_dump_file_name = dump_style_kernel_name; GetNodeNameWithoutScope(&prefix_dump_file_name); std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" + std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]); // search files in dir for the one that meets the filename prefix and read the file into memory std::string abspath = RealPath(specific_dump_dir); DIR *d = opendir(abspath.c_str()); if (d == nullptr) { MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors."; return; } else { struct dirent *dir = nullptr; while ((dir = readdir(d)) != NULL) { if (dir->d_type == DT_REG) { std::string file_name = dir->d_name; std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1); if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos && file_name.rfind(file_format) == std::string::npos) { // if file matches prefix and is in device format add to candidate files to convert. 
dir_to_files_map[specific_dump_dir].push_back(file_name); } else if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos && file_name.rfind(file_format) != std::string::npos) { // otherwise, if file matches prefix and already has been converted to host format // add to result of converted files. std::string found_file = specific_dump_dir + "/" + file_name; if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) { result_list->push_back(found_file); } } } } (void)closedir(d); } } ConvertToHostFormat(dir_to_files_map, result_list); } void DebugServices::ConvertWatchPointNodes(const std::vector> &proto_dump, const std::string &specific_dump_dir, std::vector *result_list) { std::string file_format = "npy"; std::map> dir_to_files_map; for (const auto &node : proto_dump) { std::string dump_name = std::get<1>(node); dump_name = dump_name.substr(0, dump_name.rfind(".")); // search files in dir for the one that meets the filename prefix and read the file into memory std::string abspath = RealPath(specific_dump_dir); DIR *d = opendir(abspath.c_str()); if (d == nullptr) { MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes."; return; } else { struct dirent *dir = nullptr; while ((dir = readdir(d)) != NULL) { if (dir->d_type == DT_REG) { std::string file_name = dir->d_name; std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1); if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos && file_name.rfind(file_format) == std::string::npos) { // if file matches prefix and is in device format add to candidate files to convert. dir_to_files_map[specific_dump_dir].push_back(file_name); } else if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos && file_name.rfind(file_format) != std::string::npos) { // otherwise, if file matches prefix and already has been converted to host format // add to result of converted files. 
std::string found_file = specific_dump_dir + "/" + file_name; if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) { result_list->push_back(found_file); } } } } (void)closedir(d); } } ConvertToHostFormat(dir_to_files_map, result_list); } void DebugServices::GetTensorDataInfoAsync(const std::vector> &proto_dump, const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id, uint32_t root_graph_id, const std::vector &async_file_pool, std::vector> *tensor_list) { for (auto &node : proto_dump) { std::vector slot_list; std::string dump_style_name = std::get<1>(node); // Get dump_name and output_str from the second element of tuple std::size_t found_dot = dump_style_name.rfind("."); std::string dump_name = dump_style_name.substr(0, found_dot); std::string output_str = dump_style_name.substr(found_dot + 1); bool output_flag = (output_str == "output"); for (const std::string &file_name : async_file_pool) { std::size_t found = file_name.find(dump_name); std::size_t found_out = file_name.find(output_str); std::size_t found_dot_start = file_name.find(".", found_out); std::size_t found_dot_end = file_name.find(".", found_dot_start); if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos && found_out != std::string::npos) { slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1))); } } for (auto slot : slot_list) { // add a TensorData entry (data will be read when needed) std::vector shape; std::string orig_name = std::get<0>(node); auto tensor_data = std::make_shared(); tensor_data->SetName(orig_name); tensor_data->SetExecutionOrder(0); tensor_data->SetSlot(slot); tensor_data->SetIteration(iteration); tensor_data->SetDeviceId(device_id); tensor_data->SetRootGraphId(root_graph_id); tensor_data->SetDataPtr(NULL); tensor_data->SetByteSize(0); tensor_data->SetType(""); tensor_data->SetShape(shape); tensor_data->SetIsOutput(output_flag); 
tensor_list->push_back(tensor_data); } } } void DebugServices::AddToTensorData(const std::string &backend_name, const std::size_t slot, const unsigned int iteration, const unsigned int device_id, const unsigned int root_graph_id, const bool is_output, const std::size_t data_size, const std::string &type_name, const std::vector &shape, std::vector *buffer, std::vector> *result_list) { // call LoadNewTensor to store tensor in internal cache auto tensor_data = std::make_shared(); tensor_data->SetName(backend_name); tensor_data->SetExecutionOrder(0); tensor_data->SetSlot(slot); tensor_data->SetIteration(iteration); tensor_data->SetDeviceId(device_id); tensor_data->SetRootGraphId(root_graph_id); tensor_data->SetIsOutput(is_output); if (data_size) { tensor_data->SetDataPtr(buffer->data()); } else { tensor_data->SetDataPtr(NULL); } tensor_data->SetByteSize(data_size); tensor_data->SetType(type_name); tensor_data->SetShape(shape); if (data_size) { tensor_loader_->LoadNewTensor(tensor_data, false); } // add to result_list result_list->push_back(tensor_data); } void DebugServices::SetPrefixToCheck(std::string *prefix_dump_file_name, std::string *slot_string_to_check, std::string *dump_style_kernel_name, size_t slot, bool is_output) { std::string dump_style_name_part = *dump_style_kernel_name; GetNodeNameWithoutScope(&dump_style_name_part); std::string slot_str; if (is_output) { slot_str = ".output." + std::to_string(slot); } else { slot_str = ".input." + std::to_string(slot); } dump_style_name_part += slot_str; *prefix_dump_file_name = dump_style_name_part; *slot_string_to_check = slot_str; } std::string GetNewestFilePath(std::vector file_list) { // get file with the newest timestamp from the list. 
std::string newest_file; if (file_list.empty()) { return newest_file; } std::sort(file_list.begin(), file_list.end()); return file_list.back(); } void DebugServices::ReadDumpedTensor(std::vector backend_name, std::vector slot, std::vector device_id, std::vector iteration, std::vector root_graph_id, const std::vector &is_output, const std::vector &async_file_pool, std::vector> *result_list) { for (unsigned int i = 0; i < backend_name.size(); i++) { // form prefix of the tensor file to read from graph pb node name std::string dump_style_kernel_name = backend_name[i]; // remove slot from name std::size_t found_colon = dump_style_kernel_name.find_last_of(":"); dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon); std::string slot_string_to_check; std::string prefix_dump_file_name; SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]); std::string prefix_dump_to_check = dump_style_kernel_name; GetNodeNameWithoutScope(&prefix_dump_to_check); std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" + std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]); // search files in dir for the one that meets the filename prefix and read the file into memory if (is_sync_mode_) { ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i], iteration[i], root_graph_id[i], is_output[i], result_list); } else { ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i], device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list); } } } void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir, const std::string &backend_name, size_t slot, unsigned int device_id, unsigned int iteration, unsigned int root_graph_id, const bool &is_output, std::vector> *result_list) { 
std::vector *buffer = NULL; std::string type_name = ""; std::vector shape; uint64_t data_size = 0; std::string abspath = RealPath(specific_dump_dir); DIR *d = opendir(abspath.c_str()); bool found_file = false; std::vector matched_paths; if (d == nullptr) { MS_LOG(ERROR) << "Directory " << specific_dump_dir << " does not exist!"; return; } struct dirent *dir = nullptr; while ((dir = readdir(d)) != NULL) { if (dir->d_type == DT_REG) { std::string file_name = dir->d_name; std::string stripped_file_name = GetStrippedFilename(file_name); if (stripped_file_name.empty()) { continue; } std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0); if (found != 0) { continue; } std::string full_path = specific_dump_dir + "/" + file_name; matched_paths.push_back(full_path); found_file = true; } } if (found_file) { shape.clear(); std::string result_path = GetNewestFilePath(matched_paths); ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer); AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape, buffer, result_list); } else { AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer, result_list); MS_LOG(INFO) << "Target tensor has not been found."; } (void)closedir(d); } void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check, const std::string &slot_string_to_check, const std::string &backend_name, size_t slot, unsigned int device_id, unsigned int iteration, unsigned int root_graph_id, const bool &is_output, const std::vector &async_file_pool, std::vector> *result_list) { std::vector *buffer = NULL; std::string type_name = ""; std::vector shape; uint64_t data_size = 0; bool found = false; std::vector matched_paths; // if async mode for (const std::string &file_path : async_file_pool) { if (file_path.find(specific_dump_dir) != std::string::npos && 
// NOTE(review): extraction-mangled lines (stripped template args, collapsed
// newlines). Code kept byte-identical.
//
// Line 1 below:
// 1) Tail of DebugServices::ReadDumpedTensorAsync: a pool entry matches when
//    its path contains the dump dir, the scope-less node prefix, AND the slot
//    substring. The newest match is read with ReadTensorFromNpy and recorded;
//    with no match an empty tensor entry is recorded instead (substring
//    matching, not anchored — a node name that is a substring of another
//    could over-match; TODO confirm SetPrefixToCheck makes this unambiguous).
// 2) DebugServices::GetStrippedFilename start: strips task_id, stream_id and
//    timestamp from a dump filename so it can be compared against a node
//    prefix. Dots are located from the END of the name because the node name
//    itself may contain dots. NOTE(review): only fifth_dot is checked against
//    npos; a filename with too few dots makes `seventh_dot - 1` wrap before
//    the check — presumed benign because rfind(npos-ish) stays npos, but
//    worth verifying with a malformed filename.
file_path.find(prefix_dump_to_check) != std::string::npos && file_path.find(slot_string_to_check) != std::string::npos) { matched_paths.push_back(file_path); found = true; } } if (found) { shape.clear(); std::string result_path = GetNewestFilePath(matched_paths); ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer); AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape, buffer, result_list); } else { // If no npy file is found, add empty tensor data. AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer, result_list); MS_LOG(INFO) << "Target tensor has not been found."; } } std::string DebugServices::GetStrippedFilename(const std::string &file_name) { // strip off the task_id, stream_id, and timestamp, then compare size_t first_dot = file_name.find("."); size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1); size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1); if (fifth_dot == std::string::npos) { return std::string(); } // Look for the second dot's position from the back to avoid issue due to dots in the node name.
// Line 2 below:
// 1) GetStrippedFilename tail: walks back three rfind steps from fifth_dot to
//    reach the second dot, then concatenates the [first_dot+1, second_dot)
//    segment (node name) with the [fifth_dot, seventh_dot) segment (slot and
//    format markers) to form the stripped name.
// 2) DebugServices::ReadNeededDumpedTensors start: iterates the watchpoint
//    table and builds a map from (device, root_graph) pairs to the list of
//    watched nodes, by expanding each check node's device and graph lists.
//    check_node_device_list/check_node_graph_list are indexed in lockstep
//    with check_node_list via `index` — presumably kept parallel by
//    AddWatchpoint; TODO confirm.
size_t second_dot = fifth_dot; const int8_t kSecondDotPosition = 2; for (int8_t pos = 5; pos > kSecondDotPosition; pos--) { second_dot = file_name.rfind(".", second_dot - 1); } std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1); std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot); std::string stripped_file_name = start_string + end_string; return stripped_file_name; } std::vector> DebugServices::ReadNeededDumpedTensors( unsigned int iteration, std::vector *async_file_pool) { // get a list of nodes and the devices they are on to monitor std::vector> tensor_list; std::map, std::vector>> device_and_graph_to_nodes; for (auto w_table_item : watchpoint_table_) { auto wp = std::get<1>(w_table_item); unsigned int index = 0; for (auto check_node : wp.check_node_list) { std::vector devices = std::get<1>(wp.check_node_device_list[index]); std::vector graphs = std::get<1>(wp.check_node_graph_list[index]); for (auto device : devices) { for (auto graph : graphs) { std::tuple key(device, graph); device_and_graph_to_nodes[key].push_back(check_node); } } index++; } } // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list // as they are found for (auto const &device_and_graph_item : device_and_graph_to_nodes) { std::tuple device_and_graph = device_and_graph_item.first; uint32_t device_id = std::get<0>(device_and_graph); uint32_t root_graph_id = std::get<1>(device_and_graph); std::vector> wp_nodes = device_and_graph_item.second; std::vector> proto_to_dump; std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" + std::to_string(root_graph_id) + "/" + IterationString(iteration); // convert node names to dump style for (auto node : wp_nodes) { std::string orig_name = std::get<0>(node); std::string dump_style_name = orig_name; // Remove the scope from the fully qualified name to compare for both sync and async case.
// NOTE(review): extraction-mangled lines (stripped template args, collapsed
// newlines — inline `//` comments swallow following code). Code byte-identical.
//
// Line 1 below:
// 1) ReadNeededDumpedTensors continued: appends ".output"/".input" to each
//    scope-stripped node name to form the dump-style prefix. Async mode:
//    ConvertWatchPointNodes converts the device dump files to npy and fills
//    async_file_pool. Sync mode: scans the iteration directory, matches each
//    file's stripped name against the prefixes, parses the slot number that
//    follows the prefix with std::stoul, and adds a placeholder (empty)
//    tensor entry per hit. NOTE(review): std::stoul throws on a non-numeric
//    suffix — a malformed filename would propagate an exception out of this
//    loop with the DIR handle still open; consider guarding. Otherwise
//    GetTensorDataInfoAsync harvests entries from the async file pool.
// 2) DebugServices::IterationString start: maps iteration UINT_MAX to the
//    special "init" directory name, else the decimal iteration number; the
//    `return` statement is split across the physical line boundary.
GetNodeNameWithoutScope(&dump_style_name); bool node_is_out = std::get<1>(node); if (node_is_out) { dump_style_name += ".output"; } else { dump_style_name += ".input"; } proto_to_dump.push_back(std::tuple(orig_name, dump_style_name)); } if (!is_sync_mode_) { // convert all files in proto_to_dump to npy and add to pool of async file names ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool); } if (is_sync_mode_) { // search files in dir for the one that meets the filename prefix and read the file into memory std::string abspath = RealPath(specific_dump_dir); DIR *d = opendir(abspath.c_str()); if (d == nullptr) { MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors."; } else { struct dirent *dir = nullptr; while ((dir = readdir(d)) != NULL) { if (dir->d_type == DT_REG) { std::string file_name = dir->d_name; for (auto &node : proto_to_dump) { std::string dump_name = std::get<1>(node); std::string stripped_file_name = GetStrippedFilename(file_name); if (stripped_file_name.empty()) { continue; } std::size_t found = stripped_file_name.rfind(dump_name, 0); if (found == 0) { size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1)); std::vector shape; std::string orig_name = std::get<0>(node); std::string output_str = dump_name.substr(dump_name.rfind(".") + 1); bool output_flag = (output_str == "output"); AddToTensorData(orig_name, slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, NULL, &tensor_list); break; } } } } (void)closedir(d); } } else { GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool, &tensor_list); } } return tensor_list; } std::string DebugServices::IterationString(unsigned int iteration) { std::string iteration_string; bool init_dbg_suspend = (iteration == UINT_MAX); if (init_dbg_suspend) { iteration_string = "init"; } else { iteration_string = std::to_string(iteration); } return
// Line 2 below:
// 1) IterationString tail, then #endif closing the offline-only section.
// 2) DebugServices::ReadNodesTensors: looks tensors up in tensor_loader_ and,
//    for each entry that was actually found (non-null shared_ptr), appends
//    name, raw data pointer, byte size, dtype and shape to the parallel
//    output vectors. Entries not found are silently skipped, so the output
//    vectors can be shorter than `name`.
// 3) DebugServices::SearchNodesTensors: null-guards result_list, then
//    delegates to tensor_loader_->SearchTensors.
// 4) DebugServices::IsWatchPoint (ONLINE_DBG_MODE): a kernel is watched when
//    a scope-type watch node (w_type true) is a prefix of the kernel name or
//    is the "*" wildcard, or a name-type watch node (w_type false) equals the
//    kernel name or is one of the kernel's inputs.
// 5) DebugServices::IsWatchPointNodeInput start: compares the last path
//    component of w_name (after the final '/') against each input kernel's
//    name; the comparison is split across the physical line boundary.
iteration_string; } #endif void DebugServices::ReadNodesTensors(const std::vector &name, std::vector *const ret_name, std::vector *const data_ptr, std::vector *const data_size, std::vector *const dtype, std::vector> *const shape) { std::vector>> result_list; tensor_loader_->SearchTensors(name, &result_list); for (auto result : result_list) { if (!std::get<1>(result)) { continue; } ret_name->push_back(std::get<0>(result)); data_ptr->push_back(reinterpret_cast(std::get<1>(result)->GetDataPtr())); data_size->push_back(std::get<1>(result)->GetByteSize()); dtype->push_back(std::get<1>(result)->GetType()); shape->push_back(std::get<1>(result)->GetShape()); } } void DebugServices::SearchNodesTensors(const std::vector &name, std::vector>> *result_list) { if (!result_list) { MS_LOG(DEBUG) << "result_list is nullptr."; return; } tensor_loader_->SearchTensors(name, result_list); } #ifdef ONLINE_DBG_MODE bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const { bool ret = false; for (auto w_table_item : watchpoint_table_) { auto check_node_list = std::get<1>(w_table_item).check_node_list; for (auto check_node : check_node_list) { std::string w_name = std::get<0>(check_node); bool w_type = std::get<1>(check_node); if ((w_type == true && ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) || (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) { ret = true; return ret; } } } return ret; } bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const { if (kernel && w_name.length() > 0) { auto input_size = AnfAlgo::GetInputTensorNum(kernel); for (size_t j = 0; j < input_size; ++j) { auto input_kernel = kernel->input(j + 1); std::string input_kernel_name = GetKernelNodeName(input_kernel); auto found = w_name.find_last_of('/'); if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) ==
// NOTE(review): extraction-mangled lines (stripped template args, collapsed
// newlines). Code kept byte-identical.
//
// Line 1 below:
// 1) IsWatchPointNodeInput tail: returns true on the first matching input,
//    false otherwise (or when kernel is null / w_name empty).
// 2) Thin delegating wrappers over tensor_loader_: EmptyTensor, GetTensor,
//    GetNodeTensorMap, GetTensorLoaderIterNum/SetTensorLoaderIterNum,
//    EmptyPrevTensor, EmptyCurrentTensor, DumpTensorToFile (online only),
//    LoadNewTensor. GetWatchpointTable returns the table BY VALUE — each call
//    copies the whole map; fine for debugger cadence, but avoid in loops.
// 3) ResetLoadedTensors: clears the watchpoint-hit cache, moves parameter
//    tensors current→prev, empties the current map, swaps current/prev, and
//    clears the memoized overflow results.
// 4) GetNodeTensor start (online only): collects the loaded tensor for each
//    output slot of `kernel` under the "<kernel_name>:<slot>" key; the loop
//    body continues on the next physical line.
input_kernel_name) return true; } return false; } else { return false; } } #endif void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); } std::vector> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); } std::vector> DebugServices::GetNodeTensorMap(const std::string &node_name) const { return tensor_loader_->GetNodeTensorMap(node_name); } uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); } void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); } void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); } void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); } #ifdef ONLINE_DBG_MODE bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath, const std::string &host_fmt, const std::vector &host_shape, TypeId host_type, TypeId device_type, const std::string &addr_format, size_t slot) const { return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type, device_type, addr_format, slot); } #endif bool DebugServices::LoadNewTensor(const std::shared_ptr &tensor, bool keep_prev) { return tensor_loader_->LoadNewTensor(tensor, keep_prev); } std::unordered_map DebugServices::GetWatchpointTable() { return watchpoint_table_; } void DebugServices::ResetLoadedTensors() { wp_id_cache_.clear(); MS_LOG(INFO) << "Resetting loaded tensors"; tensor_loader_->MoveParametersCurrentToPrev(); tensor_loader_->EmptyCurrentTensor(); // will move parameters from previous to current map tensor_loader_->SwapCurrentPrev(); overflow_ops_.clear(); } #ifdef ONLINE_DBG_MODE std::vector> DebugServices::GetNodeTensor(const CNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); std::vector> result; auto output_size = AnfAlgo::GetOutputTensorNum(kernel); auto kernel_name = GetKernelNodeName(kernel); for (size_t j = 0; j < output_size; ++j) {
// Line 2 below:
// 1) GetNodeTensor tail: pushes only tensors actually present in the loader.
// 2) CheckOpOverflow start: reports whether an Ascend op-overflow was
//    recorded for node_name_to_find ('/' normalized to '_'). Online build
//    resolves the overflow bin dir via DumpJsonParser + Common::GetRealPath;
//    offline build derives it from dump_dir_/rank/graph/iteration. Results
//    per directory are memoized in overflow_ops_ under overflow_wp_lock_.
//    NOTE(review): the lock is taken with manual .lock()/.unlock(); any
//    exception between them (e.g. from directory scanning) leaves the mutex
//    held forever — this should be a std::lock_guard. Flagged only, since a
//    code change is out of scope for this comment-only pass.
auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j); auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot); if (tensor) result.push_back(tensor); } return result; } #endif bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id, unsigned int iteration) { std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_'); std::vector op_names; std::string overflow_bin_path; #ifdef ONLINE_DBG_MODE auto debugger = Debugger::GetInstance(); overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->graph_id()); auto realpath = Common::GetRealPath(overflow_bin_path); if (!realpath.has_value()) { MS_LOG(ERROR) << "Get real path failed for overflow_bin_path."; return false; } overflow_bin_path = realpath.value(); #else overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" + std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/"; overflow_bin_path = RealPath(overflow_bin_path); #endif overflow_wp_lock_.lock(); MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find; auto found_overflows = overflow_ops_.find(overflow_bin_path); if (found_overflows != overflow_ops_.end()) { MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path; op_names = overflow_ops_[overflow_bin_path]; } else { std::map, std::string> task_stream_to_opname; std::vector> task_stream_hit; const std::string overflow_file_prefix = "Opdebug.Node_OpDebug."; MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path; std::string abspath = RealPath(overflow_bin_path); DIR *d = opendir(abspath.c_str()); if (d == nullptr) { MS_LOG(ERROR) << "OverFlow bin directory does not exist!"; } else { struct dirent *dir = nullptr; while ((dir = readdir(d)) != nullptr) { if (dir->d_type == DT_REG) { // form fully qualified filename std::string file_path = overflow_bin_path; std::string file_name =
// NOTE(review): extraction-mangled lines (stripped template args, collapsed
// newlines). Code kept byte-identical.
//
// Line 1 below — CheckOpOverflow per-file parsing:
// Opens each regular file (ios::ate|binary). Files named
// "Opdebug.Node_OpDebug.*" are the overflow detector's own dump: seek to
// byte offset 321 (start of the op-overflow info section per the Ascend
// overflow-dump layout — TODO confirm against the format spec), read a
// 256-byte section, and pull stream_id (byte 16) and task_id (byte 24);
// both are stored as 8-byte little-endian fields.
dir->d_name; file_path.append(file_name); // attempt to read the file std::ifstream infile; infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in); if (!infile.is_open()) { MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno << " ErrInfo:" << strerror(errno); continue; } std::string node_name; uint64_t task_id = 0; uint64_t stream_id = 0; // detect overflow bin file if (file_name.rfind(overflow_file_prefix, 0) == 0) { // start of op overflow data in bin file const uint32_t offset = 321; (void)infile.seekg(offset, std::ios::beg); std::vector buffer; // size of op overflow info section const size_t buf_size = 256; buffer.resize(buf_size); (void)infile.read(buffer.data(), buf_size); if (infile.gcount() != buf_size) { MS_LOG(ERROR) << "The file: " << file_path << "may be damaged!"; continue; } const uint8_t stream_id_offset = 16; const uint8_t task_id_offset = 24; // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4 // byte values currently.
// Line 2 below:
// 1) CheckOpOverflow tail: decodes the two fields via BytestoUInt64
//    (little-endian), records the (task_id, stream_id) hit; regular dump
//    files are instead parsed by GetAttrsFromAsyncFilename into a
//    (task, stream) -> op-name map. The two are then joined: every hit with
//    a known op name is an overflowed op. The per-directory result is cached
//    in overflow_ops_ before the mutex is released, and the final answer is
//    a membership test of node_name_to_find in the op-name list.
// 2) GetAttrsFromAsyncFilename start: parses
//    node_type.node_name.task_id.stream_id.timestamp (node_name may itself
//    contain dots, hence the rfind-from-the-back strategy), validating dot
//    positions and rejecting ".npy" files.
stream_id = BytestoUInt64(std::vector(buffer.begin() + stream_id_offset, buffer.end())); task_id = BytestoUInt64(std::vector(buffer.begin() + task_id_offset, buffer.end())); MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id << "."; task_stream_hit.push_back(std::make_pair(task_id, stream_id)); } else { // regular bin file bool success_parse = GetAttrsFromAsyncFilename(file_name, &node_name, &task_id, &stream_id); if (success_parse) { task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name; } } infile.close(); } } (void)closedir(d); } // find the op_names with an overflow hit for (auto &task_stream : task_stream_hit) { auto op_name = task_stream_to_opname[task_stream]; if (!op_name.empty()) { MS_LOG(INFO) << "Operation overflow detected in " << op_name; op_names.push_back(op_name); } } overflow_ops_[overflow_bin_path] = op_names; } overflow_wp_lock_.unlock(); // determine if overflow wp has been triggered for node_name_to_find if (find(op_names.begin(), op_names.end(), node_name_to_find) != op_names.end()) { MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find; return true; } return false; } bool DebugServices::GetAttrsFromAsyncFilename(const std::string &file_name, std::string *node_name, uint64_t *task_id, uint64_t *stream_id) { // get the node_name, task_id, and stream_id from async dump filename // node_type.node_name.task_id.stram_id.timestamp // WARNING: node_name may have dots in it size_t fourth_dot = file_name.rfind("."); size_t third_dot = file_name.rfind(".", fourth_dot - 1); size_t second_dot = file_name.rfind(".", third_dot - 1); size_t first_dot = file_name.find("."); // check if dots were found if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos || fourth_dot == std::string::npos) { return false; } // check if its not an async bin file if (file_name.substr(fourth_dot) == ".npy") { return false; } //
// NOTE(review): extraction-mangled lines (stripped template args, collapsed
// newlines). Code kept byte-identical.
//
// Line 1 below:
// 1) GetAttrsFromAsyncFilename tail: extracts node_name between the first
//    and second dot, then task_id and stream_id via std::stoull with
//    catch-all guards; any ordering violation or parse failure logs an error
//    and returns false without touching the out-params further.
// 2) DebugServices::RealPath start: canonicalizes a path. Rejects paths at
//    or over PATH_MAX and file names over NAME_MAX with MS_LOG(EXCEPTION).
//    For "dir/file" inputs, realpath(3) is applied to the directory prefix
//    only and the (possibly nonexistent) file name is re-appended, so a
//    not-yet-created file under an existing directory still resolves.
get node_name if (first_dot < second_dot) { *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1); } else { MS_LOG(ERROR) << "Async filename parse error to get node_name."; return false; } // get task id if (second_dot < third_dot) { std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1); try { *task_id = std::stoull(extracted_task_id); } catch (...) { MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id."; return false; } } else { MS_LOG(ERROR) << "Async filename parse error to get task_id."; return false; } // get stream id if (third_dot < fourth_dot) { std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1); try { *stream_id = std::stoull(extracted_stream_id); } catch (...) { MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id."; return false; } } else { MS_LOG(ERROR) << "Async filename parse error to get stream_id."; return false; } return true; } std::string DebugServices::RealPath(const std::string &input_path) { if (input_path.length() >= PATH_MAX) { MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX; } size_t path_split_pos = input_path.find_last_of('/'); // get real path char real_path[PATH_MAX] = {0}; // input_path is dir + file_name if (path_split_pos != std::string::npos) { std::string prefix_path = input_path.substr(0, path_split_pos); std::string file_name = input_path.substr(path_split_pos); if (file_name.length() > NAME_MAX) { MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX; } if (realpath(prefix_path.c_str(), real_path) == nullptr) { MS_LOG(ERROR) << "The dir " << prefix_path << " does not exist."; return ""; } return std::string(real_path) + file_name; } // input_path is only file_name if (input_path.length() > NAME_MAX) { MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " <<
// Line 2 below:
// 1) RealPath tail: bare-filename case — a failed realpath(3) is tolerated
//    (file will be created later) and whatever landed in real_path is
//    returned (empty string when realpath never wrote to it).
// 2) BytestoUInt64: interprets the first 8 bytes of the buffer as a
//    little-endian u64 via le64toh. NOTE(review): the reinterpret_cast of
//    vector data to a wider integer type is technically unaligned/aliasing
//    UB; memcpy into a uint64_t would be the strictly-correct form.
// 3) TensorExistsInCurrent / MoveTensorCurrentToPrev: tensor_loader_
//    delegates; then trivial setters/getters for net name, dump dir and
//    sync mode; finally the conditional namespace close for online mode.
NAME_MAX; } if (realpath(input_path.c_str(), real_path) == nullptr) { MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created."; } return std::string(real_path); } uint64_t DebugServices::BytestoUInt64(const std::vector &buffer) { return le64toh(*reinterpret_cast(buffer.data())); } bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) { return tensor_loader_->TensorExistsInCurrent(tensor_name); } void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) { tensor_loader_->MoveTensorCurrentToPrev(tensor_name); } void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; } std::string DebugServices::GetNetName() { return net_name_; } void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; } std::string DebugServices::GetDumpDir() { return dump_dir_; } void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; } bool DebugServices::GetSyncMode() { return is_sync_mode_; } #ifdef ONLINE_DBG_MODE } // namespace mindspore #endif