/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// NOTE(review): the include targets and all template arguments in this file were restored after an
// extraction step stripped everything between '<' and '>'. Confirm them against debug/debug_services.h.
#include "debug/debug_services.h"
#include <dirent.h>
#include <algorithm>
#include <functional>
#include <fstream>
#include <iterator>
#include <map>
#include <numeric>
#include <unordered_set>
#ifdef ONLINE_DBG_MODE
#include "backend/session/anf_runtime_algorithm.h"
#endif
#include "debug/debugger/tensor_summary.h"

#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif

// Creates a DebugServices instance with a fresh tensor loader.
DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }

// Copy constructor: shares the tensor loader pointer and copies the watchpoint table.
DebugServices::DebugServices(const DebugServices &other) {
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table = other.watchpoint_table;
}

// Copy assignment: shares the tensor loader pointer and copies the watchpoint table.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table = other.watchpoint_table;
  }
  return *this;
}

// Registers (or overwrites) the watchpoint with the given id in the watchpoint table.
// check_node_device_list and check_node_graph_list are optional; when nullptr the corresponding
// lists of the stored watchpoint stay empty.
void DebugServices::AddWatchpoint(
  unsigned int id, unsigned int watch_condition, float parameter,
  const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  std::lock_guard<std::mutex> lg(lock_);

  watchpoint_t watchpoint_item;
  watchpoint_item.id = id;
  watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  watchpoint_item.condition.parameter = parameter;
  watchpoint_item.check_node_list = check_node_list;
  if (check_node_device_list != nullptr) {
    watchpoint_item.check_node_device_list = *check_node_device_list;
  }
  if (check_node_graph_list != nullptr) {
    watchpoint_item.check_node_graph_list = *check_node_graph_list;
  }
  watchpoint_item.parameter_list = parameter_list;
  watchpoint_table[id] = watchpoint_item;
}

// Removes the watchpoint with the given id (no-op if it is not present).
void DebugServices::RemoveWatchpoint(unsigned int id) {
  std::lock_guard<std::mutex> lg(lock_);
  watchpoint_table.erase(id);
}

// Builds the typed tensor summary object matching tensor_dtype.
// Returns a null pointer (after logging) for unsupported data types.
std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
                                              void *const previous_tensor_ptr, uint32_t num_elements,
                                              int tensor_dtype) {
  switch (tensor_dtype) {
    case DbgDataType::DT_UINT8: {
      return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT8: {
      return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT16: {
      return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT16: {
      return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT32: {
      return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT32:
    case DbgDataType::DT_BASE_INT: {
      return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT64: {
      return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT64: {
      return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT16: {
      return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT32:
    case DbgDataType::DT_BASE_FLOAT: {
      return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT64: {
      return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_BOOL: {
      return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    default:
      MS_LOG(INFO) << "Unsupported tensor type";
      // return a null pointer
      return std::unique_ptr<TensorSummary<int32_t>>{};
  }
}

#ifdef OFFLINE_DBG_MODE
// Offline mode only: reads the previous iteration's dump of `tensor` and returns a pointer to its data.
// Returns nullptr when the previous tensor is not needed, when the tensor is at iteration <= 1,
// or when nothing (or an empty tensor) could be read.
void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
  void *previous_tensor_ptr = nullptr;
  std::shared_ptr<TensorData> tensor_prev;
  if (previous_iter_tensor_needed && tensor->GetIteration() > 1) {
    // read data in offline mode
    std::vector<std::string> file_paths;
    if (!is_sync_mode) {
      ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                         std::vector<unsigned int>{tensor->GetDeviceId()},
                         std::vector<unsigned int>{tensor->GetIteration() - 1},
                         std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
    }
    std::vector<std::shared_ptr<TensorData>> result_list_prev;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration() - 1},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, file_paths, &result_list_prev);
    // ReadDumpedTensor may add nothing (sync mode with no matching file); do not index an empty vector.
    if (result_list_prev.empty()) {
      return previous_tensor_ptr;
    }
    tensor_prev = result_list_prev[0];
    if (!tensor_prev->GetByteSize()) {
      tensor_prev.reset();
    } else {
      previous_tensor_ptr = tensor_prev->GetDataPtr();
    }
  }
  return previous_tensor_ptr;
}
#endif

// Collects into watchpoints_to_check every watchpoint in the table that applies to the given tensor
// under the current state (initial suspend / step end / recheck). When at least one matches,
// *qualified_tensor_name is set to the name reported by the last matching watchpoint. In offline mode
// *previous_iter_tensor_needed is set to true when a matching watchpoint is a change condition.
void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
                                          const std::string &tensor_name, const std::string &tensor_name_no_slot,
                                          bool *previous_iter_tensor_needed, std::string *const qualified_tensor_name,
                                          std::vector<watchpoint_t> *const watchpoints_to_check) {
  for (const auto &w_table_item : watchpoint_table) {
    // work on a copy, as the original code did
    auto wp = w_table_item.second;
    // check ONLY init conditions on initial suspended state.
    // skip other conditions on initial suspended state
    if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
    // skip init condition if not init suspend
    if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
    // check change conditions only on step end.
    if (wp.change_condition() && !step_end) continue;
    // if recheck, ignore the cache results and reanalyze everything.
    // if not a recheck, check only unanalyzed tensors
    if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
    std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
    if (!found.empty()) {
      *qualified_tensor_name = found;
      watchpoints_to_check->push_back(w_table_item.second);
#ifdef OFFLINE_DBG_MODE
      if (wp.change_condition()) {
        *previous_iter_tensor_needed = true;
      }
#endif
    }
  }
}

// Records that watchpoint `id` has been analyzed for `tensor_name`, unless this is a recheck.
void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
                                             const std::string &tensor_name) {
  // add analyzed tensor to cache
  if (!recheck) {
    wp_id_cache[tensor_name].insert(id);
  }
}

// Evaluates all applicable watchpoints against every tensor in tensor_list.
// For each hit (or non-zero error code) one entry is inserted into each of the output vectors
// (name, slot, condition, watchpoint_id, parameters, error_codes and, when non-null, device_id and
// root_graph_id) at the same position, kept sorted by the tensor's execution order.
// In offline mode each tensor is read from the dump before analysis and released afterwards.
void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
                                     std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
                                     std::vector<std::vector<parameter_t>> *const parameters,
                                     std::vector<int32_t> *const error_codes,
                                     const std::vector<std::string> &op_overflows,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
                                     const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
                                     std::vector<unsigned int> *root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  if (watchpoint_table.empty()) return;
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  for (auto &tensor : *tensor_list) {
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, async_file_pool, &result_list);
    // ReadDumpedTensor may add nothing (sync mode with no matching file); do not index an empty vector.
    if (result_list.empty()) {
      tensor.reset();
      continue;
    }
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      tensor.reset();
      continue;
    }
#endif
    const auto tensor_name = tensor->GetName();
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    // no elements to analyze
    if (tensor->GetByteSize() == 0) continue;
    int tensor_dtype = tensor->GetType();
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    // Add do nothing line in case offline debug is off, prevent unused var warning
    (void)previous_iter_tensor_needed;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
                          &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) continue;
    uint32_t num_elements = tensor->GetNumElements();
#ifdef OFFLINE_DBG_MODE
    void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
#else
    void *previous_tensor_ptr =
      tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // a lone IS_OVERFLOW watchpoint is decided from op_overflows only, so no summary is computed for it
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<0>(item);
        error_code = std::get<1>(item);
        parameter_list = std::get<2>(item);
      }
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        // if the execution order is repeated, inserts the new one before the others with same execution order.
        std::vector<int>::iterator iter =
          std::lower_bound(exec_order.begin(), exec_order.end(), tensor->GetExecutionOrder());
        const auto position = iter - exec_order.begin();
        exec_order.insert(iter, tensor->GetExecutionOrder());
        name->insert(name->begin() + position, qualified_tensor_name);
        slot->insert(slot->begin() + position, tensor_slot);
        condition->insert(condition->begin() + position, wp.condition.type);
        watchpoint_id->insert(watchpoint_id->begin() + position, wp.id);
        if (device_id != nullptr) {
          device_id->insert(device_id->begin() + position, tensor->GetDeviceId());
        }
        if (root_graph_id != nullptr) {
          root_graph_id->insert(root_graph_id->begin() + position, tensor->GetRootGraphId());
        }
        parameters->insert(parameters->begin() + position, parameter_list);
        error_codes->insert(error_codes->begin() + position, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}

#ifdef OFFLINE_DBG_MODE
// Parses the slot number that follows dump_name in file_name (up to the next '_') and appends it
// to slot_list. specific_dump_dir is not used.
void DebugServices::GetSlotInfo(const std::string &file_name, const std::string &dump_name,
                                const std::string &specific_dump_dir, std::vector<size_t> *slot_list) {
  // get the slot from the name
  const std::string delimiter = "_";
  const std::size_t start_pos = dump_name.length();
  const std::size_t end_pos = file_name.find(delimiter, start_pos);
  // when no delimiter follows, end_pos is npos and substr takes the rest of the string
  std::string item = file_name.substr(start_pos, end_pos - start_pos);
  slot_list->push_back(std::stoul(item));
}

// Reads an .npy file: extracts the two-character type code and the shape from the header and loads the
// raw tensor bytes into a newly allocated buffer returned through *data_buffer. *size receives the
// data size in bytes. On open/read/header failure the function logs an error and returns without
// touching the output parameters.
// NOTE(review): *data_buffer is allocated with `new` and ownership is handed to the caller; nothing in
// this file frees it.
void DebugServices::ReadTensorFromNpy(const std::string &file_name, std::string *tensor_type, std::size_t *size,
                                      std::vector<int64_t> *shape, std::vector<char> **data_buffer) {
  std::ifstream infile;
  std::string file_path = file_name;
  MS_LOG(INFO) << "Reading in file: " << file_path;
  infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  if (!infile.is_open()) {
    MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path;
    return;
  }
  uint64_t file_size = infile.tellg();
  infile.seekg(0, std::ios::beg);
  std::unique_ptr<std::vector<char>> buffer(new std::vector<char>(file_size));
  if (!infile.read(buffer->data(), file_size)) {
    MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path;
    return;
  }
  // the header length field sits at byte offset 8; make sure it and the header it describes are in the file
  const uint64_t kHeaderLenOffset = 8;
  const uint64_t kDataOffset = 10;
  if (file_size < kDataOffset) {
    MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path;
    return;
  }
  uint16_t header_len = *reinterpret_cast<uint16_t *>(buffer->data() + kHeaderLenOffset);
  if (static_cast<uint64_t>(header_len) + kDataOffset > file_size) {
    MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path;
    return;
  }
  std::string header(buffer->data() + 9, header_len);
  std::size_t type_i = header.find("descr") + 10;
  *tensor_type = header.substr(type_i, 2);
  std::size_t shape_i_open = header.find("(");
  std::size_t shape_i_close = header.find(")");
  std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  std::string intermediate;
  std::stringstream check_shape(shape_str);
  MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  while (getline(check_shape, intermediate, ',')) {
    shape->push_back(std::stoi(intermediate));
  }
  // second character of the type code is the element size in bytes
  std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  // use a 64-bit initial value so the product is not accumulated in int
  std::size_t data_len = std::accumulate(shape->begin(), shape->end(), uint64_t{1}, std::multiplies<uint64_t>());
  std::size_t data_size = data_len * word_size;
  infile.seekg(header_len + 10);
  *data_buffer = new std::vector<char>(data_size);
  if (!infile.read((*data_buffer)->data(), data_size)) {
    MS_LOG(ERROR) << "Unable to get tensor data from npy";
  }
  *size = data_size;
}

// For every directory in dir_to_files_map, runs convert_async.py on the files that do not already have
// a converted counterpart in result_list, then scans the directory and appends the paths of the
// converted (npy) files to result_list.
void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                                        std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &file_name : d.second) {
      bool already_converted = false;
      for (std::string &file_found : *result_list) {
        if (file_found.find(file_name) != std::string::npos) {
          already_converted = true;
        }
      }
      if (!already_converted) {
        files_to_convert_in_dir.push_back(dump_key + "/" + file_name);
      }
    }
    // the conversion script is looked up next to this source file
    std::string current_working_dir(__FILE__);
    std::size_t pos = current_working_dir.find_last_of("\\/");
    current_working_dir = (std::string::npos == pos) ? "" : current_working_dir.substr(0, pos);
    MS_LOG(INFO) << current_working_dir;
    std::ostringstream input_file_o;
    const char *const delim = " ";
    std::copy(files_to_convert_in_dir.begin(), files_to_convert_in_dir.end(),
              std::ostream_iterator<std::string>(input_file_o, delim));
    std::string input_files = input_file_o.str();
    MS_LOG(INFO) << "Ops to convert: " << input_files;
    if (input_files != "") {
      // NOTE(review): the command line is assembled from directory and file names without quoting or
      // escaping and passed to the shell; confirm these names can never be attacker-controlled.
      std::string convert_command = "python " + current_working_dir + "/convert_async.py -out " + dump_key + " -t " +
                                    file_format + " -d " + dump_key + " -f NCHW -l " + input_files;
      (void)(system(convert_command.c_str()) + 1);
      DIR *d_handle = opendir(dump_key.c_str());
      if (d_handle != nullptr) {
        struct dirent *dir = nullptr;
        while ((dir = readdir(d_handle)) != nullptr) {
          if (dir->d_type == DT_REG) {
            std::string candidate = dir->d_name;
            for (const std::string &file_to_find : files_to_convert_in_dir) {
              std::string file_n = file_to_find.substr(file_to_find.find_last_of("\\/") + 1);
              if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
                // we found a converted file for this op
                result_list->push_back(dump_key + "/" + candidate);
              }
            }
          }
        }
        // release the directory handle (it was previously never closed)
        closedir(d_handle);
      }
    }
  }
}

// Async mode: for each requested tensor, finds its dump files in the iteration directory. Files already
// in npy format are appended to result_list directly; the others are converted via ConvertToHostFormat.
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id, std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    const std::string strsrc = "/";
    std::string strdst = "_";
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();

    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);

    while ((pos = dump_style_kernel_name.find(strsrc, pos)) != std::string::npos) {
      dump_style_kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }

    std::string prefix_dump_file_name = dump_style_kernel_name;

    std::string specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id[i]) + "/" + net_name + "_graph_" +
                                    std::to_string(root_graph_id[i]) + "/" + std::to_string(root_graph_id[i]) + "/" +
                                    std::to_string(iteration[i]);

    // search files in dir for the one that meets the filename prefix and read the file into memory
    DIR *d = opendir(specific_dump_dir.c_str());
    if (d != nullptr) {
      struct dirent *dir = nullptr;
      while ((dir = readdir(d)) != nullptr) {
        if (dir->d_type == DT_REG) {
          std::string file_name = dir->d_name;
          std::string file_name_w_o_prefix = file_name.substr(file_name.find('.') + 1);
          const bool matches_prefix = file_name_w_o_prefix.rfind(prefix_dump_file_name, 0) == 0;
          const bool is_host_format = file_name.rfind(file_format) != std::string::npos;
          if (matches_prefix && !is_host_format) {
            // if file matches prefix and is in device format add to candidate files to convert.
            dir_to_files_map[specific_dump_dir].push_back(file_name);
          } else if (matches_prefix && is_host_format) {
            // otherwise, if file matches prefix and already has been converted to host format
            // add to result of converted files.
            result_list->push_back(specific_dump_dir + "/" + file_name);
          }
        }
      }
      // only close a handle that was actually opened
      closedir(d);
    }
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}

// Async mode: for each watched node in proto_dump, finds its dump files in specific_dump_dir. Files
// already in npy format are appended to result_list; the others are converted via ConvertToHostFormat.
void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir,
                                           std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (const auto &node : proto_dump) {
    std::string dump_name = std::get<1>(node);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    DIR *d = opendir(specific_dump_dir.c_str());
    if (d != nullptr) {
      struct dirent *dir = nullptr;
      while ((dir = readdir(d)) != nullptr) {
        if (dir->d_type == DT_REG) {
          std::string file_name = dir->d_name;
          std::string file_name_w_o_prefix = file_name.substr(file_name.find('.') + 1);
          const bool matches_prefix = file_name_w_o_prefix.rfind(dump_name, 0) == 0;
          const bool is_host_format = file_name.rfind(file_format) != std::string::npos;
          if (matches_prefix && !is_host_format) {
            // if file matches prefix and is in device format add to candidate files to convert.
            dir_to_files_map[specific_dump_dir].push_back(file_name);
          } else if (matches_prefix && is_host_format) {
            // otherwise, if file matches prefix and already has been converted to host format
            // add to result of converted files.
            result_list->push_back(specific_dump_dir + "/" + file_name);
          }
        }
      }
      // only close a handle that was actually opened
      closedir(d);
    }
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}

// Async mode: for each node in proto_dump, derives the output slots from the names in async_file_pool
// (the number between the two dots following "output") and appends one empty placeholder TensorData
// per slot to tensor_list.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           uint32_t iteration, uint32_t device_id, uint32_t root_graph_id,
                                           const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    for (const std::string &file_name : async_file_pool) {
      std::string dump_name = std::get<1>(node);
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find("output");
      if (found == std::string::npos || found_out == std::string::npos) {
        continue;
      }
      std::size_t found_dot_start = file_name.find(".", found_out);
      if (found_dot_start == std::string::npos) {
        // no slot field after "output"
        continue;
      }
      // search for the closing dot strictly after the opening one
      std::size_t found_dot_end = file_name.find(".", found_dot_start + 1);
      slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(NULL);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_list->push_back(tensor_data);
    }
  }
}

// Checks that *file_name starts with prefix_dump_file_name; if not, returns a non-zero value.
// Otherwise parses the '_'-separated numbers following "_shape_" into *shape, stores the first
// non-numeric token in *type_name and returns 0. specific_dump_dir, slot and out_dir are not used.
std::size_t DebugServices::GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
                                            const std::string &prefix_dump_file_name, std::string *file_name,
                                            std::string *type_name, std::string *out_dir,
                                            std::vector<int64_t> *shape) {
  std::size_t found = 0;
  found = file_name->rfind(prefix_dump_file_name, 0);
  if (found != 0) {
    return found;
  }
  // found a file, now get the shape and type
  // find "_shape_" in the filename
  std::string shape_delimiter = "_shape_";
  // NOTE(review): when "_shape_" is absent, find() returns npos and this sum wraps around to a small
  // offset, exactly as in the original code; confirm whether such names can occur.
  std::size_t str_pos = file_name->find(shape_delimiter) + shape_delimiter.length();

  // read numbers with '_' delimiter until you read a non-number, that will be the type name
  bool number_found = true;
  std::string delimiter = "_";
  while (number_found) {
    std::size_t end_pos = file_name->find(delimiter, str_pos);
    std::string item = file_name->substr(str_pos, end_pos - str_pos);
    bool is_number = !item.empty() && std::find_if(item.begin(), item.end(),
                                                   [](unsigned char c) { return !std::isdigit(c); }) == item.end();

    if (is_number) {
      shape->push_back(std::stoul(item));
      if (end_pos == std::string::npos) {
        // the name ended on a number; stop instead of wrapping around to the start of the name
        number_found = false;
      } else {
        str_pos = end_pos + 1;
      }
    } else {
      *type_name = item;
      number_found = false;
    }
  }

  return 0;
}

// Builds a TensorData from the given metadata and buffer, stores it in the tensor loader when it has
// data (data_size != 0) and appends it to result_list.
// NOTE(review): the TensorData only receives buffer->data(); the vector itself is not owned by anything
// visible in this file.
void DebugServices::AddToTensorData(const std::string &backend_name, const std::size_t slot,
                                    const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer, std::vector<std::shared_ptr<TensorData>> *result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  if (data_size) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(NULL);
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  if (data_size) {
    tensor_loader_->LoadNewTensor(tensor_data, false);
  }

  // add to result_list
  result_list->push_back(tensor_data);
}

// Reads the dumped data for each requested tensor and appends the resulting TensorData to result_list.
// Sync mode: scans the iteration directory for files matching the tensor's name prefix and slot.
// Async mode: looks the tensor up in async_file_pool (npy files); if none matches, an empty TensorData
// is appended instead.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *result_list) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    const std::string strsrc = "/";

    std::string strdst;
    if (is_sync_mode) {
      strdst = "--";
    } else {
      strdst = "_";
    }

    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();

    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);

    while ((pos = dump_style_kernel_name.find(strsrc, pos)) != std::string::npos) {
      dump_style_kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }

    std::string prefix_dump_file_name = dump_style_kernel_name;
    if (is_sync_mode) {
      prefix_dump_file_name += "_output_" + std::to_string(slot[i]) + "_";
    }

    std::string specific_dump_dir =
      dump_dir + "/device_" + std::to_string(device_id[i]) + "/iteration_" + std::to_string(iteration[i]);

    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::vector<char> *buffer = NULL;
    std::string type_name = "";
    std::vector<int64_t> shape;
    uint64_t data_size = 0;
    if (is_sync_mode) {
      DIR *d = opendir(specific_dump_dir.c_str());
      if (d != nullptr) {
        struct dirent *dir = nullptr;
        while ((dir = readdir(d)) != nullptr) {
          if (dir->d_type == DT_REG) {
            std::string file_name = dir->d_name;
            std::string out_dir;
            // start from an empty shape for every candidate file, as the async branch does
            shape.clear();
            std::size_t found = GetShapeTypeInfo(specific_dump_dir, slot[i], prefix_dump_file_name, &file_name,
                                                 &type_name, &out_dir, &shape);
            if (found != 0) {
              continue;
            }

            // read the tensor data from the file
            std::string file_path = specific_dump_dir + "/" + file_name;

            std::ifstream infile;
            infile.open(file_path.c_str(), std::ios::binary | std::ios::ate);
            if (!infile.is_open()) {
              MS_LOG(ERROR) << "Failed to open bin file " << file_name;
              break;
            }
            uint64_t file_size = infile.tellg();
            infile.seekg(0, std::ios::beg);
            buffer = new std::vector<char>(file_size);
            if (!infile.read(buffer->data(), file_size)) {
              MS_LOG(ERROR) << "Failed to read in bin file " << file_name;
              // the buffer was never handed to a TensorData, so release it here
              delete buffer;
              buffer = nullptr;
              break;
            }
            data_size = file_size;
            infile.close();
            AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], data_size,
                            type_name, shape, buffer, result_list);
          }
        }
        // only close a handle that was actually opened
        closedir(d);
      } else {
        MS_LOG(INFO) << "directory does not exist!";
      }
    } else {
      bool found = false;
      // if async mode
      for (const std::string &file_path : async_file_pool) {
        if (file_path.find(prefix_dump_file_name) != std::string::npos &&
            file_path.find(".output." + std::to_string(slot[i])) != std::string::npos) {
          found = true;
          shape.clear();
          ReadTensorFromNpy(file_path, &type_name, &data_size, &shape, &buffer);
          AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], data_size, type_name,
                          shape, buffer, result_list);
        }
      }
      // If no npy file is found, add empty tensor data.
      if (!found) {
        AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], 0, type_name, shape,
                        buffer, result_list);
      }
    }
  }
}

// Replaces every '/' in *dump_style_name with "--" (sync mode) or "_" (async mode).
void ReplaceSrcFileName(const bool is_sync_mode, std::string *dump_style_name) {
  const std::string strsrc = "/";
  std::string strdst;
  if (is_sync_mode) {
    strdst = "--";
  } else {
    strdst = "_";
  }
  std::string::size_type pos = 0;
  std::string::size_type srclen = strsrc.size();
  std::string::size_type dstlen = strdst.size();

  while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
    dump_style_name->replace(pos, srclen, strdst);
    pos += dstlen;
  }
}

// Builds the list of tensors that the current watchpoints need for the given iteration.
// The returned TensorData entries are placeholders (no data); the data is read later when needed.
// In async mode the matching dump files are converted to npy and their paths are added to
// *async_file_pool.
std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  unsigned int iteration, std::vector<std::string> *async_file_pool) {
  // get a list of nodes and the devices they are on to monitor
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  std::map<std::tuple<uint32_t, uint32_t>, std::unordered_set<std::string>> device_and_graph_to_nodes;
  for (const auto &w_table_item : watchpoint_table) {
    const auto &wp = w_table_item.second;
    // The index must advance with the check node; it was previously re-declared inside the loop and
    // therefore always 0.
    // NOTE(review): assumes check_node_device_list / check_node_graph_list are parallel to
    // check_node_list - confirm against the callers of AddWatchpoint.
    for (std::size_t index = 0; index < wp.check_node_list.size(); ++index) {
      if (index >= wp.check_node_device_list.size() || index >= wp.check_node_graph_list.size()) {
        // no device/graph information for the remaining nodes
        break;
      }
      const auto &check_node = wp.check_node_list[index];
      std::string w_name = std::get<0>(check_node);
      bool w_is_param = std::get<1>(check_node);

      std::string node_name = w_name;
      if (w_is_param) {
        std::size_t found = node_name.find_last_of("/");
        node_name = node_name.substr(found + 1);
      }

      const std::vector<uint32_t> &devices = std::get<1>(wp.check_node_device_list[index]);
      const std::vector<uint32_t> &graphs = std::get<1>(wp.check_node_graph_list[index]);
      for (auto device : devices) {
        for (auto graph : graphs) {
          std::tuple<uint32_t, uint32_t> key(device, graph);
          device_and_graph_to_nodes[key].insert(node_name);
        }
      }
    }
  }

  // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  // as they are found
  for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
    std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
    uint32_t device_id = std::get<0>(device_and_graph);
    uint32_t root_graph_id = std::get<1>(device_and_graph);
    const std::unordered_set<std::string> &wp_nodes = device_and_graph_item.second;
    std::vector<std::tuple<std::string, std::string>> proto_to_dump;

    std::string specific_dump_dir;
    if (is_sync_mode) {
      specific_dump_dir =
        dump_dir + "/device_" + std::to_string(device_id) + "/iteration_" + std::to_string(iteration);
    } else {
      specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/" + net_name + "_graph_" +
                          std::to_string(root_graph_id) + "/" + std::to_string(root_graph_id) + "/" +
                          std::to_string(iteration);
    }

    // convert node names to dump style
    for (const auto &node : wp_nodes) {
      std::string orig_name = node;
      std::string dump_style_name = node;
      ReplaceSrcFileName(is_sync_mode, &dump_style_name);

      if (is_sync_mode) {
        dump_style_name.append("_output_");
      }

      proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
    }

    if (!is_sync_mode) {
      // convert all files in proto_to_dump to npy and add to pool of async file names
      ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
    }
    if (is_sync_mode) {
      // search files in dir for the one that meets the filename prefix and read the file into memory
      DIR *d = opendir(specific_dump_dir.c_str());
      if (d != nullptr) {
        struct dirent *dir = nullptr;
        while ((dir = readdir(d)) != nullptr) {
          if (dir->d_type == DT_REG) {
            std::string file_name = dir->d_name;
            for (auto &node : proto_to_dump) {
              std::string dump_name = std::get<1>(node);
              std::size_t found = 0;

              found = file_name.rfind(dump_name, 0);
              if (found == 0) {
                std::vector<size_t> slot_list;
                GetSlotInfo(file_name, dump_name, specific_dump_dir, &slot_list);
                for (auto slot : slot_list) {
                  // add a TensorData entry (data will be read when needed)
                  std::vector<int64_t> shape;
                  std::string orig_name = std::get<0>(node);
                  auto tensor_data = std::make_shared<TensorData>();
                  tensor_data->SetName(orig_name);
                  tensor_data->SetExecutionOrder(0);
                  tensor_data->SetSlot(slot);
                  tensor_data->SetIteration(iteration);
                  tensor_data->SetDeviceId(device_id);
                  tensor_data->SetRootGraphId(root_graph_id);
                  tensor_data->SetDataPtr(NULL);
                  tensor_data->SetByteSize(0);
                  tensor_data->SetType("");
                  tensor_data->SetShape(shape);

                  tensor_list.push_back(tensor_data);
                }
                break;
              }
            }
          }
        }
        // release the directory handle (it was previously never closed)
        closedir(d);
      }
    } else {
      GetTensorDataInfoAsync(proto_to_dump, iteration, device_id, root_graph_id, *async_file_pool, &tensor_list);
    }
  }

  return tensor_list;
}
#endif

// Looks up the named tensors in the tensor loader and, for each one found, appends its name, data
// pointer, byte size, type and shape to the corresponding output vector.
void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
                                     std::vector<char *> *const data_ptr, std::vector<ssize_t> *const data_size,
                                     std::vector<unsigned int> *const dtype,
                                     std::vector<std::vector<int64_t>> *const shape) {
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  tensor_loader_->SearchTensors(name, &result_list);

  for (const auto &result : result_list) {
    // entries without a tensor were not found by the loader
    if (!std::get<1>(result)) {
      continue;
    }
    ret_name->push_back(std::get<0>(result));
    data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
    data_size->push_back(std::get<1>(result)->GetByteSize());
    dtype->push_back(std::get<1>(result)->GetType());
    shape->push_back(std::get<1>(result)->GetShape());
  }
}

#ifdef ONLINE_DBG_MODE
// Returns true if any watchpoint watches the given kernel:
//  - check nodes flagged true match when kernel_name starts with the watched name, or the name is "*";
//  - check nodes flagged false match on an exact name or when the watched name is an input of kernel.
bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  for (const auto &w_table_item : watchpoint_table) {
    const auto &check_node_list = w_table_item.second.check_node_list;
    for (const auto &check_node : check_node_list) {
      const std::string &w_name = std::get<0>(check_node);
      const bool w_type = std::get<1>(check_node);
      if (w_type) {
        // rfind(w_name, 0) == 0 is a "starts with" test (and implies find() != npos)
        if (kernel_name.rfind(w_name, 0) == 0 || w_name == "*") {
          return true;
        }
      } else if (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)) {
        return true;
      }
    }
  }
  return false;
}

// Returns true if the last path component of w_name (the text after the final '/') equals the
// fullname_with_scope of one of kernel's inputs. Returns false for a null kernel or when w_name
// contains no '/'.
bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  if (!kernel) {
    return false;
  }
  auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  // the watched name does not change inside the loop, so locate the separator once
  const auto found = w_name.find_last_of('/');
  const bool has_separator = (found != std::string::npos);
  const std::string w_short_name = has_separator ? w_name.substr(found + 1) : std::string();
  for (size_t j = 0; j < input_size; ++j) {
    // input(0) is skipped: inputs are read starting from index 1
    auto input_kernel = kernel->input(j + 1);
    std::string input_kernel_name = input_kernel->fullname_with_scope();
    if (has_separator && w_short_name == input_kernel_name) return true;
  }
  return false;
}
#endif

// The functions below forward to the tensor loader.
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }

std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }

std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensorMap(const std::string &node_name) const {
  return tensor_loader_->GetNodeTensorMap(node_name);
}

uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }

void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }

void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }

void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }

#ifdef ONLINE_DBG_MODE
// Forwards to TensorLoader::DumpTensorToFile and returns its result.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
#endif

// Forwards to TensorLoader::LoadNewTensor and returns its result.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}

// Returns a copy of the watchpoint table.
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table;
}

// Clears the analyzed-watchpoint cache and rotates the loader's tensor maps for the next step.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
}

#ifdef ONLINE_DBG_MODE
// Returns the loaded output tensors of the kernel, looked up by "<kernel name>:<output index>".
// Outputs that are not loaded are skipped.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  std::vector<std::shared_ptr<TensorData>> result;
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  auto kernel_name = kernel->fullname_with_scope();
  for (size_t j = 0; j < output_size; ++j) {
    auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
    auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
    if (tensor) result.push_back(tensor);
  }
  return result;
}
#endif

bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}

void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}

// Simple accessors for the dump configuration.
void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }

std::string DebugServices::GetNetName() { return net_name; }

void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }

std::string DebugServices::GetDumpDir() { return dump_dir; }

void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }

bool DebugServices::GetSyncMode() { return is_sync_mode; }

#ifdef ONLINE_DBG_MODE
}  // namespace mindspore
#endif