Other Authors: Harshvardhan Gupta, Li Chen (tags/v0.7.0-beta)
| @@ -209,4 +209,24 @@ void DataDumpParser::CheckOpDebugMode(uint32_t op_debug_mode) const { | |||
| MS_LOG(EXCEPTION) << "[DataDump] op_debug_mode in config json file should be [0-3]"; | |||
| } | |||
| } | |||
| std::string DataDumpParser::GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const { | |||
| std::string bin_path = "/var/log/npu/ide_daemon/dump"; | |||
| const char *dump_data_path = std::getenv("DATA_DUMP_PATH"); | |||
| bin_path.append(dump_data_path); | |||
| bin_path.append("_"); | |||
| bin_path.append(std::to_string(device_id)); | |||
| bin_path.append("/"); | |||
| bin_path.append(net_name_); | |||
| bin_path.append("_"); | |||
| bin_path.append(std::to_string(graph_id)); | |||
| bin_path.append("/"); | |||
| bin_path.append(std::to_string(dump_mode_)); | |||
| bin_path.append("/"); | |||
| bin_path.append(std::to_string(dump_step_)); | |||
| bin_path.append("/"); | |||
| return bin_path; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -42,6 +42,7 @@ class DataDumpParser { | |||
| uint32_t dump_step() const { return dump_step_; } | |||
| void MatchKernel(const std::string &kernel_name); | |||
| void PrintUnusedKernel(); | |||
| std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const; | |||
| private: | |||
| DataDumpParser() = default; | |||
| @@ -50,6 +50,8 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, | |||
| } else if (watch_condition == 1) { | |||
| watchpoint_item.conditions.inf.enabled = true; | |||
| watchpoint_item.conditions.neg_inf.enabled = true; | |||
| } else if (watch_condition == 2) { | |||
| watchpoint_item.conditions.overflow.enabled = true; | |||
| } | |||
| watchpoint_item.check_node_list = check_node_list; | |||
| @@ -63,8 +65,8 @@ void DebugServices::RemoveWatchpoint(unsigned int id) { | |||
| } | |||
| void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, | |||
| std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size, | |||
| std::vector<int> *condition, std::vector<unsigned int> *wacthpoint_id) { | |||
| std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id, | |||
| const std::vector<std::string> &op_overflows) { | |||
| std::lock_guard<std::mutex> lg(lock_); | |||
| std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor(); | |||
| @@ -74,6 +76,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector | |||
| for (std::size_t i = 0; i < tensor_list.size(); i++) { | |||
| current_tensor_name = tensor_list[i]->GetName(); | |||
| std::string tensor_slot = std::to_string(tensor_list[i]->GetSlot()); | |||
| mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor(); | |||
| int tensor_data_type = tensor_ptr->data_type_c(); | |||
| @@ -106,10 +109,23 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector | |||
| } | |||
| } | |||
| } | |||
| std::vector<unsigned int> hit_encountered; | |||
| // check if no watchpoints are valid for the current tensor | |||
| if (watchpoints_to_check_table.empty()) { | |||
| continue; | |||
| // handle watchpoint conditions that do not require per element checks | |||
| for (auto it_w_table_check = watchpoints_to_check_table.begin(); | |||
| it_w_table_check != watchpoints_to_check_table.end(); ++it_w_table_check) { | |||
| if (it_w_table_check->second.conditions.overflow.enabled) { | |||
| std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":")); | |||
| if (std::find(op_overflows.begin(), op_overflows.end(), name_no_slot) != op_overflows.end()) { | |||
| hit_encountered.push_back(it_w_table_check->second.id); | |||
| } | |||
| } | |||
| } | |||
| if (hit_encountered.size()) { | |||
| HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name, | |||
| &watchpoints_to_check_table, tensor_slot); | |||
| hit_encountered.clear(); | |||
| } | |||
| // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan | |||
| @@ -117,11 +133,14 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector | |||
| continue; | |||
| } | |||
| // check if no watchpoints are remaining | |||
| if (watchpoints_to_check_table.empty()) { | |||
| continue; | |||
| } | |||
| float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c()); | |||
| unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float); | |||
| std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check; | |||
| std::vector<unsigned int> hit_encountered; | |||
| for (unsigned int index = 0; index < num_elements; index++) { | |||
| float x = start_addr[index]; | |||
| @@ -134,33 +153,12 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector | |||
| } else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) { | |||
| hit_encountered.push_back(it_w_table_check->second.id); | |||
| } | |||
| ++it_w_table_check; | |||
| } | |||
| if (hit_encountered.size()) { | |||
| for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) { | |||
| std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":")); | |||
| name->push_back(name_no_slot); | |||
| slot->push_back(std::to_string(tensor_list[i]->GetSlot())); | |||
| data_ptr->push_back(reinterpret_cast<char *>(tensor_ptr->data_c())); | |||
| data_size->push_back(tensor_ptr->data().nbytes()); | |||
| int condition_item = -1; | |||
| if (watchpoint_table[*it_hit_id].conditions.nan.enabled) { | |||
| condition_item = 0; | |||
| } else if (watchpoint_table[*it_hit_id].conditions.inf.enabled || | |||
| watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) { | |||
| condition_item = 1; | |||
| } | |||
| condition->push_back(condition_item); | |||
| wacthpoint_id->push_back(*it_hit_id); | |||
| watchpoints_to_check_table.erase(*it_hit_id); | |||
| } | |||
| HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name, | |||
| &watchpoints_to_check_table, tensor_slot); | |||
| hit_encountered.clear(); | |||
| } | |||
| @@ -171,6 +169,34 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector | |||
| } | |||
| } | |||
| void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered, | |||
| std::vector<std::string> *name, std::vector<std::string> *slot, | |||
| std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id, | |||
| std::string current_tensor_name, | |||
| std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table, | |||
| std::string tensor_slot) { | |||
| for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) { | |||
| if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) { | |||
| std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":")); | |||
| name->push_back(name_no_slot); | |||
| slot->push_back(tensor_slot); | |||
| int condition_item = -1; | |||
| if (watchpoint_table[*it_hit_id].conditions.nan.enabled) { | |||
| condition_item = 0; | |||
| } else if (watchpoint_table[*it_hit_id].conditions.inf.enabled || | |||
| watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) { | |||
| condition_item = 1; | |||
| } else if (watchpoint_table[*it_hit_id].conditions.overflow.enabled) { | |||
| condition_item = 2; | |||
| } | |||
| condition->push_back(condition_item); | |||
| watchpoint_id->push_back(*it_hit_id); | |||
| } | |||
| watchpoints_to_check_table->erase(*it_hit_id); | |||
| } | |||
| } | |||
| void DebugServices::CheckSingleWatchpoint(std::shared_ptr<TensorData> watchtensor, std::string *name, std::string *slot, | |||
| char **data_ptr, unsigned int *data_size, int *condition, | |||
| unsigned int *wacthpoint_id) { | |||
| @@ -51,6 +51,7 @@ class DebugServices { | |||
| condition_no_param_t inf; | |||
| condition_no_param_t neg_inf; | |||
| condition_no_param_t nan; | |||
| condition_no_param_t overflow; | |||
| condition_with_param_t max_below; | |||
| condition_with_param_t max_above; | |||
| condition_with_param_t min_below; | |||
| @@ -74,9 +75,8 @@ class DebugServices { | |||
| void RemoveWatchpoint(unsigned int id); | |||
| void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<char *> *data_ptr, | |||
| std::vector<unsigned int> *data_size, std::vector<int> *condition, | |||
| std::vector<unsigned int> *wacthpoint_id); | |||
| void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition, | |||
| std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows); | |||
| void CheckSingleWatchpoint(std::shared_ptr<TensorData> watchnode, std::string *name, std::string *slot, | |||
| char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id); | |||
| @@ -97,6 +97,12 @@ class DebugServices { | |||
| std::unordered_map<unsigned int, watchpoint_t> watchpoint_table; | |||
| TensorLoader *tensor_loader_; | |||
| void HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered, std::vector<std::string> *name, | |||
| std::vector<std::string> *slot, std::vector<int> *condition, | |||
| std::vector<unsigned int> *watchpoint_id, std::string current_tensor_name, | |||
| std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table, | |||
| std::string tensor_slot); | |||
| }; | |||
| } // namespace mindspore | |||
| @@ -79,8 +79,16 @@ message WatchCondition { | |||
| enum Condition { | |||
| nan = 0; | |||
| inf = 1; | |||
| overflow = 2; | |||
| ge = 3; // greater than and equal to | |||
| gt = 4; // greater than | |||
| le = 5; // less than and equal to | |||
| lt = 6; // less than | |||
| between = 7; // between | |||
| } | |||
| Condition condition = 1; | |||
| repeated float value = 2; // for between condition, there will be two values | |||
| repeated bool include = 3; // for between condition, define the value is included or not | |||
| } | |||
| message WatchNode { | |||
| @@ -14,11 +14,18 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include <dirent.h> | |||
| #include <stdio.h> | |||
| #include <fstream> | |||
| #include <tuple> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <iostream> | |||
| #include <cstring> | |||
| #include <utility> | |||
| #include <map> | |||
| #include "debug/debugger/debugger.h" | |||
| #include "debug/data_dump_parser.h" | |||
| #include "pipeline/jit/pipeline.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| @@ -49,7 +56,9 @@ Debugger::Debugger() | |||
| node_name_(""), | |||
| cur_name_(""), | |||
| is_dataset_graph_(false), | |||
| partial_memory_(false) {} | |||
| partial_memory_(false), | |||
| last_overflow_bin_(0), | |||
| overflow_bin_path_("") {} | |||
| void Debugger::Init(const uint32_t device_id, const std::string device_target) { | |||
| // access lock for public method | |||
| @@ -133,6 +142,35 @@ void Debugger::EnableDebugger() { | |||
| "usage for large models."; | |||
| } | |||
| if (device_target_ == kAscendDevice) { | |||
| // set operation overflow info | |||
| overflow_bin_path_ = DataDumpParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_); | |||
| // new overflow dump files will have a timestamp greater than last_overflow_bin_ | |||
| last_overflow_bin_ = 0; | |||
| DIR *d; | |||
| d = opendir(overflow_bin_path_.c_str()); | |||
| if (d) { | |||
| struct dirent *dir; | |||
| while ((dir = readdir(d)) != NULL) { | |||
| if (dir->d_type == DT_REG) { | |||
| std::string file_path = overflow_bin_path_; | |||
| file_path.append(dir->d_name); | |||
| std::size_t found = file_path.find_last_of("."); | |||
| if (found == std::string::npos) { | |||
| continue; | |||
| } | |||
| std::string overflow_time = file_path.substr(found + 1); | |||
| if (stod(overflow_time) <= last_overflow_bin_) { | |||
| MS_LOG(INFO) << "Old op overflow bin folder" << file_path; | |||
| continue; | |||
| } | |||
| last_overflow_bin_ = stod(overflow_time); | |||
| } | |||
| } | |||
| MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_; | |||
| } | |||
| } | |||
| // initialize grpc client | |||
| if (debugger_enabled_) { | |||
| grpc_client_ = std::make_unique<GrpcClient>(host, port); | |||
| @@ -154,6 +192,9 @@ void Debugger::Reset() { | |||
| graph_ptr_ = nullptr; | |||
| grpc_client_ = nullptr; | |||
| debug_services_ = nullptr; | |||
| last_overflow_bin_ = 0; | |||
| overflow_bin_path_ = ""; | |||
| stream_task_to_opname_.clear(); | |||
| } | |||
| void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) { | |||
| @@ -200,6 +241,7 @@ void Debugger::PostExecuteNode() { | |||
| if (debugger_enabled_ && !is_dataset_graph_) { | |||
| auto watchpoint_table = debug_services_->GetWatchpointTable(); | |||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table); | |||
| // if kernel is watchpoint,and get hit. suspend. | |||
| if (is_watchpoint) { | |||
| auto hits = CheckSingleWatchpoint(cur_name_); | |||
| @@ -225,6 +267,10 @@ void Debugger::PostDebugOp() { | |||
| } | |||
| } | |||
| std::map<std::pair<uint32_t, uint32_t>, std::string> &Debugger::GetStreamTaskToOpnameMap() { | |||
| return stream_task_to_opname_; | |||
| } | |||
| void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) { | |||
| if (graph_ptr_ != graph_ptr) { | |||
| MS_LOG(INFO) << "Debugger got new graph: " << graph_ptr->graph_id(); | |||
| @@ -476,15 +522,15 @@ void Debugger::Exit() { | |||
| std::exit(EXIT_FAILURE); | |||
| } | |||
| std::list<WatchpointHit> Debugger::CheckWatchpoints() const { | |||
| std::list<WatchpointHit> Debugger::CheckWatchpoints() { | |||
| std::vector<std::string> name; | |||
| std::vector<std::string> slot; | |||
| std::vector<char *> data_ptr; | |||
| std::vector<unsigned int> data_size; | |||
| std::vector<int> condition; | |||
| std::vector<unsigned int> watchpoint_id; | |||
| std::vector<std::string> overflow_ops; | |||
| debug_services_->CheckWatchpoints(&name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id); | |||
| overflow_ops = CheckOpOverflow(); | |||
| debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops); | |||
| std::list<WatchpointHit> hits; | |||
| for (unsigned int i = 0; i < name.size(); i++) { | |||
| WatchpointHit hit; | |||
| @@ -658,4 +704,70 @@ void Debugger::SetStepNum(int32_t cur_num_step) { | |||
| int32_t Debugger::step_num() const { return num_step_; } | |||
// Decodes the leading bytes of `buffer` as a little-endian unsigned 64-bit integer.
//
// buffer[0] is the least significant byte and buffer[7] the most significant one; any bytes
// after the eighth are ignored. When fewer than eight bytes are supplied, the missing
// high-order bytes are treated as zero, so an empty buffer decodes to 0.
//
// @param buffer  raw bytes to decode.
// @return the decoded value.
uint64_t BytestoInt64(const std::vector<char> &buffer) {
  constexpr std::size_t kNumBytes = sizeof(uint64_t);
  const std::size_t available = buffer.size() < kNumBytes ? buffer.size() : kNumBytes;
  uint64_t ret = 0;
  for (std::size_t i = 0; i < available; ++i) {
    // Convert through unsigned char first: plain char may be signed, in which case a byte
    // >= 0x80 would be sign-extended and set every higher bit of the result. Widening to
    // uint64_t before shifting also keeps the shift out of (signed) int arithmetic.
    const uint64_t byte = static_cast<uint64_t>(static_cast<unsigned char>(buffer[i]));
    ret |= byte << (8 * i);
  }
  return ret;
}
| #define BUF_SIZ 256 | |||
| std::vector<std::string> Debugger::CheckOpOverflow() { | |||
| std::vector<double> bin_list; | |||
| std::vector<std::string> op_names; | |||
| DIR *d; | |||
| struct dirent *dir; | |||
| d = opendir(overflow_bin_path_.c_str()); | |||
| if (d) { | |||
| while ((dir = readdir(d)) != NULL) { | |||
| if (dir->d_type == DT_REG) { | |||
| std::string file_path = overflow_bin_path_; | |||
| file_path.append(dir->d_name); | |||
| std::string file_name = dir->d_name; | |||
| std::size_t found = file_name.find_last_of("."); | |||
| if (found == std::string::npos) { | |||
| continue; | |||
| } | |||
| std::string overflow_time = file_name.substr(found + 1); | |||
| if (stod(overflow_time) <= last_overflow_bin_) { | |||
| MS_LOG(INFO) << "File already processed " << file_name; | |||
| continue; | |||
| } | |||
| bin_list.push_back(stod(overflow_time)); | |||
| std::fstream infile; | |||
| infile.open(file_path.c_str(), std::ios::binary | std::ios::in); | |||
| infile.seekg(313, std::ios::beg); | |||
| std::vector<char> buffer; | |||
| buffer.resize(BUF_SIZ); | |||
| infile.read(buffer.data(), BUF_SIZ); | |||
| uint64_t stream_id = BytestoInt64(std::vector<char>(buffer.begin() + 8, buffer.end())); | |||
| uint64_t task_id = BytestoInt64(std::vector<char>(buffer.begin() + 16, buffer.end())); | |||
| MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << "."; | |||
| auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id)); | |||
| if (op != debugger_->stream_task_to_opname_.end()) { | |||
| MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl; | |||
| op_names.push_back(op->second); | |||
| } else { | |||
| MS_LOG(INFO) << "No overflow is detected " << std::endl; | |||
| } | |||
| infile.close(); | |||
| } | |||
| } | |||
| } else { | |||
| MS_LOG(INFO) << "OverFlow bin directory does not exist!"; | |||
| } | |||
| closedir(d); | |||
| MS_LOG(ERROR) << "These operation overflows are detected " << op_names; | |||
| for (auto &i : bin_list) { | |||
| if (i > last_overflow_bin_) { | |||
| last_overflow_bin_ = i; | |||
| } | |||
| } | |||
| return op_names; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -19,6 +19,9 @@ | |||
| #include <list> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <utility> | |||
| #include <vector> | |||
| #include <map> | |||
| #include "backend/session/kernel_graph.h" | |||
| #include "debug/debugger/grpc_client.h" | |||
| #include "debug/debug_services.h" | |||
| @@ -90,6 +93,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| int32_t step_num() const; | |||
| std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap(); | |||
| private: | |||
| // private constructor for singleton | |||
| Debugger(); | |||
| @@ -130,12 +135,15 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| // analyze tensors and check watchpoint conditions | |||
| // return names of tensors and what condition they hit | |||
| std::list<WatchpointHit> CheckWatchpoints() const; | |||
| std::list<WatchpointHit> CheckWatchpoints(); | |||
| std::list<WatchpointHit> CheckSingleWatchpoint(std::string watchnode) const; | |||
| // send watchpoints that hit and enter command wait loop | |||
| void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points); | |||
| // Find if any operation overflow happened and return their names | |||
| std::vector<std::string> CheckOpOverflow(); | |||
| // class members | |||
| std::unique_ptr<GrpcClient> grpc_client_; | |||
| std::unique_ptr<DebugServices> debug_services_; | |||
| @@ -150,7 +158,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||
| bool is_dataset_graph_; | |||
| bool partial_memory_; | |||
| std::mutex access_lock_; | |||
| std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname_; | |||
| double last_overflow_bin_; | |||
| std::string overflow_bin_path_; | |||
| // singleton | |||
| static std::mutex instance_lock_; | |||
| static std::shared_ptr<Debugger> debugger_; | |||
| @@ -180,5 +190,6 @@ ProtoVector<TensorProto> GetTensors(const EventReply &reply); | |||
| // get the full name of a tensor, which is the name used in TensorLoader | |||
| std::string GetTensorFullName(const TensorProto &tensor); | |||
| uint64_t BytestoInt64(const std::vector<char> &buffer); | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_ | |||
| @@ -27,6 +27,9 @@ | |||
| #include "proto/op_mapping_info.pb.h" | |||
| #include "utils/ms_context.h" | |||
| #include "debug/data_dump_parser.h" | |||
| #ifdef ENABLE_DEBUGGER | |||
| #include "debug/debugger/debugger.h" | |||
| #endif | |||
| static constexpr uint32_t kAicpuLoadFlag = 1; | |||
| static constexpr uint32_t kAicpuUnloadFlag = 0; | |||
| @@ -90,6 +93,18 @@ void DataDumper::LoadDumpInfo() { | |||
| load_flag_ = true; | |||
| // graph id may changed in Unload | |||
| graph_id_ = kernel_graph_->graph_id(); | |||
| #ifdef ENABLE_DEBUGGER | |||
| auto debugger = mindspore::Debugger::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(debugger); | |||
| std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap(); | |||
| // extract stream id, task id and opname from runtime_info_map for overflow detection | |||
| std::transform(runtime_info_map_.begin(), runtime_info_map_.end(), | |||
| std::inserter(stream_task_to_opname, stream_task_to_opname.end()), | |||
| [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p) | |||
| -> std::pair<std::pair<uint32_t, uint32_t>, std::string> { | |||
| return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first}; | |||
| }); | |||
| #endif | |||
| MS_LOG(INFO) << "[DataDump] LoadDumpInfo end"; | |||
| } | |||