| @@ -14,6 +14,8 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <map> | |||||
| #include "backend/session/anf_runtime_algorithm.h" | |||||
| #include "debug/debug_services.h" | #include "debug/debug_services.h" | ||||
| namespace mindspore { | namespace mindspore { | ||||
| @@ -39,17 +41,19 @@ DebugServices &DebugServices::operator=(const DebugServices &other) { | |||||
| DebugServices::~DebugServices() { delete tensor_loader_; } | DebugServices::~DebugServices() { delete tensor_loader_; } | ||||
| void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, | void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, | ||||
| const std::vector<std::tuple<std::string, bool>> &check_node_list) { | |||||
| const std::vector<std::tuple<std::string, bool>> &check_node_list, | |||||
| const std::vector<parameter_t> &parameter_list) { | |||||
| std::lock_guard<std::mutex> lg(lock_); | std::lock_guard<std::mutex> lg(lock_); | ||||
| watchpoint_t watchpoint_item; | watchpoint_t watchpoint_item; | ||||
| watchpoint_item.id = id; | watchpoint_item.id = id; | ||||
| watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition); | watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition); | ||||
| watchpoint_item.condition.parameter = parameter; | watchpoint_item.condition.parameter = parameter; | ||||
| if (watch_condition > 2) | |||||
| // odd indices are greater than conditions and even indicies are less than | |||||
| if (watch_condition > 2 && watch_condition < 13) | |||||
| // odd indices are greater than conditions and even indices are less than | |||||
| watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT"; | watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT"; | ||||
| watchpoint_item.check_node_list = check_node_list; | watchpoint_item.check_node_list = check_node_list; | ||||
| watchpoint_item.parameter_list = parameter_list; | |||||
| watchpoint_table[id] = watchpoint_item; | watchpoint_table[id] = watchpoint_item; | ||||
| } | } | ||||
| @@ -59,11 +63,22 @@ void DebugServices::RemoveWatchpoint(unsigned int id) { | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, unsigned int n, bool need_min_max, | |||||
| bool need_mean_sd) { | |||||
| DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, const T *start_prev, unsigned int n, | |||||
| bool need_min_max, bool need_mean_sd, | |||||
| bool need_zero_percentage, | |||||
| bool need_tensor_update_ratio_mean, bool need_allclose) { | |||||
| tensor_stats stats; | tensor_stats stats; | ||||
| double zero_count = 0.0; | |||||
| double rtol = 1.0e-5; | |||||
| double atol = 1.0e-8; | |||||
| double update_ratio_sum = 0.0; | |||||
| double epsilon = 1.0e-9; | |||||
| for (unsigned int i = 0; i < n; ++i) { | for (unsigned int i = 0; i < n; ++i) { | ||||
| auto val = static_cast<double>(start[i]); | auto val = static_cast<double>(start[i]); | ||||
| double val_prev = 0.0; | |||||
| if (start_prev) { | |||||
| val_prev = static_cast<double>(start_prev[i]); | |||||
| } | |||||
| stats.has_nan = stats.has_nan || std::isnan(val); | stats.has_nan = stats.has_nan || std::isnan(val); | ||||
| stats.has_inf = stats.has_inf || std::isinf(val); | stats.has_inf = stats.has_inf || std::isinf(val); | ||||
| if (stats.has_inf && stats.has_nan) { | if (stats.has_inf && stats.has_nan) { | ||||
| @@ -81,15 +96,33 @@ DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, unsig | |||||
| stats.mean += delta / (i + 1); | stats.mean += delta / (i + 1); | ||||
| stats.m2 += delta * (val - stats.mean); | stats.m2 += delta * (val - stats.mean); | ||||
| } | } | ||||
| if (need_zero_percentage) { | |||||
| if (val == 0) zero_count++; | |||||
| } | |||||
| if (need_tensor_update_ratio_mean && start_prev) { | |||||
| update_ratio_sum += (std::abs(val) / (epsilon + std::abs(val_prev))); | |||||
| } | |||||
| if (need_allclose && start_prev) { | |||||
| stats.allclose &= (std::abs(val - val_prev) <= (atol + rtol * std::abs(val_prev))); | |||||
| } | |||||
| } | } | ||||
| if (need_tensor_update_ratio_mean && start_prev) { | |||||
| stats.tensor_update_ratio_mean = (update_ratio_sum / n); | |||||
| } | |||||
| stats.zero_percentage = (zero_count / n) * 100; | |||||
| stats.n = n; | stats.n = n; | ||||
| return stats; | return stats; | ||||
| } | } | ||||
| void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, | void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, | ||||
| std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id, | std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id, | ||||
| std::vector<std::vector<parameter_t>> *parameters, | |||||
| const std::vector<std::string> &op_overflows, | const std::vector<std::string> &op_overflows, | ||||
| const std::vector<std::shared_ptr<TensorData>> &tensor_list) { | |||||
| const std::vector<std::shared_ptr<TensorData>> &tensor_list, | |||||
| const bool init_dbg_suspend) { | |||||
| std::lock_guard<std::mutex> lg(lock_); | std::lock_guard<std::mutex> lg(lock_); | ||||
| if (watchpoint_table.empty()) { | if (watchpoint_table.empty()) { | ||||
| return; | return; | ||||
| @@ -102,79 +135,145 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector | |||||
| mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor(); | mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor(); | ||||
| int tensor_dtype = tensor_ptr->data_type_c(); | int tensor_dtype = tensor_ptr->data_type_c(); | ||||
| std::vector<unsigned int> hit_encountered; | std::vector<unsigned int> hit_encountered; | ||||
| std::vector<std::vector<bool>> hit_parms; | |||||
| std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table; | std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table; | ||||
| bool min_max_enabled = false; | bool min_max_enabled = false; | ||||
| bool mean_sd_enabled = false; | bool mean_sd_enabled = false; | ||||
| bool inf_nan_enabled = false; | bool inf_nan_enabled = false; | ||||
| bool zero_percentage_enabled = false; | |||||
| bool tensor_update_ratio_mean_enabled = false; | |||||
| bool allclose_enabled = false; | |||||
| for (auto w_table_item : watchpoint_table) { | for (auto w_table_item : watchpoint_table) { | ||||
| auto wp = std::get<1>(w_table_item); | auto wp = std::get<1>(w_table_item); | ||||
| if (wp.condition.type == INIT && !init_dbg_suspend) continue; | |||||
| if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue; | if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue; | ||||
| if (wp.IsNodeIncluded(tensor_name_no_slot)) { | if (wp.IsNodeIncluded(tensor_name_no_slot)) { | ||||
| min_max_enabled |= wp.min_max_enabled(); | min_max_enabled |= wp.min_max_enabled(); | ||||
| mean_sd_enabled |= wp.mean_sd_enabled(); | mean_sd_enabled |= wp.mean_sd_enabled(); | ||||
| inf_nan_enabled |= wp.inf_nan_enabled(); | inf_nan_enabled |= wp.inf_nan_enabled(); | ||||
| zero_percentage_enabled |= wp.zero_percentage_enabled(); | |||||
| tensor_update_ratio_mean_enabled |= wp.tensor_update_ratio_mean_enabled(); | |||||
| allclose_enabled |= wp.allclose_enabled(); | |||||
| watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second; | watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second; | ||||
| } | } | ||||
| } | } | ||||
| tensor_stats stats; | tensor_stats stats; | ||||
| uint num_elements = tensor_ptr->DataSize(); | uint num_elements = tensor_ptr->DataSize(); | ||||
| if (min_max_enabled || mean_sd_enabled || inf_nan_enabled) { | |||||
| if (min_max_enabled || mean_sd_enabled || inf_nan_enabled || zero_percentage_enabled || | |||||
| tensor_update_ratio_mean_enabled || allclose_enabled) { | |||||
| bool need_prev = (tensor_update_ratio_mean_enabled || allclose_enabled); | |||||
| bool have_prev = tensor_loader_->GetPrevTensor(tensor_name) != NULL; | |||||
| switch (tensor_dtype) { | switch (tensor_dtype) { | ||||
| case kNumberTypeUInt8: { | case kNumberTypeUInt8: { | ||||
| auto start_addr = reinterpret_cast<uint8_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<uint8_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<uint8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeInt8: { | case kNumberTypeInt8: { | ||||
| auto start_addr = reinterpret_cast<int8_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<int8_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<int8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeUInt16: { | case kNumberTypeUInt16: { | ||||
| auto start_addr = reinterpret_cast<uint16_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<uint16_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<uint16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeInt16: { | case kNumberTypeInt16: { | ||||
| auto start_addr = reinterpret_cast<int16_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<int16_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<int16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeUInt32: { | case kNumberTypeUInt32: { | ||||
| auto start_addr = reinterpret_cast<uint32_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<uint32_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<uint32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeInt32: | case kNumberTypeInt32: | ||||
| case kNumberTypeInt: { | case kNumberTypeInt: { | ||||
| auto start_addr = reinterpret_cast<int32_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<int32_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<int32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeUInt64: { | case kNumberTypeUInt64: { | ||||
| auto start_addr = reinterpret_cast<uint64_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<uint64_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<uint64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeInt64: { | case kNumberTypeInt64: { | ||||
| auto start_addr = reinterpret_cast<int64_t *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<int64_t *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<int64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeFloat16: { | case kNumberTypeFloat16: { | ||||
| auto start_addr = reinterpret_cast<float16 *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<float16 *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<float16 *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeFloat32: | case kNumberTypeFloat32: | ||||
| case kNumberTypeFloat: { | case kNumberTypeFloat: { | ||||
| auto start_addr = reinterpret_cast<float *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<float *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<float *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| case kNumberTypeFloat64: { | case kNumberTypeFloat64: { | ||||
| auto start_addr = reinterpret_cast<double *>(tensor_ptr->data_c()); | auto start_addr = reinterpret_cast<double *>(tensor_ptr->data_c()); | ||||
| stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); | |||||
| auto start_addr_prev = | |||||
| (need_prev && have_prev | |||||
| ? reinterpret_cast<double *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) | |||||
| : NULL); | |||||
| stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, | |||||
| zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); | |||||
| break; | break; | ||||
| } | } | ||||
| default: | default: | ||||
| @@ -185,31 +284,97 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector | |||||
| for (auto &it : watchpoints_to_check_table) { | for (auto &it : watchpoints_to_check_table) { | ||||
| auto wp_id = it.second.id; | auto wp_id = it.second.id; | ||||
| std::vector<bool> hit_p; | |||||
| CONDITION_TYPE enabled_condition = it.second.condition.type; | CONDITION_TYPE enabled_condition = it.second.condition.type; | ||||
| bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) || | bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) || | ||||
| (enabled_condition == GENERAL_OVERFLOW && (stats.has_nan || stats.has_inf)) || | |||||
| (enabled_condition == IS_OVERFLOW && | (enabled_condition == IS_OVERFLOW && | ||||
| std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end()); | std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end()); | ||||
| if (enabled_condition > 2) { | |||||
| if (enabled_condition > 2 && enabled_condition != GENERAL_OVERFLOW) { | |||||
| if (stats.has_inf || stats.has_nan) { | if (stats.has_inf || stats.has_nan) { | ||||
| MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check " | MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check " | ||||
| << condition_label[enabled_condition] << " watchpoint."; | << condition_label[enabled_condition] << " watchpoint."; | ||||
| } else { | |||||
| } else if (enabled_condition < 13) { | |||||
| bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter; | bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter; | ||||
| bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter; | bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter; | ||||
| hit |= it.second.condition.comparison == "GT" ? gt : lt; | hit |= it.second.condition.comparison == "GT" ? gt : lt; | ||||
| } else { | |||||
| std::vector<parameter_t> parameter_list_item = it.second.parameter_list; | |||||
| for (auto &p : parameter_list_item) { | |||||
| if (p.disabled == false) { | |||||
| bool p_hit = false; | |||||
| if (p.name == "zero_percentage_ge") { | |||||
| p_hit = stats.parmLookup(STAT_ZERO_PERCENTAGE) >= p.value; | |||||
| } else if (p.name == "max_gt") { | |||||
| p_hit = stats.parmLookup(STAT_MAX) > p.value; | |||||
| } else if (p.name == "max_lt") { | |||||
| p_hit = stats.parmLookup(STAT_MAX) < p.value; | |||||
| } else if (p.name == "min_gt") { | |||||
| p_hit = stats.parmLookup(STAT_MIN) > p.value; | |||||
| } else if (p.name == "min_lt") { | |||||
| p_hit = stats.parmLookup(STAT_MIN) < p.value; | |||||
| } else if (p.name == "mean_gt") { | |||||
| p_hit = stats.parmLookup(STAT_MEAN) > p.value; | |||||
| } else if (p.name == "mean_lt") { | |||||
| p_hit = stats.parmLookup(STAT_MEAN) < p.value; | |||||
| } else if (p.name == "abs_mean_gt") { | |||||
| p_hit = std::abs(stats.parmLookup(STAT_MEAN)) > p.value; | |||||
| } else if (p.name == "abs_mean_lt") { | |||||
| p_hit = std::abs(stats.parmLookup(STAT_MEAN)) < p.value; | |||||
| } else if (p.name == "abs_update_ratio_mean_gt") { | |||||
| p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) > p.value; | |||||
| } else if (p.name == "abs_update_ratio_mean_lt") { | |||||
| p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) < p.value; | |||||
| } | |||||
| hit |= p_hit; | |||||
| hit_p.push_back(p_hit); | |||||
| } else { | |||||
| hit_p.push_back(false); | |||||
| } | |||||
| } | |||||
| hit |= (enabled_condition == NOT_CHANGED && stats.parmLookup(STAT_ALLCLOSE)); | |||||
| if (hit) hit_parms.push_back(hit_p); | |||||
| } | } | ||||
| } | } | ||||
| if (hit) hit_encountered.push_back(wp_id); | if (hit) hit_encountered.push_back(wp_id); | ||||
| } | } | ||||
| unsigned int index_parm_list = 0; | |||||
| for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) { | for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) { | ||||
| if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) { | if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) { | ||||
| name->push_back(tensor_name_no_slot); | |||||
| // return fully qualified name for weights and bias to MI | |||||
| auto found_dot = tensor_name_no_slot.find_last_of('.'); | |||||
| if (found_dot != std::string::npos && (tensor_name_no_slot.substr(found_dot + 1) == "weight" || | |||||
| tensor_name_no_slot.substr(found_dot + 1) == "bias")) { | |||||
| auto check_node_list = watchpoint_table.find(*it_hit_id)->second.check_node_list; | |||||
| for (auto check_node : check_node_list) { | |||||
| std::string w_name = std::get<0>(check_node); | |||||
| auto found_slash = w_name.find_last_of('/'); | |||||
| if (found_slash != std::string::npos && w_name.substr(found_slash + 1) == tensor_name_no_slot) { | |||||
| name->push_back(w_name); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| name->push_back(tensor_name_no_slot); | |||||
| } | |||||
| slot->push_back(tensor_slot); | slot->push_back(tensor_slot); | ||||
| int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type; | int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type; | ||||
| condition->push_back(condition_item); | condition->push_back(condition_item); | ||||
| watchpoint_id->push_back(*it_hit_id); | watchpoint_id->push_back(*it_hit_id); | ||||
| std::vector<parameter_t> parameter_list_item = watchpoint_table.find(*it_hit_id)->second.parameter_list; | |||||
| if (condition_item >= 13) { | |||||
| unsigned int index_hit_parm = 0; | |||||
| for (auto &p : parameter_list_item) { | |||||
| p.hit = hit_parms[index_parm_list][index_hit_parm]; | |||||
| index_hit_parm++; | |||||
| } | |||||
| index_parm_list++; | |||||
| } | |||||
| parameters->push_back(parameter_list_item); | |||||
| } | } | ||||
| watchpoints_to_check_table.erase(*it_hit_id); | watchpoints_to_check_table.erase(*it_hit_id); | ||||
| } | } | ||||
| @@ -234,7 +399,7 @@ void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector< | |||||
| } | } | ||||
| } | } | ||||
| bool DebugServices::IsWatchPoint(std::string kernel_name) { | |||||
| bool DebugServices::IsWatchPoint(std::string kernel_name, const CNodePtr &kernel) { | |||||
| bool ret = false; | bool ret = false; | ||||
| for (auto w_table_item : watchpoint_table) { | for (auto w_table_item : watchpoint_table) { | ||||
| auto check_node_list = std::get<1>(w_table_item).check_node_list; | auto check_node_list = std::get<1>(w_table_item).check_node_list; | ||||
| @@ -243,7 +408,7 @@ bool DebugServices::IsWatchPoint(std::string kernel_name) { | |||||
| bool w_type = std::get<1>(check_node); | bool w_type = std::get<1>(check_node); | ||||
| if ((w_type == true && | if ((w_type == true && | ||||
| ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) || | ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) || | ||||
| (w_type == false && kernel_name == w_name)) { | |||||
| (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) { | |||||
| ret = true; | ret = true; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -252,6 +417,39 @@ bool DebugServices::IsWatchPoint(std::string kernel_name) { | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| bool DebugServices::IsWatchPointNodeInput(std::string w_name, const CNodePtr &kernel) { | |||||
| if (kernel) { | |||||
| auto input_size = AnfAlgo::GetInputTensorNum(kernel); | |||||
| for (size_t j = 0; j < input_size; ++j) { | |||||
| auto input_kernel = kernel->input(j + 1); | |||||
| std::string input_kernel_name = input_kernel->fullname_with_scope(); | |||||
| auto found = w_name.find_last_of('/'); | |||||
| if (found != std::string::npos && w_name.substr(found + 1) == input_kernel_name) return true; | |||||
| } | |||||
| return false; | |||||
| } else { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| void DebugServices::AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list, | |||||
| const CNodePtr &kernel) { | |||||
| if (kernel) { | |||||
| auto input_size = AnfAlgo::GetInputTensorNum(kernel); | |||||
| for (size_t j = 0; j < input_size; ++j) { | |||||
| auto input_kernel = kernel->input(j + 1); | |||||
| std::string input_kernel_name = input_kernel->fullname_with_scope(); | |||||
| std::string locate_tensor = input_kernel_name + ":0"; | |||||
| std::map<std::string, std::shared_ptr<TensorData>> tensor_map = tensor_loader_->GetTensorMap(); | |||||
| std::map<std::string, std::shared_ptr<TensorData>>::iterator iter; | |||||
| iter = tensor_map.find(locate_tensor); | |||||
| if (iter != tensor_map.end()) { | |||||
| tensor_list->push_back(iter->second); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| TensorLoader *DebugServices::tensor_loader() const { return tensor_loader_; } | TensorLoader *DebugServices::tensor_loader() const { return tensor_loader_; } | ||||
| std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() { | std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() { | ||||
| return watchpoint_table; | return watchpoint_table; | ||||
| @@ -52,19 +52,37 @@ class DebugServices { | |||||
| MEAN_GT, | MEAN_GT, | ||||
| MEAN_LT, | MEAN_LT, | ||||
| SD_GT, | SD_GT, | ||||
| SD_LT | |||||
| SD_LT, | |||||
| GENERAL_OVERFLOW, | |||||
| INIT, | |||||
| TOO_LARGE, | |||||
| TOO_SMALL, | |||||
| ALL_ZERO, | |||||
| CHANGE_TOO_LARGE, | |||||
| CHANGE_TOO_SMALL, | |||||
| NOT_CHANGED | |||||
| }; | }; | ||||
// Selectors for the statistics that tensor_stats::parmLookup can return.
enum STAT_TYPE {
  STAT_MIN,
  STAT_MAX,
  STAT_MEAN,
  STAT_ZERO_PERCENTAGE,
  STAT_TENSOR_UPDATE_RATIO_MEAN,
  STAT_ALLCLOSE
};
| typedef struct condition { | typedef struct condition { | ||||
| CONDITION_TYPE type; | CONDITION_TYPE type; | ||||
| float parameter = 0; | float parameter = 0; | ||||
| std::string comparison; | std::string comparison; | ||||
| } condition_t; | } condition_t; | ||||
// One named threshold carried in a watchpoint's parameter_list.
// Every scalar member has a default so that a default-constructed parameter_t never exposes
// indeterminate values; aggregate initialization `{name, disabled, value, hit}` still works
// (C++14 and later).
typedef struct parameter {
  // Name that selects the comparison when the watchpoint is evaluated (e.g. "max_gt").
  std::string name;
  // When true the parameter is not evaluated and is reported as not hit.
  bool disabled = false;
  // Threshold that the selected statistic is compared against.
  double_t value = 0.0;
  // Result flag filled in when the owning watchpoint is reported as hit.
  bool hit = false;
} parameter_t;
| typedef struct watchpoint { | typedef struct watchpoint { | ||||
| unsigned int id; | unsigned int id; | ||||
| condition_t condition; | condition_t condition; | ||||
| std::vector<std::tuple<std::string, bool>> check_node_list; | std::vector<std::tuple<std::string, bool>> check_node_list; | ||||
| std::vector<parameter_t> parameter_list; | |||||
| size_t location = 0; | size_t location = 0; | ||||
| bool IsNodeIncluded(const std::string &tensor_name) { | bool IsNodeIncluded(const std::string &tensor_name) { | ||||
| @@ -72,6 +90,8 @@ class DebugServices { | |||||
| for (auto check_node : check_node_list) { | for (auto check_node : check_node_list) { | ||||
| std::string w_name = std::get<0>(check_node); | std::string w_name = std::get<0>(check_node); | ||||
| bool w_type = std::get<1>(check_node); | bool w_type = std::get<1>(check_node); | ||||
| auto found = w_name.find_last_of('/'); | |||||
| if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true; | |||||
| if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) { | if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) { | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -81,15 +101,27 @@ class DebugServices { | |||||
| bool min_max_enabled() { | bool min_max_enabled() { | ||||
| return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT || | return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT || | ||||
| condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT; | |||||
| condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT || | |||||
| (condition.type == INIT && (!parameter_list[1].disabled || !parameter_list[2].disabled)) || | |||||
| (condition.type == TOO_LARGE && (!parameter_list[1].disabled || !parameter_list[2].disabled)) || | |||||
| (condition.type == TOO_SMALL && (!parameter_list[1].disabled || !parameter_list[2].disabled)); | |||||
| } | } | ||||
| // inf or nan related condition set | // inf or nan related condition set | ||||
| bool inf_nan_enabled() { return condition.type == HAS_INF || condition.type == HAS_NAN; } | |||||
| bool inf_nan_enabled() { | |||||
| return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW; | |||||
| } | |||||
| // mean or sd related condition set | // mean or sd related condition set | ||||
| bool mean_sd_enabled() { | bool mean_sd_enabled() { | ||||
| return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT || | return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT || | ||||
| condition.type == SD_GT; | |||||
| condition.type == SD_GT || | |||||
| (condition.type == TOO_LARGE && (!parameter_list[0].disabled || !parameter_list[3].disabled)) || | |||||
| (condition.type == TOO_SMALL && (!parameter_list[0].disabled || !parameter_list[3].disabled)); | |||||
| } | } | ||||
| bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; } | |||||
| bool tensor_update_ratio_mean_enabled() { | |||||
| return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL; | |||||
| } | |||||
| bool allclose_enabled() { return condition.type == NOT_CHANGED; } | |||||
| } watchpoint_t; | } watchpoint_t; | ||||
| struct tensor_stats { | struct tensor_stats { | ||||
| @@ -100,6 +132,9 @@ class DebugServices { | |||||
| unsigned int n = 0; | unsigned int n = 0; | ||||
| double mean = 0.0; | double mean = 0.0; | ||||
| double m2 = 0.0; | double m2 = 0.0; | ||||
| double zero_percentage = 0.0; | |||||
| double tensor_update_ratio_mean = -1; | |||||
| bool allclose = false; | |||||
| double statLookup(CONDITION_TYPE type) const { | double statLookup(CONDITION_TYPE type) const { | ||||
| if (type == MAX_GT || type == MAX_LT) return max; | if (type == MAX_GT || type == MAX_LT) return max; | ||||
| @@ -110,6 +145,16 @@ class DebugServices { | |||||
| return std::numeric_limits<double>::quiet_NaN(); | return std::numeric_limits<double>::quiet_NaN(); | ||||
| } | } | ||||
| double parmLookup(STAT_TYPE type) const { | |||||
| if (type == STAT_MAX) return max; | |||||
| if (type == STAT_MIN) return min; | |||||
| if (type == STAT_MEAN) return mean; | |||||
| if (type == STAT_ZERO_PERCENTAGE) return zero_percentage; | |||||
| if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean; | |||||
| if (type == STAT_ALLCLOSE) return allclose; | |||||
| return std::numeric_limits<double>::quiet_NaN(); | |||||
| } | |||||
| double getMean() const { return mean; } | double getMean() const { return mean; } | ||||
| double getVariance() const { | double getVariance() const { | ||||
| @@ -124,19 +169,25 @@ class DebugServices { | |||||
| }; | }; | ||||
| void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, | void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, | ||||
| const std::vector<std::tuple<std::string, bool>> &check_node_list); | |||||
| const std::vector<std::tuple<std::string, bool>> &check_node_list, | |||||
| const std::vector<parameter_t> &parameter_list); | |||||
| void RemoveWatchpoint(unsigned int id); | void RemoveWatchpoint(unsigned int id); | ||||
| void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition, | void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition, | ||||
| std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows, | |||||
| const std::vector<std::shared_ptr<TensorData>> &tensor_list); | |||||
| std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters, | |||||
| const std::vector<std::string> &op_overflows, | |||||
| const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend); | |||||
| void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name, | void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name, | ||||
| std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size, | std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size, | ||||
| std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape); | std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape); | ||||
| bool IsWatchPoint(std::string kernel_name); | |||||
| bool IsWatchPoint(std::string kernel_name, const CNodePtr &kernel = nullptr); | |||||
| bool IsWatchPointNodeInput(std::string w_name, const CNodePtr &kernel); | |||||
| void AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list, const CNodePtr &kernel); | |||||
| TensorLoader *tensor_loader() const; | TensorLoader *tensor_loader() const; | ||||
| @@ -146,14 +197,19 @@ class DebugServices { | |||||
| std::mutex lock_; | std::mutex lock_; | ||||
| std::unordered_map<unsigned int, watchpoint_t> watchpoint_table; | std::unordered_map<unsigned int, watchpoint_t> watchpoint_table; | ||||
| std::vector<std::string> condition_label = {"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT", | |||||
| "MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT", | |||||
| "MEAN_LT", "SD_GT", "SD_LT"}; | |||||
| std::vector<std::string> condition_label = { | |||||
| "HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT", | |||||
| "MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT", | |||||
| "MEAN_LT", "SD_GT", "SD_LT", "GENERAL_OVERFLOW", "INIT", | |||||
| "TOO_LARGE", "TOO_SMALL", "ALL_ZERO", "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL", | |||||
| "NOT_CHANGED"}; | |||||
| TensorLoader *tensor_loader_; | TensorLoader *tensor_loader_; | ||||
| template <typename T> | template <typename T> | ||||
| static tensor_stats SummarizeTensor(const T *start, unsigned int n, bool need_min_max, bool need_mean_sd); | |||||
| static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max, | |||||
| bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean, | |||||
| bool need_allclose); | |||||
| }; | }; | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -36,11 +36,11 @@ message Metadata { | |||||
| // the full name of current node | // the full name of current node | ||||
| string cur_node = 4; | string cur_node = 4; | ||||
| // check if training is done. | // check if training is done. | ||||
| bool training_done = 5; | |||||
| bool training_done = 5; | |||||
| } | } | ||||
| message Chunk { | message Chunk { | ||||
| bytes buffer = 1; | |||||
| bytes buffer = 1; | |||||
| } | } | ||||
| message EventReply { | message EventReply { | ||||
| @@ -61,13 +61,13 @@ message EventReply { | |||||
| } | } | ||||
| message RunCMD { | message RunCMD { | ||||
| // step level or node level. "step" or "node" | |||||
| string run_level = 1; | |||||
| oneof cmd { | |||||
| int32 run_steps = 2; | |||||
| // the next node full name | |||||
| string node_name = 3; | |||||
| } | |||||
| // step level or node level. "step" or "node" | |||||
| string run_level = 1; | |||||
| oneof cmd { | |||||
| int32 run_steps = 2; | |||||
| // the next node full name | |||||
| string node_name = 3; | |||||
| } | |||||
| } | } | ||||
| message SetCMD { | message SetCMD { | ||||
| @@ -96,10 +96,24 @@ message WatchCondition { | |||||
| mean_lt = 10; | mean_lt = 10; | ||||
| sd_gt = 11; | sd_gt = 11; | ||||
| sd_lt = 12; | sd_lt = 12; | ||||
| tensor_general_overflow = 13; | |||||
| tensor_initialization = 14; | |||||
| tensor_too_large = 15; | |||||
| tensor_too_small = 16; | |||||
| tensor_all_zero = 17; | |||||
| tensor_change_too_large = 18; | |||||
| tensor_change_too_small = 19; | |||||
| tensor_not_changed = 20; | |||||
| } | } | ||||
| Condition condition = 1; | Condition condition = 1; | ||||
| float value = 2; // for between condition, there will be two values | |||||
| repeated bool include = 3; // for between condition, define the value is included or not | |||||
| float value = 2; | |||||
| message Parameter { | |||||
| string name = 1; | |||||
| bool disabled = 2; | |||||
| double value = 3; | |||||
| bool hit = 4; // Whether this parameter is hit when checking tensor. | |||||
| } | |||||
| repeated Parameter params = 4; | |||||
| } | } | ||||
| message WatchNode { | message WatchNode { | ||||
| @@ -41,6 +41,7 @@ using debugger::TensorProto; | |||||
| using debugger::WatchCondition; | using debugger::WatchCondition; | ||||
| using debugger::WatchCondition_Condition_inf; | using debugger::WatchCondition_Condition_inf; | ||||
| using debugger::WatchCondition_Condition_nan; | using debugger::WatchCondition_Condition_nan; | ||||
| using debugger::WatchCondition_Parameter; | |||||
| using debugger::WatchNode; | using debugger::WatchNode; | ||||
| using debugger::WatchpointHit; | using debugger::WatchpointHit; | ||||
| @@ -67,7 +68,8 @@ Debugger::Debugger() | |||||
| is_dataset_graph_(false), | is_dataset_graph_(false), | ||||
| partial_memory_(false), | partial_memory_(false), | ||||
| last_overflow_bin_(0), | last_overflow_bin_(0), | ||||
| overflow_bin_path_("") { | |||||
| overflow_bin_path_(""), | |||||
| initial_suspend_(true) { | |||||
| if (CheckDebuggerEnabled()) { | if (CheckDebuggerEnabled()) { | ||||
| // configure partial memory reuse | // configure partial memory reuse | ||||
| partial_memory_ = CheckDebuggerPartialMemoryEnabled(); | partial_memory_ = CheckDebuggerPartialMemoryEnabled(); | ||||
| @@ -292,9 +294,9 @@ void Debugger::PostExecute() { | |||||
| } | } | ||||
| } | } | ||||
| bool Debugger::ReadNodeDataRequired() { | |||||
| bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) { | |||||
| if (debugger_enabled_ && !is_dataset_graph_) { | if (debugger_enabled_ && !is_dataset_graph_) { | ||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_); | |||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel); | |||||
| // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data | // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data | ||||
| if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) { | if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) { | ||||
| return true; | return true; | ||||
| @@ -303,19 +305,19 @@ bool Debugger::ReadNodeDataRequired() { | |||||
| return false; | return false; | ||||
| } | } | ||||
| void Debugger::PostExecuteNode() { | |||||
| void Debugger::PostExecuteNode(const CNodePtr &kernel) { | |||||
| // access lock for public method | // access lock for public method | ||||
| std::lock_guard<std::mutex> a_lock(access_lock_); | std::lock_guard<std::mutex> a_lock(access_lock_); | ||||
| if (pipeline::ExecutorPy::GetDebugTerminate()) { | if (pipeline::ExecutorPy::GetDebugTerminate()) { | ||||
| return; | return; | ||||
| } | } | ||||
| if (debugger_enabled_ && !is_dataset_graph_) { | if (debugger_enabled_ && !is_dataset_graph_) { | ||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_); | |||||
| auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel); | |||||
| // if kernel is watchpoint,and get hit. suspend. | // if kernel is watchpoint,and get hit. suspend. | ||||
| bool hit_empty_flag = true; | bool hit_empty_flag = true; | ||||
| if (is_watchpoint) { | if (is_watchpoint) { | ||||
| auto hits = CheckWatchpoints(cur_name_); | |||||
| auto hits = CheckWatchpoints(cur_name_, kernel); | |||||
| if (!hits.empty()) { | if (!hits.empty()) { | ||||
| SendWatchpoints(hits); | SendWatchpoints(hits); | ||||
| CommandLoop(); | CommandLoop(); | ||||
| @@ -477,6 +479,8 @@ void Debugger::CommandLoop() { | |||||
| MS_LOG(INFO) << "rechecking all watchpoints"; | MS_LOG(INFO) << "rechecking all watchpoints"; | ||||
| SendWatchpoints(CheckWatchpoints()); | SendWatchpoints(CheckWatchpoints()); | ||||
| } else { | } else { | ||||
| // no longer the initial suspension. | |||||
| initial_suspend_ = false; | |||||
| // print run cmd content | // print run cmd content | ||||
| // get run_level and node_name | // get run_level and node_name | ||||
| run_level_ = GetRunLevel(reply); | run_level_ = GetRunLevel(reply); | ||||
| @@ -494,10 +498,17 @@ void Debugger::CommandLoop() { | |||||
| { | { | ||||
| // print set cmd content | // print set cmd content | ||||
| ProtoVector<WatchNode> recieved_nodes = GetWatchnodes(reply); | ProtoVector<WatchNode> recieved_nodes = GetWatchnodes(reply); | ||||
| for (auto node : recieved_nodes) { | |||||
| for (const auto &node : recieved_nodes) { | |||||
| MS_LOG(INFO) << "node name: " << node.node_name(); | MS_LOG(INFO) << "node name: " << node.node_name(); | ||||
| MS_LOG(INFO) << "node type: " << node.node_type(); | MS_LOG(INFO) << "node type: " << node.node_type(); | ||||
| } | } | ||||
| ProtoVector<WatchCondition_Parameter> parameters = GetParameters(reply); | |||||
| for (const auto &parameter : parameters) { | |||||
| MS_LOG(INFO) << "parameter name: " << parameter.name(); | |||||
| MS_LOG(INFO) << "parameter is disabled: " << parameter.disabled(); | |||||
| MS_LOG(INFO) << "parameter value: " << parameter.value(); | |||||
| } | |||||
| MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition(); | MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition(); | ||||
| MS_LOG(INFO) << "id: " << GetWatchpointID(reply); | MS_LOG(INFO) << "id: " << GetWatchpointID(reply); | ||||
| MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply); | MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply); | ||||
| @@ -506,7 +517,7 @@ void Debugger::CommandLoop() { | |||||
| if (GetWatchpointDelete(reply)) { | if (GetWatchpointDelete(reply)) { | ||||
| RemoveWatchpoint(GetWatchpointID(reply)); | RemoveWatchpoint(GetWatchpointID(reply)); | ||||
| } else { | } else { | ||||
| SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply)); | |||||
| SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply)); | |||||
| } | } | ||||
| break; | break; | ||||
| case DebuggerCommand::kViewCMD: | case DebuggerCommand::kViewCMD: | ||||
| @@ -558,13 +569,25 @@ void AddTensorProtoInfo(TensorProto *tensor_item, TensorProto tensor) { | |||||
| tensor_item->clear_dims(); | tensor_item->clear_dims(); | ||||
| } | } | ||||
| void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id) { | |||||
| void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id, | |||||
| const ProtoVector<WatchCondition_Parameter> &parameters) { | |||||
| std::vector<std::tuple<std::string, bool>> check_node_list; | std::vector<std::tuple<std::string, bool>> check_node_list; | ||||
| std::vector<DebugServices::parameter_t> parameter_list; | |||||
| std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list), | std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list), | ||||
| [](WatchNode node) -> std::tuple<std::string, bool> { | |||||
| [](const WatchNode &node) -> std::tuple<std::string, bool> { | |||||
| return make_tuple(node.node_name(), node.node_type() == "scope"); | return make_tuple(node.node_name(), node.node_type() == "scope"); | ||||
| }); | }); | ||||
| debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list); | |||||
| std::transform( | |||||
| parameters.begin(), parameters.end(), std::back_inserter(parameter_list), | |||||
| [](const WatchCondition_Parameter &parameter) -> DebugServices::parameter_t { | |||||
| return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()}; | |||||
| }); | |||||
| debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list); | |||||
| if (initial_suspend_ && | |||||
| static_cast<DebugServices::CONDITION_TYPE>(condition.condition()) == DebugServices::CONDITION_TYPE::INIT) | |||||
| SendWatchpoints(CheckWatchpoints()); | |||||
| } | } | ||||
| void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); } | void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); } | ||||
| @@ -637,12 +660,13 @@ void Debugger::Exit() { | |||||
| } | } | ||||
| } | } | ||||
| std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) { | |||||
| std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel) { | |||||
| std::vector<std::string> name; | std::vector<std::string> name; | ||||
| std::vector<std::string> slot; | std::vector<std::string> slot; | ||||
| std::vector<int> condition; | std::vector<int> condition; | ||||
| std::vector<unsigned int> watchpoint_id; | std::vector<unsigned int> watchpoint_id; | ||||
| std::vector<std::string> overflow_ops; | std::vector<std::string> overflow_ops; | ||||
| std::vector<std::vector<DebugServices::parameter_t>> parameters; | |||||
| #ifdef ENABLE_D | #ifdef ENABLE_D | ||||
| overflow_ops = CheckOpOverflow(); | overflow_ops = CheckOpOverflow(); | ||||
| #endif | #endif | ||||
| @@ -652,12 +676,14 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode | |||||
| tensor_list = tensor_loader->GetTensor(); | tensor_list = tensor_loader->GetTensor(); | ||||
| } else { | } else { | ||||
| tensor_list = tensor_loader->GetNodeTensorMap(watchnode); | tensor_list = tensor_loader->GetNodeTensorMap(watchnode); | ||||
| debug_services_->AddWeightsBiasInputs(&tensor_list, kernel); | |||||
| } | } | ||||
| debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops, tensor_list); | |||||
| debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, overflow_ops, tensor_list, | |||||
| initial_suspend_); | |||||
| std::list<WatchpointHit> hits; | std::list<WatchpointHit> hits; | ||||
| for (unsigned int i = 0; i < name.size(); i++) { | for (unsigned int i = 0; i < name.size(); i++) { | ||||
| WatchpointHit hit; | WatchpointHit hit; | ||||
| std::vector<DebugServices::parameter_t> &parameter = parameters[i]; | |||||
| hit.set_id(watchpoint_id[i]); | hit.set_id(watchpoint_id[i]); | ||||
| // here TensorProto act as a tensor indicator, not sending tensor content | // here TensorProto act as a tensor indicator, not sending tensor content | ||||
| @@ -668,7 +694,13 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode | |||||
| WatchCondition *condition_item = hit.mutable_watch_condition(); | WatchCondition *condition_item = hit.mutable_watch_condition(); | ||||
| condition_item->set_condition(debugger::WatchCondition_Condition(condition[i])); | condition_item->set_condition(debugger::WatchCondition_Condition(condition[i])); | ||||
| for (const auto &p : parameter) { | |||||
| auto x = condition_item->mutable_params()->Add(); | |||||
| x->set_name(p.name); | |||||
| x->set_disabled(p.disabled); | |||||
| x->set_value(p.value); | |||||
| x->set_hit(p.hit); | |||||
| } | |||||
| hits.push_back(hit); | hits.push_back(hit); | ||||
| } | } | ||||
| return hits; | return hits; | ||||
| @@ -710,6 +742,14 @@ DebuggerCommand GetCommand(const EventReply &reply) { | |||||
| return cmd; | return cmd; | ||||
| } | } | ||||
| ProtoVector<WatchCondition_Parameter> GetParameters(const EventReply &reply) { | |||||
| if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) { | |||||
| MS_LOG(ERROR) << "Error: Can not get Parameters from command. Returning default value: ProtoVector<Parameter>()."; | |||||
| return ProtoVector<WatchCondition_Parameter>(); | |||||
| } | |||||
| return reply.set_cmd().watch_condition().params(); | |||||
| } | |||||
| ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply) { | ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply) { | ||||
| if (!reply.has_set_cmd()) { | if (!reply.has_set_cmd()) { | ||||
| MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>()."; | MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>()."; | ||||
| @@ -954,7 +994,7 @@ void Debugger::LoadGraphOutputs() { | |||||
| std::string kernel_name = node->fullname_with_scope(); | std::string kernel_name = node->fullname_with_scope(); | ||||
| auto output_size = AnfAlgo::GetOutputTensorNum(node); | auto output_size = AnfAlgo::GetOutputTensorNum(node); | ||||
| if (partial_memory_) { | if (partial_memory_) { | ||||
| if (!debug_services_->IsWatchPoint(kernel_name)) { | |||||
| if (!debug_services_->IsWatchPoint(kernel_name, node)) { | |||||
| continue; | continue; | ||||
| } | } | ||||
| } | } | ||||
| @@ -33,6 +33,7 @@ using debugger::GraphProto; | |||||
| using debugger::ModelProto; | using debugger::ModelProto; | ||||
| using debugger::TensorProto; | using debugger::TensorProto; | ||||
| using debugger::WatchCondition; | using debugger::WatchCondition; | ||||
| using debugger::WatchCondition_Parameter; | |||||
| using debugger::WatchNode; | using debugger::WatchNode; | ||||
| using debugger::WatchpointHit; | using debugger::WatchpointHit; | ||||
| @@ -73,9 +74,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||||
| // don't need a graph_ptr because it is saved during pre_execute | // don't need a graph_ptr because it is saved during pre_execute | ||||
| void PostExecute(); | void PostExecute(); | ||||
| bool ReadNodeDataRequired(); | |||||
| bool ReadNodeDataRequired(const CNodePtr &kernel); | |||||
| void PostExecuteNode(); | |||||
| void PostExecuteNode(const CNodePtr &kernel); | |||||
| // suspend the execution after a debug_op | // suspend the execution after a debug_op | ||||
| void PostDebugOp(); | void PostDebugOp(); | ||||
| @@ -148,7 +149,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||||
| void CommandLoop(); | void CommandLoop(); | ||||
| // set what nodes and conditions to watch | // set what nodes and conditions to watch | ||||
| void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id); | |||||
| void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id, | |||||
| const ProtoVector<WatchCondition_Parameter> &parameters); | |||||
| // remove watchpoint with id | // remove watchpoint with id | ||||
| void RemoveWatchpoint(const int32_t id); | void RemoveWatchpoint(const int32_t id); | ||||
| @@ -161,7 +163,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||||
| // analyze tensors and check watchpoint conditions | // analyze tensors and check watchpoint conditions | ||||
| // return names of tensors and what condition they hit | // return names of tensors and what condition they hit | ||||
| std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string()); | |||||
| std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string(), | |||||
| const CNodePtr &kernel = NULL); | |||||
| // send watchpoints that hit | // send watchpoints that hit | ||||
| void SendWatchpoints(const std::list<WatchpointHit> &points); | void SendWatchpoints(const std::list<WatchpointHit> &points); | ||||
| @@ -192,6 +195,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> { | |||||
| std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname_; | std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname_; | ||||
| double last_overflow_bin_; | double last_overflow_bin_; | ||||
| std::string overflow_bin_path_; | std::string overflow_bin_path_; | ||||
| // flag to keep track of the very first suspension of debugger | |||||
| bool initial_suspend_; | |||||
| // singleton | // singleton | ||||
| static std::mutex instance_lock_; | static std::mutex instance_lock_; | ||||
| static std::shared_ptr<Debugger> debugger_; | static std::shared_ptr<Debugger> debugger_; | ||||
| @@ -210,6 +215,7 @@ DataType GetDebuggerNumberDataType(const TypePtr &type); | |||||
| DebuggerCommand GetCommand(const EventReply &reply); | DebuggerCommand GetCommand(const EventReply &reply); | ||||
| // parse other data out of EventReply | // parse other data out of EventReply | ||||
| ProtoVector<WatchCondition_Parameter> GetParameters(const EventReply &reply); | |||||
| ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply); | ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply); | ||||
| std::string GetNodeName(const EventReply &reply); | std::string GetNodeName(const EventReply &reply); | ||||
| std::string GetRunLevel(const EventReply &reply); | std::string GetRunLevel(const EventReply &reply); | ||||
| @@ -56,6 +56,8 @@ class TensorLoader { | |||||
| std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; } | std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; } | ||||
| std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) { return tensor_list_map[tensor_name + ":prev"]; } | |||||
| std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) { | std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) { | ||||
| std::vector<std::shared_ptr<TensorData>> tensors; | std::vector<std::shared_ptr<TensorData>> tensors; | ||||
| for (auto itr = node_tensor_map.begin(); itr != node_tensor_map.end(); itr++) { | for (auto itr = node_tensor_map.begin(); itr != node_tensor_map.end(); itr++) { | ||||
| @@ -113,7 +113,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, | |||||
| read_data = true; | read_data = true; | ||||
| } | } | ||||
| } else if (debugger->debugger_enabled()) { | } else if (debugger->debugger_enabled()) { | ||||
| read_data = debugger->ReadNodeDataRequired(); | |||||
| read_data = debugger->ReadNodeDataRequired(kernel); | |||||
| } | } | ||||
| if (!read_data) { | if (!read_data) { | ||||
| return; | return; | ||||
| @@ -168,7 +168,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| debugger->PostExecuteNode(); | |||||
| debugger->PostExecuteNode(kernel); | |||||
| } | } | ||||
| } // namespace | } // namespace | ||||