From: @harsh1995 (tag v1.1.0)
@@ -12,6 +12,7 @@ if (ENABLE_DEBUGGER)
     "${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger.cc"
     "${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
     "${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc"
     "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
     )
 endif (ENABLE_DEBUGGER)
@@ -17,6 +17,8 @@
 #include <map>
 #include "backend/session/anf_runtime_algorithm.h"
 #include "debug/debug_services.h"
+#include "debug/debugger/tensor_summary.h"
 namespace mindspore {
 DebugServices::DebugServices() {
@@ -49,9 +51,6 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
   watchpoint_item.id = id;
   watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
   watchpoint_item.condition.parameter = parameter;
-  if (watch_condition > 2 && watch_condition < 13)
-    // odd indices are greater than conditions and even indices are less than
-    watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT";
   watchpoint_item.check_node_list = check_node_list;
   watchpoint_item.parameter_list = parameter_list;
   watchpoint_table[id] = watchpoint_item;
@@ -62,77 +61,14 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
   watchpoint_table.erase(id);
 }
-template <typename T>
-DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, const T *start_prev, unsigned int n,
-                                                           bool need_min_max, bool need_mean_sd,
-                                                           bool need_zero_percentage,
-                                                           bool need_tensor_update_ratio_mean, bool need_allclose,
-                                                           bool need_abs_mean) {
-  tensor_stats stats;
-  double zero_count = 0.0;
-  double rtol = 1.0e-5;
-  double atol = 1.0e-8;
-  double update_ratio_sum = 0.0;
-  double epsilon = 1.0e-9;
-  for (unsigned int i = 0; i < n; ++i) {
-    auto val = static_cast<double>(start[i]);
-    double val_prev = 0.0;
-    if (start_prev) {
-      val_prev = static_cast<double>(start_prev[i]);
-    }
-    stats.has_nan = stats.has_nan || std::isnan(val);
-    stats.has_inf = stats.has_inf || std::isinf(val);
-    if (stats.has_inf && stats.has_nan) {
-      // other statistics don't make sense in this case
-      break;
-    }
-    if (need_min_max) {
-      stats.min = std::min(stats.min, val);
-      stats.max = std::max(stats.max, val);
-    }
-    if (need_mean_sd) {
-      double delta = val - stats.mean;
-      stats.mean += delta / (i + 1);
-      stats.m2 += delta * (val - stats.mean);
-    }
-    if (need_abs_mean) {
-      double delta = std::abs(val) - stats.abs_mean;
-      stats.abs_mean += delta / (i + 1);
-    }
-    if (need_zero_percentage) {
-      if (val == 0) zero_count++;
-    }
-    if (need_tensor_update_ratio_mean && start_prev) {
-      update_ratio_sum += (std::abs(val - val_prev) / (epsilon + std::abs(val_prev)));
-    }
-    if (need_allclose && start_prev) {
-      stats.allclose &= (std::abs(val - val_prev) <= (atol + rtol * std::abs(val_prev)));
-    }
-  }
-  if (need_tensor_update_ratio_mean && start_prev) {
-    stats.tensor_update_ratio_mean = (update_ratio_sum / n);
-  }
-  stats.zero_percentage = (zero_count / n) * 100;
-  stats.n = n;
-  return stats;
-}
 void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
                                      std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
                                      std::vector<std::vector<parameter_t>> *parameters,
-                                     const std::vector<std::string> &op_overflows,
+                                     std::vector<int32_t> *error_codes, const std::vector<std::string> &op_overflows,
                                      const std::vector<std::shared_ptr<TensorData>> &tensor_list,
                                      const bool init_dbg_suspend) {
   std::lock_guard<std::mutex> lg(lock_);
-  if (watchpoint_table.empty()) {
-    return;
-  }
+  if (watchpoint_table.empty()) return;
   for (const auto &tensor : tensor_list) {
     const auto tensor_name = tensor->GetName();
@@ -140,268 +76,113 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
     const auto tensor_slot = std::to_string(tensor->GetSlot());
     mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
     int tensor_dtype = tensor_ptr->data_type_c();
-    std::vector<unsigned int> hit_encountered;
-    std::vector<std::vector<bool>> hit_parms;
-    std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
-    bool min_max_enabled = false;
-    bool mean_sd_enabled = false;
-    bool inf_nan_enabled = false;
-    bool zero_percentage_enabled = false;
-    bool tensor_update_ratio_mean_enabled = false;
-    bool allclose_enabled = false;
-    bool abs_mean_enabled = false;
+    std::vector<watchpoint_t> watchpoints_to_check;
+    std::string qualified_tensor_name;
     for (auto w_table_item : watchpoint_table) {
       auto wp = std::get<1>(w_table_item);
       if (wp.condition.type == INIT && !init_dbg_suspend) continue;
       if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue;
-      if (wp.IsNodeIncluded(tensor_name_no_slot)) {
-        min_max_enabled |= wp.min_max_enabled();
-        mean_sd_enabled |= wp.mean_sd_enabled();
-        inf_nan_enabled |= wp.inf_nan_enabled();
-        zero_percentage_enabled |= wp.zero_percentage_enabled();
-        tensor_update_ratio_mean_enabled |= wp.tensor_update_ratio_mean_enabled();
-        allclose_enabled |= wp.allclose_enabled();
-        abs_mean_enabled |= wp.abs_mean_enabled();
-        watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
+      std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
+      if (!found.empty()) {
+        qualified_tensor_name = found;
+        watchpoints_to_check.push_back(w_table_item.second);
       }
     }
-    tensor_stats stats;
-    uint num_elements = tensor_ptr->DataSize();
-    if (min_max_enabled || mean_sd_enabled || inf_nan_enabled || zero_percentage_enabled ||
-        tensor_update_ratio_mean_enabled || allclose_enabled || abs_mean_enabled) {
-      bool need_prev = (tensor_update_ratio_mean_enabled || allclose_enabled);
-      bool have_prev = tensor_loader_->GetPrevTensor(tensor_name) != NULL;
+    // no wp set on current tensor
+    if (watchpoints_to_check.empty()) continue;
+    uint32_t num_elements = tensor_ptr->DataSize();
+    void *previous_tensor_ptr = tensor_loader_->GetPrevTensor(tensor_name)
+                                  ? tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()
+                                  : nullptr;
+    std::unique_ptr<ITensorSummary> base_summary_ptr;
+    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
       switch (tensor_dtype) {
         case kNumberTypeUInt8: {
-          auto start_addr = reinterpret_cast<uint8_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<uint8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<uint8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeInt8: {
-          auto start_addr = reinterpret_cast<int8_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<int8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<int8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeUInt16: {
-          auto start_addr = reinterpret_cast<uint16_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<uint16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<uint16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
          break;
         }
         case kNumberTypeInt16: {
-          auto start_addr = reinterpret_cast<int16_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<int16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<int16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeUInt32: {
-          auto start_addr = reinterpret_cast<uint32_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<uint32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<uint32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeInt32:
         case kNumberTypeInt: {
-          auto start_addr = reinterpret_cast<int32_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<int32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<int32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeUInt64: {
-          auto start_addr = reinterpret_cast<uint64_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<uint64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<uint64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeInt64: {
-          auto start_addr = reinterpret_cast<int64_t *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<int64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<int64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeFloat16: {
-          auto start_addr = reinterpret_cast<float16 *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<float16 *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<float16>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeFloat32:
         case kNumberTypeFloat: {
-          auto start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<float *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<float>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         case kNumberTypeFloat64: {
-          auto start_addr = reinterpret_cast<double *>(tensor_ptr->data_c());
-          auto start_addr_prev =
-            (need_prev && have_prev
-               ? reinterpret_cast<double *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
-               : NULL);
-          stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
-                                  zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
-                                  abs_mean_enabled);
+          base_summary_ptr =
+            std::make_unique<TensorSummary<double>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
           break;
         }
         default:
           MS_LOG(INFO) << "Unsupported tensor type";
           break;
       }
+      base_summary_ptr->SummarizeTensor(watchpoints_to_check);
     }
-    for (auto &it : watchpoints_to_check_table) {
-      auto wp_id = it.second.id;
-      std::vector<bool> hit_p;
-      CONDITION_TYPE enabled_condition = it.second.condition.type;
-      bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) ||
-                 (enabled_condition == GENERAL_OVERFLOW && (stats.has_nan || stats.has_inf)) ||
-                 (enabled_condition == IS_OVERFLOW &&
-                  std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
-      if (enabled_condition > 2 && enabled_condition != GENERAL_OVERFLOW) {
-        if (stats.has_inf || stats.has_nan) {
-          MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check "
-                          << condition_label[enabled_condition] << " watchpoint.";
-        } else if (enabled_condition < 13) {
-          bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter;
-          bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter;
-          hit |= it.second.condition.comparison == "GT" ? gt : lt;
-        } else {
-          std::vector<parameter_t> parameter_list_item = it.second.parameter_list;
-          for (auto &p : parameter_list_item) {
-            if (p.disabled == false) {
-              bool p_hit = false;
-              if (p.name == "zero_percentage_ge") {
-                p_hit = stats.parmLookup(STAT_ZERO_PERCENTAGE) >= p.value;
-              } else if (p.name == "max_gt") {
-                p_hit = stats.parmLookup(STAT_MAX) > p.value;
-              } else if (p.name == "max_lt") {
-                p_hit = stats.parmLookup(STAT_MAX) < p.value;
-              } else if (p.name == "min_gt") {
-                p_hit = stats.parmLookup(STAT_MIN) > p.value;
-              } else if (p.name == "min_lt") {
-                p_hit = stats.parmLookup(STAT_MIN) < p.value;
-              } else if (p.name == "mean_gt") {
-                p_hit = stats.parmLookup(STAT_MEAN) > p.value;
-              } else if (p.name == "mean_lt") {
-                p_hit = stats.parmLookup(STAT_MEAN) < p.value;
-              } else if (p.name == "abs_mean_gt") {
-                p_hit = stats.parmLookup(STAT_ABS_MEAN) > p.value;
-              } else if (p.name == "abs_mean_lt") {
-                p_hit = stats.parmLookup(STAT_ABS_MEAN) < p.value;
-              } else if (p.name == "abs_update_ratio_mean_gt") {
-                p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) > p.value;
-              } else if (p.name == "abs_update_ratio_mean_lt") {
-                p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) < p.value;
-              }
-              hit |= p_hit;
-              hit_p.push_back(p_hit);
-            } else {
-              hit_p.push_back(false);
-            }
-          }
-          hit |= (enabled_condition == NOT_CHANGED && stats.parmLookup(STAT_ALLCLOSE));
-          if (hit) hit_parms.push_back(hit_p);
-        }
+    for (auto &wp : watchpoints_to_check) {
+      bool is_hit = false;
+      int error_code = 0;
+      std::vector<parameter_t> parameter_list = {};
+      if (wp.condition.type == IS_OVERFLOW) {
+        is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
+      } else {
+        auto item = base_summary_ptr->IsWatchpointHit(wp);
+        is_hit = std::get<0>(item);
+        error_code = std::get<1>(item);
+        parameter_list = std::get<2>(item);
      }
-      if (hit) hit_encountered.push_back(wp_id);
-    }
-    unsigned int index_parm_list = 0;
-    for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
-      if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
-        // return fully qualified name for weights and bias to MI
-        auto found_dot = tensor_name_no_slot.find_last_of('.');
-        if (found_dot != std::string::npos && (tensor_name_no_slot.substr(found_dot + 1) == "weight" ||
-                                               tensor_name_no_slot.substr(found_dot + 1) == "bias")) {
-          auto check_node_list = watchpoint_table.find(*it_hit_id)->second.check_node_list;
-          bool found_match = false;
-          for (auto check_node : check_node_list) {
-            std::string w_name = std::get<0>(check_node);
-            auto found_slash = w_name.find_last_of('/');
-            if (found_slash != std::string::npos && w_name.substr(found_slash + 1) == tensor_name_no_slot) {
-              name->push_back(w_name);
-              found_match = true;
-              break;
-            }
-          }
-          if (!found_match) {
-            name->push_back(tensor_name_no_slot);
-          }
-        } else {
-          name->push_back(tensor_name_no_slot);
-        }
+      if (is_hit || error_code) {
+        name->push_back(qualified_tensor_name);
         slot->push_back(tensor_slot);
-        int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type;
-        condition->push_back(condition_item);
-        watchpoint_id->push_back(*it_hit_id);
-        std::vector<parameter_t> parameter_list_item = watchpoint_table.find(*it_hit_id)->second.parameter_list;
-        if (condition_item >= 13) {
-          unsigned int index_hit_parm = 0;
-          for (auto &p : parameter_list_item) {
-            p.hit = hit_parms[index_parm_list][index_hit_parm];
-            index_hit_parm++;
-          }
-          index_parm_list++;
-        }
-        parameters->push_back(parameter_list_item);
+        condition->push_back(wp.condition.type);
+        watchpoint_id->push_back(wp.id);
+        parameters->push_back(parameter_list);
+        error_codes->push_back(error_code);
      }
-      watchpoints_to_check_table.erase(*it_hit_id);
    }
  }
 }
@@ -23,6 +23,7 @@
 #include <tuple>
 #include <unordered_map>
 #include <mutex>
+#include <map>
 #include <limits>
 #include "debug/tensor_load.h"
 #include "debug/tensor_data.h"
| @@ -60,23 +61,13 @@ class DebugServices { | |||||
| ALL_ZERO, | ALL_ZERO, | ||||
| CHANGE_TOO_LARGE, | CHANGE_TOO_LARGE, | ||||
| CHANGE_TOO_SMALL, | CHANGE_TOO_SMALL, | ||||
| NOT_CHANGED | |||||
| }; | |||||
| enum STAT_TYPE { | |||||
| STAT_MIN, | |||||
| STAT_MAX, | |||||
| STAT_MEAN, | |||||
| STAT_ZERO_PERCENTAGE, | |||||
| STAT_TENSOR_UPDATE_RATIO_MEAN, | |||||
| STAT_ALLCLOSE, | |||||
| STAT_ABS_MEAN | |||||
| NOT_CHANGED, | |||||
| RANGE | |||||
| }; | }; | ||||
| typedef struct condition { | typedef struct condition { | ||||
| CONDITION_TYPE type; | CONDITION_TYPE type; | ||||
| float parameter = 0; | float parameter = 0; | ||||
| std::string comparison; | |||||
| } condition_t; | } condition_t; | ||||
   typedef struct parameter {
@@ -84,6 +75,25 @@ class DebugServices {
     bool disabled;
     double_t value;
     bool hit;
+    double_t actual_value;
+    void Evaluate(double_t actualValue, std::string inequality_type) {
+      if (std::isnan(actualValue)) return;
+      actual_value = actualValue;
+      if (inequality_type.empty()) {
+        auto pos = name.find_last_of('_');
+        if (pos != std::string::npos) {
+          inequality_type = name.substr(pos + 1);
+        }
+      }
+      std::map<std::string, bool> condition_check{{"gt", actual_value > value},
+                                                  {"lt", actual_value < value},
+                                                  {"ge", actual_value >= value},
+                                                  {"le", actual_value <= value}};
+      hit = condition_check[inequality_type];
+    }
   } parameter_t;
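Reviewer note: when no explicit inequality is passed (watchpoints outside the simple GT/LT family), Evaluate derives the comparison from the suffix of the parameter name. A small illustration with a made-up parameter (sketch only, not part of the patch):

  // Assumes the surrounding DebugServices::parameter_t definition; the name
  // and values below are illustrative only.
  DebugServices::parameter_t p;
  p.name = "zero_percentage_ge";  // suffix "ge" selects the >= comparison
  p.disabled = false;
  p.value = 50.0;
  p.hit = false;
  p.Evaluate(/*actualValue=*/75.0, /*inequality_type=*/"");
  // p.actual_value == 75.0 and p.hit == true, since 75.0 >= 50.0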
   typedef struct watchpoint {
@@ -93,18 +103,28 @@ class DebugServices {
     std::vector<parameter_t> parameter_list;
     size_t location = 0;
-    bool IsNodeIncluded(const std::string &tensor_name) {
+    std::string FindQualifiedTensorName(const std::string &tensor_name) {
       std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':'));
       for (auto check_node : check_node_list) {
         std::string w_name = std::get<0>(check_node);
         bool w_type = std::get<1>(check_node);
         auto found = w_name.find_last_of('/');
-        if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true;
+        if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return w_name;
         if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) {
-          return true;
+          return w_name;
         }
       }
-      return false;
+      return {};
+    }
+    bool is_gt_wp() {
+      return condition.type == MAX_GT || condition.type == MIN_GT || condition.type == MEAN_GT ||
+             condition.type == SD_GT || condition.type == MAX_MIN_GT;
+    }
+    bool is_lt_wp() {
+      return condition.type == MAX_LT || condition.type == MIN_LT || condition.type == MEAN_LT ||
+             condition.type == SD_LT || condition.type == MAX_MIN_LT;
     }
     bool min_max_enabled() {
@@ -119,67 +139,26 @@ class DebugServices {
       return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW;
     }
     // mean or sd related condition set
-    bool mean_sd_enabled() {
+    bool mean_sd_enabled() const {
       return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
              condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) ||
              (condition.type == TOO_SMALL && !parameter_list[3].disabled);
     }
-    bool abs_mean_enabled() {
+    bool abs_mean_enabled() const {
       return (condition.type == TOO_LARGE && !parameter_list[0].disabled) ||
              (condition.type == TOO_SMALL && !parameter_list[0].disabled);
     }
     bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; }
-    bool tensor_update_ratio_mean_enabled() {
-      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
-    }
-    bool allclose_enabled() { return condition.type == NOT_CHANGED; }
-  } watchpoint_t;
-  struct tensor_stats {
-    double min = std::numeric_limits<double>::max();
-    double max = std::numeric_limits<double>::lowest();
-    bool has_inf = false;
-    bool has_nan = false;
-    unsigned int n = 0;
-    double mean = 0.0;
-    double m2 = 0.0;
-    double zero_percentage = 0.0;
-    double tensor_update_ratio_mean = -1;
-    bool allclose = false;
-    double abs_mean = 0.0;
-    double statLookup(CONDITION_TYPE type) const {
-      if (type == MAX_GT || type == MAX_LT) return max;
-      if (type == MIN_GT || type == MIN_LT) return min;
-      if (type == MAX_MIN_GT || type == MAX_MIN_LT) return (max - min);
-      if (type == MEAN_GT || type == MEAN_LT) return mean;
-      if (type == SD_GT || type == SD_LT) return getStandardDeviation();
-      return std::numeric_limits<double>::quiet_NaN();
+    bool tensor_update_ratio_mean_enabled() const {
+      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
     }
+    bool allclose_enabled() const { return condition.type == NOT_CHANGED; }
-    double parmLookup(STAT_TYPE type) const {
-      if (type == STAT_MAX) return max;
-      if (type == STAT_MIN) return min;
-      if (type == STAT_MEAN) return mean;
-      if (type == STAT_ZERO_PERCENTAGE) return zero_percentage;
-      if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean;
-      if (type == STAT_ALLCLOSE) return allclose;
-      if (type == STAT_ABS_MEAN) return abs_mean;
-      return std::numeric_limits<double>::quiet_NaN();
+    bool range_enabled() const {
+      return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
     }
-    double getMean() const { return mean; }
-    double getVariance() const {
-      if (n > 1) {
-        return m2 / (n - 1);
-      } else {
-        return 0.0;
-      }
-    }
-    double getStandardDeviation() const { return sqrt(getVariance()); }
-  };
+  } watchpoint_t;
   void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
                      const std::vector<std::tuple<std::string, bool>> &check_node_list,
@@ -189,7 +168,7 @@ class DebugServices {
   void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                         std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
-                        const std::vector<std::string> &op_overflows,
+                        std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
                         const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend);
   void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
@@ -210,19 +189,8 @@ class DebugServices {
   std::mutex lock_;
   std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
-  std::vector<std::string> condition_label = {
-    "HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT",
-    "MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT",
-    "MEAN_LT", "SD_GT", "SD_LT", "GENERAL_OVERFLOW", "INIT",
-    "TOO_LARGE", "TOO_SMALL", "ALL_ZERO", "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL",
-    "NOT_CHANGED"};
   TensorLoader *tensor_loader_;
-  template <typename T>
-  static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max,
-                                      bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean,
-                                      bool need_allclose, bool need_abs_mean_sd);
 };
 }  // namespace mindspore
@@ -45,8 +45,8 @@ message Metadata {
 }
 message Chunk {
-  bytes buffer = 1;
-  bool finished = 2;
+  bytes buffer = 1;
+  bool finished = 2;
 }
 message EventReply {
@@ -111,6 +111,7 @@ message WatchCondition {
     tensor_change_too_large = 18;
     tensor_change_too_small = 19;
     tensor_not_changed = 20;
+    tensor_range = 21;
   }
   Condition condition = 1;
   float value = 2;
@@ -119,6 +120,7 @@ message WatchCondition {
     bool disabled = 2;
     double value = 3;
     bool hit = 4; // Whether this parameter is hit when checking tensor.
+    double actual_value = 5;
   }
   repeated Parameter params = 4;
 }
@@ -132,4 +134,5 @@ message WatchpointHit {
   TensorProto tensor = 1;
   WatchCondition watch_condition = 2;
   int32 id = 3;
+  int32 error_code = 4;
 }
@@ -790,6 +790,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
   std::vector<unsigned int> watchpoint_id;
   std::vector<std::string> overflow_ops;
   std::vector<std::vector<DebugServices::parameter_t>> parameters;
+  std::vector<int32_t> error_codes;
 #ifdef ENABLE_D
   overflow_ops = CheckOpOverflow();
 #endif
@@ -801,14 +802,14 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
     tensor_list = tensor_loader->GetNodeTensorMap(watchnode);
     debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
   }
-  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, overflow_ops, tensor_list,
-                                    initial_suspend_);
+  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
+                                    tensor_list, initial_suspend_);
   std::list<WatchpointHit> hits;
   for (unsigned int i = 0; i < name.size(); i++) {
     WatchpointHit hit;
     std::vector<DebugServices::parameter_t> &parameter = parameters[i];
     hit.set_id(watchpoint_id[i]);
+    hit.set_error_code(error_codes[i]);
     // here TensorProto act as a tensor indicator, not sending tensor content
     TensorProto *tensor_item = hit.mutable_tensor();
     tensor_item->set_node_name(name[i]);
@@ -823,6 +824,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
       x->set_disabled(p.disabled);
       x->set_value(p.value);
       x->set_hit(p.hit);
+      x->set_actual_value(p.actual_value);
     }
     hits.push_back(hit);
   }
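Reviewer note: error_code is built in TensorSummary<T>::IsWatchpointHit as a 32-bit bitset (bit 0 set when NaN values were seen, bit 1 when Inf values were seen) and is forwarded to the frontend through the new WatchpointHit.error_code field. A minimal decoding sketch for the receiving side (illustrative only, not part of the patch):

  #include <bitset>
  #include <cstdint>

  // Bit layout taken from TensorSummary<T>::IsWatchpointHit in this patch.
  inline bool HasNanError(int32_t error_code) { return std::bitset<32>(error_code).test(0); }
  inline bool HasInfError(int32_t error_code) { return std::bitset<32>(error_code).test(1); }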
@@ -0,0 +1,268 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <math.h>
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <bitset>
+#include <tuple>
+#include "debug/debugger/tensor_summary.h"
+namespace mindspore {
+using CONDITION_TYPE = DebugServices::CONDITION_TYPE;
+RangeCountCalculator::RangeCountCalculator()
+    : range_start_inclusive(-std::numeric_limits<double>::infinity()),
+      range_end_inclusive(std::numeric_limits<double>::infinity()),
+      count(0),
+      total(0) {}
+void RangeCountCalculator::ProcessElement(double element) {
+  count += (element >= range_start_inclusive && element <= range_end_inclusive);
+  total += 1;
+}
+double RangeCountCalculator::GetPercentInRange() {
+  if (total == 0) {
+    return 0.0;
+  }
+  return 100.0 * count / total;
+}
+AllCloseCalculator::AllCloseCalculator() : atol(1.0e-8), rtol(1.0e-5), result(true) {}
+void AllCloseCalculator::ProcessElement(double current, double previous) {
+  result &= (std::abs(current - previous) <= (atol + rtol * std::abs(previous)));
+}
+bool AllCloseCalculator::IsAllClose() { return result; }
+MeanCalculator::MeanCalculator() : mean(0.0), count(0) {}
+void MeanCalculator::ProcessElement(double value) {
+  count += 1;
+  double delta = value - mean;
+  mean += delta / count;
+}
+double MeanCalculator::GetMean() { return mean; }
+VarianceAndMeanCalculator::VarianceAndMeanCalculator() : mean(0.0), count(0), m2(0.0) {}
+void VarianceAndMeanCalculator::ProcessElement(double value) {
+  count += 1;
+  double delta = value - mean;
+  mean += delta / count;
+  m2 += delta * (value - mean);
+}
+double VarianceAndMeanCalculator::GetMean() { return mean; }
+double VarianceAndMeanCalculator::GetVariance() {
+  if (count > 1) {
+    return m2 / (count - 1);
+  } else {
+    return 0.0;
+  }
+}
+double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVariance()); }
+template <typename T>
+TensorSummary<T>::TensorSummary(void *current_tensor_ptr, void *previous_tensor_ptr, uint32_t num_elements)
+    : current_tensor_ptr(reinterpret_cast<T *>(current_tensor_ptr)),
+      prev_tensor_ptr(reinterpret_cast<T *>(previous_tensor_ptr)),
+      num_elements(num_elements),
+      min(std::numeric_limits<double>::max()),
+      max(std::numeric_limits<double>::lowest()),
+      inf_count(0),
+      nan_count(0),
+      zero_count(0),
+      epsilon(1.0e-9),
+      mean_sd_cal_enabled(false) {}
+template <typename T>
+void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &wps) {
+  InitCalculators(wps);
+  for (size_t i = 0; i < num_elements; ++i) {
+    auto current_value = static_cast<double>(current_tensor_ptr[i]);
+    double previous_value =
+      prev_tensor_ptr ? static_cast<double>(prev_tensor_ptr[i]) : std::numeric_limits<double>::quiet_NaN();
+    inf_count += std::isinf(current_value);
+    nan_count += std::isnan(current_value);
+    zero_count += (current_value == 0);
+    max = std::max(max, current_value);
+    min = std::min(min, current_value);
+    if (mean_sd_cal_enabled) {
+      current_mean_variance.ProcessElement(current_value);
+    }
+    for (auto &it : all_close) {
+      it.second->ProcessElement(current_value, previous_value);
+    }
+    for (auto &range_count : range_counts) {
+      range_count.second->ProcessElement(current_value);
+    }
+    for (auto &mean : means) {
+      if (mean.first == "curr_prev_diff_mean") {
+        mean.second->ProcessElement(std::abs(current_value - previous_value));
+      } else if (mean.first == "abs_prev_mean") {
+        mean.second->ProcessElement(std::abs(previous_value));
+      } else if (mean.first == "abs_current_mean") {
+        mean.second->ProcessElement(std::abs(current_value));
+      }
+    }
+  }
+}
+template <typename T>
+std::tuple<bool, int, std::vector<DebugServices::parameter_t>> TensorSummary<T>::IsWatchpointHit(
+  DebugServices::watchpoint_t wp) {
+  auto parameter_list = wp.parameter_list;
+  bool hit = false;
+  std::bitset<32> error_code;
+  CONDITION_TYPE type = wp.condition.type;
+  error_code.set(0, nan_count > 0);
+  error_code.set(1, inf_count > 0);
+  if (type == CONDITION_TYPE::HAS_NAN) {
+    error_code.reset();
+    hit = nan_count > 0;
+  } else if (type == CONDITION_TYPE::HAS_INF) {
+    error_code.reset();
+    hit = inf_count > 0;
+  } else if (type == CONDITION_TYPE::GENERAL_OVERFLOW) {
+    error_code.reset();
+    hit = (nan_count + inf_count) > 0;
+  } else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr && error_code.none()) {
+    hit = all_close[wp.id]->IsAllClose();
+  }
+  for (auto &parameter : parameter_list) {
+    if (parameter.disabled || error_code.any()) {
+      continue;
+    }
+    std::string inequality_type;
+    if (wp.is_gt_wp()) {
+      inequality_type = "gt";
+    } else if (wp.is_lt_wp()) {
+      inequality_type = "lt";
+    }
+    parameter.Evaluate(StatLookup(parameter.name, wp), inequality_type);
+    hit |= parameter.hit;
+  }
+  return std::make_tuple(hit, static_cast<int32_t>(error_code.to_ulong()), parameter_list);
+}
+template <typename T>
+double_t TensorSummary<T>::StatLookup(const std::string &parameter_name, const DebugServices::watchpoint_t &wp) {
+  if (parameter_name == "param") return StatLookup(wp);
+  std::string param_type;
+  auto pos = parameter_name.find_last_of('_');
+  if (pos != std::string::npos) {
+    param_type = parameter_name.substr(0, pos);
+  }
+  if (param_type == "max") {
+    return max;
+  } else if (param_type == "min") {
+    return min;
+  } else if (param_type == "max_min") {
+    return max - min;
+  } else if (param_type == "mean") {
+    return current_mean_variance.GetMean();
+  } else if (param_type == "sd") {
+    return current_mean_variance.GetStandardDeviation();
+  } else if (param_type == "abs_mean") {
+    return means["abs_current_mean"]->GetMean();
+  } else if (param_type == "abs_mean_update_ratio") {
+    return means["curr_prev_diff_mean"]->GetMean() / (means["abs_prev_mean"]->GetMean() + epsilon);
+  } else if (param_type == "range_percentage") {
+    return range_counts[wp.id]->GetPercentInRange();
+  } else if (param_type == "zero_percentage") {
+    return GetZeroValPercent();
+  }
+  return std::numeric_limits<double_t>::quiet_NaN();
+}
+template <typename T>
+double_t TensorSummary<T>::StatLookup(const DebugServices::watchpoint_t &wp) {
+  CONDITION_TYPE type = wp.condition.type;
+  if (type == CONDITION_TYPE::MAX_LT || type == CONDITION_TYPE::MAX_GT) {
+    return max;
+  } else if (type == CONDITION_TYPE::MIN_LT || type == CONDITION_TYPE::MIN_GT) {
+    return min;
+  } else if (type == CONDITION_TYPE::MEAN_LT || type == CONDITION_TYPE::MEAN_GT) {
+    return current_mean_variance.GetMean();
+  } else if (type == CONDITION_TYPE::SD_LT || type == CONDITION_TYPE::SD_GT) {
+    return current_mean_variance.GetStandardDeviation();
+  } else if (type == CONDITION_TYPE::MAX_MIN_GT || type == CONDITION_TYPE::MAX_MIN_LT) {
+    return max - min;
+  }
+  return std::numeric_limits<double_t>::quiet_NaN();
+}
+template <typename T>
+double_t TensorSummary<T>::GetZeroValPercent() {
+  if (num_elements == 0) {
+    return 0;
+  }
+  return (zero_count * 100.0) / num_elements;
+}
+template <typename T>
+void TensorSummary<T>::InitCalculators(const std::vector<DebugServices::watchpoint_t> &wps) {
+  for (auto &wp : wps) {
+    auto wp_id = wp.id;
+    mean_sd_cal_enabled |= wp.mean_sd_enabled();
+    if (wp.allclose_enabled() && prev_tensor_ptr) {
+      all_close[wp_id] = std::make_unique<AllCloseCalculator>();
+      if (!wp.parameter_list[0].disabled) {
+        all_close[wp_id]->set_atol(wp.parameter_list[0].value);
+      }
+      if (!wp.parameter_list[1].disabled) {
+        all_close[wp_id]->set_rtol(wp.parameter_list[1].value);
+      }
+    } else if (wp.range_enabled()) {
+      range_counts[wp_id] = std::make_unique<RangeCountCalculator>();
+      if (!wp.parameter_list[0].disabled) {
+        range_counts[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value);
+      }
+      if (!wp.parameter_list[1].disabled) {
+        range_counts[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
+      }
+    } else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr) {
+      means.insert({"curr_prev_diff_mean", std::make_unique<MeanCalculator>()});
+      means.insert({"abs_prev_mean", std::make_unique<MeanCalculator>()});
+    } else if (wp.abs_mean_enabled()) {
+      means.insert({"abs_current_mean", std::make_unique<MeanCalculator>()});
+    }
+  }
+}
+template class TensorSummary<uint8_t>;
+template class TensorSummary<int8_t>;
+template class TensorSummary<uint16_t>;
+template class TensorSummary<int16_t>;
+template class TensorSummary<uint32_t>;
+template class TensorSummary<int32_t>;
+template class TensorSummary<uint64_t>;
+template class TensorSummary<int64_t>;
+template class TensorSummary<float16>;
+template class TensorSummary<float>;
+template class TensorSummary<double>;
+}  // namespace mindspore
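Reviewer note: MeanCalculator and VarianceAndMeanCalculator keep the same single-pass Welford update that the removed DebugServices::SummarizeTensor used inline, so mean and standard deviation stay numerically stable without a second pass over the tensor. A standalone sketch of the recurrence, for reference only (not part of the patch):

  #include <cmath>
  #include <cstdio>

  // Welford's online algorithm: mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k,
  // m2_k = m2_{k-1} + (x_k - mean_{k-1}) * (x_k - mean_k); sample variance = m2 / (n - 1).
  struct WelfordSketch {
    double mean = 0.0, m2 = 0.0;
    int count = 0;
    void Add(double x) {
      ++count;
      double delta = x - mean;
      mean += delta / count;
      m2 += delta * (x - mean);
    }
    double SampleVariance() const { return count > 1 ? m2 / (count - 1) : 0.0; }
  };

  int main() {
    WelfordSketch w;
    for (double x : {1.0, 2.0, 3.0, 4.0}) w.Add(x);
    // Expected: mean = 2.5, sample sd = sqrt(5/3), roughly 1.29
    std::printf("mean=%.2f sd=%.2f\n", w.mean, std::sqrt(w.SampleVariance()));
    return 0;
  }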
@@ -0,0 +1,120 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_TENSOR_SUMMARY_H
+#define MINDSPORE_TENSOR_SUMMARY_H
+#include <vector>
+#include <unordered_map>
+#include <tuple>
+#include <memory>
+#include <string>
+#include "debug/debug_services.h"
+namespace mindspore {
+class RangeCountCalculator {
+ public:
+  RangeCountCalculator();
+  void ProcessElement(double element);
+  double GetPercentInRange();
+  void set_range_start_inclusive(double value) { range_start_inclusive = value; }
+  void set_range_end_inclusive(double value) { range_end_inclusive = value; }
+ private:
+  double range_start_inclusive;
+  double range_end_inclusive;
+  int count;
+  int total;
+};
+class AllCloseCalculator {
+ public:
+  AllCloseCalculator();
+  void ProcessElement(double current, double previous);
+  bool IsAllClose();
+  void set_atol(double value) { atol = value; }
+  void set_rtol(double value) { rtol = value; }
+ private:
+  double atol;
+  double rtol;
+  bool result;
+};
+class MeanCalculator {
+ public:
+  MeanCalculator();
+  void ProcessElement(double value);
+  double GetMean();
+ protected:
+  double mean;
+  int count;
+};
+class VarianceAndMeanCalculator {
+ public:
+  VarianceAndMeanCalculator();
+  void ProcessElement(double value);
+  double GetStandardDeviation();
+  double GetVariance();
+  double GetMean();
+ private:
+  double mean;
+  int count;
+  double m2;
+};
+class ITensorSummary {
+ public:
+  virtual ~ITensorSummary() = default;
+  virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
+  virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
+    DebugServices::watchpoint_t) = 0;
+};
+template <typename T>
+class TensorSummary : public ITensorSummary {
+ public:
+  TensorSummary() = default;
+  TensorSummary(void *, void *, uint32_t);
+  void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) override;
+  // returns hit, error_code, parameter_list
+  std::tuple<bool, int, std::vector<DebugServices::parameter_t>> IsWatchpointHit(DebugServices::watchpoint_t) override;
+ private:
+  T *current_tensor_ptr;
+  T *prev_tensor_ptr;
+  uint32_t num_elements;
+  double min;
+  double max;
+  uint32_t inf_count;
+  uint32_t nan_count;
+  uint32_t zero_count;
+  double epsilon;
+  bool mean_sd_cal_enabled;
+  VarianceAndMeanCalculator current_mean_variance;
+  std::unordered_map<std::string, std::unique_ptr<MeanCalculator>> means;
+  std::unordered_map<uint32_t, std::unique_ptr<AllCloseCalculator>> all_close;
+  std::unordered_map<uint32_t, std::unique_ptr<RangeCountCalculator>> range_counts;
+  double_t StatLookup(const DebugServices::watchpoint_t &);
+  double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &);
+  double_t GetZeroValPercent();
+  void InitCalculators(const std::vector<DebugServices::watchpoint_t> &);
+};
+}  // namespace mindspore
+#endif  // MINDSPORE_TENSOR_SUMMARY_H
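Reviewer note: a minimal sketch of how the new interface is driven, mirroring the call sequence in DebugServices::CheckWatchpoints. The free function, its arguments, and the absence of a previous tensor are illustrative only; in the real path the debug services own the buffers and the watchpoint comes from AddWatchpoint.

  #include <cstdint>
  #include <tuple>
  #include <vector>
  // #include "debug/debugger/tensor_summary.h"

  void ExampleCheck(float *data, uint32_t n, DebugServices::watchpoint_t wp) {
    // No previous snapshot available in this sketch, so pass nullptr.
    TensorSummary<float> summary(data, /*previous_tensor_ptr=*/nullptr, n);
    summary.SummarizeTensor({wp});               // one pass over the data
    auto result = summary.IsWatchpointHit(wp);   // (hit, error_code, parameter_list)
    bool hit = std::get<0>(result);
    int32_t error_code = std::get<1>(result);    // bit 0: NaN seen, bit 1: Inf seen
    (void)hit;
    (void)error_code;
  }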
@@ -56,7 +56,12 @@ class TensorLoader {
   std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
-  std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) { return tensor_list_map[tensor_name + ":prev"]; }
+  std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) {
+    if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) {
+      return tensor_list_map[tensor_name + ":prev"];
+    }
+    return nullptr;
+  }
   std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) {
     std::vector<std::shared_ptr<TensorData>> tensors;
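Reviewer note: the guard in GetPrevTensor matters because std::map::operator[] default-constructs and inserts a null entry for a missing key, so the old one-liner silently grew tensor_list_map on every lookup miss. A small self-contained demonstration of that behavior (key names are made up, not part of the patch):

  #include <cassert>
  #include <map>
  #include <memory>
  #include <string>

  int main() {
    std::map<std::string, std::shared_ptr<int>> m;
    auto missing = m["conv1.weight:prev"];  // operator[] inserts a null entry as a side effect
    assert(missing == nullptr && m.size() == 1);
    auto it = m.find("conv2.weight:prev");  // find() leaves the map untouched
    assert(it == m.end() && m.size() == 1);
    return 0;
  }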