diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc index 0a3899410d..313bff17bc 100644 --- a/mindspore/ccsrc/debug/debug_services.cc +++ b/mindspore/ccsrc/debug/debug_services.cc @@ -14,6 +14,8 @@ * limitations under the License. */ #include +#include +#include "backend/session/anf_runtime_algorithm.h" #include "debug/debug_services.h" namespace mindspore { @@ -39,17 +41,19 @@ DebugServices &DebugServices::operator=(const DebugServices &other) { DebugServices::~DebugServices() { delete tensor_loader_; } void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, - const std::vector> &check_node_list) { + const std::vector> &check_node_list, + const std::vector ¶meter_list) { std::lock_guard lg(lock_); watchpoint_t watchpoint_item; watchpoint_item.id = id; watchpoint_item.condition.type = static_cast(watch_condition); watchpoint_item.condition.parameter = parameter; - if (watch_condition > 2) - // odd indices are greater than conditions and even indicies are less than + if (watch_condition > 2 && watch_condition < 13) + // odd indices are greater than conditions and even indices are less than watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT"; watchpoint_item.check_node_list = check_node_list; + watchpoint_item.parameter_list = parameter_list; watchpoint_table[id] = watchpoint_item; } @@ -59,11 +63,22 @@ void DebugServices::RemoveWatchpoint(unsigned int id) { } template -DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, unsigned int n, bool need_min_max, - bool need_mean_sd) { +DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, const T *start_prev, unsigned int n, + bool need_min_max, bool need_mean_sd, + bool need_zero_percentage, + bool need_tensor_update_ratio_mean, bool need_allclose) { tensor_stats stats; + double zero_count = 0.0; + double rtol = 1.0e-5; + double atol = 1.0e-8; + double update_ratio_sum = 0.0; + double epsilon = 1.0e-9; for (unsigned int i = 0; i < n; ++i) { auto val = static_cast(start[i]); + double val_prev = 0.0; + if (start_prev) { + val_prev = static_cast(start_prev[i]); + } stats.has_nan = stats.has_nan || std::isnan(val); stats.has_inf = stats.has_inf || std::isinf(val); if (stats.has_inf && stats.has_nan) { @@ -81,15 +96,33 @@ DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, unsig stats.mean += delta / (i + 1); stats.m2 += delta * (val - stats.mean); } + + if (need_zero_percentage) { + if (val == 0) zero_count++; + } + + if (need_tensor_update_ratio_mean && start_prev) { + update_ratio_sum += (std::abs(val) / (epsilon + std::abs(val_prev))); + } + + if (need_allclose && start_prev) { + stats.allclose &= (std::abs(val - val_prev) <= (atol + rtol * std::abs(val_prev))); + } } + if (need_tensor_update_ratio_mean && start_prev) { + stats.tensor_update_ratio_mean = (update_ratio_sum / n); + } + stats.zero_percentage = (zero_count / n) * 100; stats.n = n; return stats; } void DebugServices::CheckWatchpoints(std::vector *name, std::vector *slot, std::vector *condition, std::vector *watchpoint_id, + std::vector> *parameters, const std::vector &op_overflows, - const std::vector> &tensor_list) { + const std::vector> &tensor_list, + const bool init_dbg_suspend) { std::lock_guard lg(lock_); if (watchpoint_table.empty()) { return; @@ -102,79 +135,145 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor(); int tensor_dtype = tensor_ptr->data_type_c(); std::vector hit_encountered; + std::vector> hit_parms; std::unordered_map watchpoints_to_check_table; bool min_max_enabled = false; bool mean_sd_enabled = false; bool inf_nan_enabled = false; + bool zero_percentage_enabled = false; + bool tensor_update_ratio_mean_enabled = false; + bool allclose_enabled = false; for (auto w_table_item : watchpoint_table) { auto wp = std::get<1>(w_table_item); + if (wp.condition.type == INIT && !init_dbg_suspend) continue; if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue; if (wp.IsNodeIncluded(tensor_name_no_slot)) { min_max_enabled |= wp.min_max_enabled(); mean_sd_enabled |= wp.mean_sd_enabled(); inf_nan_enabled |= wp.inf_nan_enabled(); + zero_percentage_enabled |= wp.zero_percentage_enabled(); + tensor_update_ratio_mean_enabled |= wp.tensor_update_ratio_mean_enabled(); + allclose_enabled |= wp.allclose_enabled(); watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second; } } tensor_stats stats; uint num_elements = tensor_ptr->DataSize(); - if (min_max_enabled || mean_sd_enabled || inf_nan_enabled) { + if (min_max_enabled || mean_sd_enabled || inf_nan_enabled || zero_percentage_enabled || + tensor_update_ratio_mean_enabled || allclose_enabled) { + bool need_prev = (tensor_update_ratio_mean_enabled || allclose_enabled); + bool have_prev = tensor_loader_->GetPrevTensor(tensor_name) != NULL; switch (tensor_dtype) { case kNumberTypeUInt8: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeInt8: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeUInt16: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeInt16: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeUInt32: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeInt32: case kNumberTypeInt: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeUInt64: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeInt64: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeFloat16: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeFloat32: case kNumberTypeFloat: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } case kNumberTypeFloat64: { auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled); + auto start_addr_prev = + (need_prev && have_prev + ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) + : NULL); + stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, + zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled); break; } default: @@ -185,31 +284,97 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector for (auto &it : watchpoints_to_check_table) { auto wp_id = it.second.id; + std::vector hit_p; CONDITION_TYPE enabled_condition = it.second.condition.type; bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) || + (enabled_condition == GENERAL_OVERFLOW && (stats.has_nan || stats.has_inf)) || (enabled_condition == IS_OVERFLOW && std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end()); - if (enabled_condition > 2) { + if (enabled_condition > 2 && enabled_condition != GENERAL_OVERFLOW) { if (stats.has_inf || stats.has_nan) { MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check " << condition_label[enabled_condition] << " watchpoint."; - } else { + } else if (enabled_condition < 13) { bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter; bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter; hit |= it.second.condition.comparison == "GT" ? gt : lt; + } else { + std::vector parameter_list_item = it.second.parameter_list; + for (auto &p : parameter_list_item) { + if (p.disabled == false) { + bool p_hit = false; + if (p.name == "zero_percentage_ge") { + p_hit = stats.parmLookup(STAT_ZERO_PERCENTAGE) >= p.value; + } else if (p.name == "max_gt") { + p_hit = stats.parmLookup(STAT_MAX) > p.value; + } else if (p.name == "max_lt") { + p_hit = stats.parmLookup(STAT_MAX) < p.value; + } else if (p.name == "min_gt") { + p_hit = stats.parmLookup(STAT_MIN) > p.value; + } else if (p.name == "min_lt") { + p_hit = stats.parmLookup(STAT_MIN) < p.value; + } else if (p.name == "mean_gt") { + p_hit = stats.parmLookup(STAT_MEAN) > p.value; + } else if (p.name == "mean_lt") { + p_hit = stats.parmLookup(STAT_MEAN) < p.value; + } else if (p.name == "abs_mean_gt") { + p_hit = std::abs(stats.parmLookup(STAT_MEAN)) > p.value; + } else if (p.name == "abs_mean_lt") { + p_hit = std::abs(stats.parmLookup(STAT_MEAN)) < p.value; + } else if (p.name == "abs_update_ratio_mean_gt") { + p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) > p.value; + } else if (p.name == "abs_update_ratio_mean_lt") { + p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) < p.value; + } + hit |= p_hit; + hit_p.push_back(p_hit); + } else { + hit_p.push_back(false); + } + } + + hit |= (enabled_condition == NOT_CHANGED && stats.parmLookup(STAT_ALLCLOSE)); + + if (hit) hit_parms.push_back(hit_p); } } if (hit) hit_encountered.push_back(wp_id); } + unsigned int index_parm_list = 0; for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) { if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) { - name->push_back(tensor_name_no_slot); + // return fully qualified name for weights and bias to MI + auto found_dot = tensor_name_no_slot.find_last_of('.'); + if (found_dot != std::string::npos && (tensor_name_no_slot.substr(found_dot + 1) == "weight" || + tensor_name_no_slot.substr(found_dot + 1) == "bias")) { + auto check_node_list = watchpoint_table.find(*it_hit_id)->second.check_node_list; + for (auto check_node : check_node_list) { + std::string w_name = std::get<0>(check_node); + auto found_slash = w_name.find_last_of('/'); + if (found_slash != std::string::npos && w_name.substr(found_slash + 1) == tensor_name_no_slot) { + name->push_back(w_name); + } + } + } else { + name->push_back(tensor_name_no_slot); + } + slot->push_back(tensor_slot); int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type; condition->push_back(condition_item); watchpoint_id->push_back(*it_hit_id); + std::vector parameter_list_item = watchpoint_table.find(*it_hit_id)->second.parameter_list; + if (condition_item >= 13) { + unsigned int index_hit_parm = 0; + for (auto &p : parameter_list_item) { + p.hit = hit_parms[index_parm_list][index_hit_parm]; + index_hit_parm++; + } + index_parm_list++; + } + parameters->push_back(parameter_list_item); } watchpoints_to_check_table.erase(*it_hit_id); } @@ -234,7 +399,7 @@ void DebugServices::ReadNodesTensors(std::vector name, std::vector< } } -bool DebugServices::IsWatchPoint(std::string kernel_name) { +bool DebugServices::IsWatchPoint(std::string kernel_name, const CNodePtr &kernel) { bool ret = false; for (auto w_table_item : watchpoint_table) { auto check_node_list = std::get<1>(w_table_item).check_node_list; @@ -243,7 +408,7 @@ bool DebugServices::IsWatchPoint(std::string kernel_name) { bool w_type = std::get<1>(check_node); if ((w_type == true && ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) || - (w_type == false && kernel_name == w_name)) { + (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) { ret = true; return ret; } @@ -252,6 +417,39 @@ bool DebugServices::IsWatchPoint(std::string kernel_name) { return ret; } +bool DebugServices::IsWatchPointNodeInput(std::string w_name, const CNodePtr &kernel) { + if (kernel) { + auto input_size = AnfAlgo::GetInputTensorNum(kernel); + for (size_t j = 0; j < input_size; ++j) { + auto input_kernel = kernel->input(j + 1); + std::string input_kernel_name = input_kernel->fullname_with_scope(); + auto found = w_name.find_last_of('/'); + if (found != std::string::npos && w_name.substr(found + 1) == input_kernel_name) return true; + } + return false; + } else { + return false; + } +} + +void DebugServices::AddWeightsBiasInputs(std::vector> *tensor_list, + const CNodePtr &kernel) { + if (kernel) { + auto input_size = AnfAlgo::GetInputTensorNum(kernel); + for (size_t j = 0; j < input_size; ++j) { + auto input_kernel = kernel->input(j + 1); + std::string input_kernel_name = input_kernel->fullname_with_scope(); + std::string locate_tensor = input_kernel_name + ":0"; + std::map> tensor_map = tensor_loader_->GetTensorMap(); + std::map>::iterator iter; + iter = tensor_map.find(locate_tensor); + if (iter != tensor_map.end()) { + tensor_list->push_back(iter->second); + } + } + } +} + TensorLoader *DebugServices::tensor_loader() const { return tensor_loader_; } std::unordered_map DebugServices::GetWatchpointTable() { return watchpoint_table; diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h index eda152b4d8..02510e2c39 100644 --- a/mindspore/ccsrc/debug/debug_services.h +++ b/mindspore/ccsrc/debug/debug_services.h @@ -52,19 +52,37 @@ class DebugServices { MEAN_GT, MEAN_LT, SD_GT, - SD_LT + SD_LT, + GENERAL_OVERFLOW, + INIT, + TOO_LARGE, + TOO_SMALL, + ALL_ZERO, + CHANGE_TOO_LARGE, + CHANGE_TOO_SMALL, + NOT_CHANGED }; + enum STAT_TYPE { STAT_MIN, STAT_MAX, STAT_MEAN, STAT_ZERO_PERCENTAGE, STAT_TENSOR_UPDATE_RATIO_MEAN, STAT_ALLCLOSE }; + typedef struct condition { CONDITION_TYPE type; float parameter = 0; std::string comparison; } condition_t; + typedef struct parameter { + std::string name; + bool disabled; + double_t value; + bool hit; + } parameter_t; + typedef struct watchpoint { unsigned int id; condition_t condition; std::vector> check_node_list; + std::vector parameter_list; size_t location = 0; bool IsNodeIncluded(const std::string &tensor_name) { @@ -72,6 +90,8 @@ class DebugServices { for (auto check_node : check_node_list) { std::string w_name = std::get<0>(check_node); bool w_type = std::get<1>(check_node); + auto found = w_name.find_last_of('/'); + if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true; if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) { return true; } @@ -81,15 +101,27 @@ class DebugServices { bool min_max_enabled() { return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT || - condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT; + condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT || + (condition.type == INIT && (!parameter_list[1].disabled || !parameter_list[2].disabled)) || + (condition.type == TOO_LARGE && (!parameter_list[1].disabled || !parameter_list[2].disabled)) || + (condition.type == TOO_SMALL && (!parameter_list[1].disabled || !parameter_list[2].disabled)); } // inf or nan related condition set - bool inf_nan_enabled() { return condition.type == HAS_INF || condition.type == HAS_NAN; } + bool inf_nan_enabled() { + return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW; + } // mean or sd related condition set bool mean_sd_enabled() { return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT || - condition.type == SD_GT; + condition.type == SD_GT || + (condition.type == TOO_LARGE && (!parameter_list[0].disabled || !parameter_list[3].disabled)) || + (condition.type == TOO_SMALL && (!parameter_list[0].disabled || !parameter_list[3].disabled)); } + bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; } + bool tensor_update_ratio_mean_enabled() { + return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL; + } + bool allclose_enabled() { return condition.type == NOT_CHANGED; } } watchpoint_t; struct tensor_stats { @@ -100,6 +132,9 @@ class DebugServices { unsigned int n = 0; double mean = 0.0; double m2 = 0.0; + double zero_percentage = 0.0; + double tensor_update_ratio_mean = -1; + bool allclose = false; double statLookup(CONDITION_TYPE type) const { if (type == MAX_GT || type == MAX_LT) return max; @@ -110,6 +145,16 @@ class DebugServices { return std::numeric_limits::quiet_NaN(); } + double parmLookup(STAT_TYPE type) const { + if (type == STAT_MAX) return max; + if (type == STAT_MIN) return min; + if (type == STAT_MEAN) return mean; + if (type == STAT_ZERO_PERCENTAGE) return zero_percentage; + if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean; + if (type == STAT_ALLCLOSE) return allclose; + return std::numeric_limits::quiet_NaN(); + } + double getMean() const { return mean; } double getVariance() const { @@ -124,19 +169,25 @@ class DebugServices { }; void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, - const std::vector> &check_node_list); + const std::vector> &check_node_list, + const std::vector ¶meter_list); void RemoveWatchpoint(unsigned int id); void CheckWatchpoints(std::vector *name, std::vector *slot, std::vector *condition, - std::vector *watchpoint_id, const std::vector &op_overflows, - const std::vector> &tensor_list); + std::vector *watchpoint_id, std::vector> *parameters, + const std::vector &op_overflows, + const std::vector> &tensor_list, bool init_dbg_suspend); void ReadNodesTensors(std::vector name, std::vector *ret_name, std::vector *data_ptr, std::vector *data_size, std::vector *dtype, std::vector> *shape); - bool IsWatchPoint(std::string kernel_name); + bool IsWatchPoint(std::string kernel_name, const CNodePtr &kernel = nullptr); + + bool IsWatchPointNodeInput(std::string w_name, const CNodePtr &kernel); + + void AddWeightsBiasInputs(std::vector> *tensor_list, const CNodePtr &kernel); TensorLoader *tensor_loader() const; @@ -146,14 +197,19 @@ class DebugServices { std::mutex lock_; std::unordered_map watchpoint_table; - std::vector condition_label = {"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT", - "MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT", - "MEAN_LT", "SD_GT", "SD_LT"}; + std::vector condition_label = { + "HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT", + "MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT", + "MEAN_LT", "SD_GT", "SD_LT", "GENERAL_OVERFLOW", "INIT", + "TOO_LARGE", "TOO_SMALL", "ALL_ZERO", "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL", + "NOT_CHANGED"}; TensorLoader *tensor_loader_; template - static tensor_stats SummarizeTensor(const T *start, unsigned int n, bool need_min_max, bool need_mean_sd); + static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max, + bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean, + bool need_allclose); }; } // namespace mindspore diff --git a/mindspore/ccsrc/debug/debugger/debug_grpc.proto b/mindspore/ccsrc/debug/debugger/debug_grpc.proto index 5c1ca5ceed..fd83f7adc8 100644 --- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto +++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto @@ -36,11 +36,11 @@ message Metadata { // the full name of current node string cur_node = 4; // check if training is done. - bool training_done = 5; + bool training_done = 5; } message Chunk { - bytes buffer = 1; + bytes buffer = 1; } message EventReply { @@ -61,13 +61,13 @@ message EventReply { } message RunCMD { - // step level or node level. "step" or "node" - string run_level = 1; - oneof cmd { - int32 run_steps = 2; - // the next node full name - string node_name = 3; - } + // step level or node level. "step" or "node" + string run_level = 1; + oneof cmd { + int32 run_steps = 2; + // the next node full name + string node_name = 3; + } } message SetCMD { @@ -96,10 +96,24 @@ message WatchCondition { mean_lt = 10; sd_gt = 11; sd_lt = 12; + tensor_general_overflow = 13; + tensor_initialization = 14; + tensor_too_large = 15; + tensor_too_small = 16; + tensor_all_zero = 17; + tensor_change_too_large = 18; + tensor_change_too_small = 19; + tensor_not_changed = 20; } Condition condition = 1; - float value = 2; // for between condition, there will be two values - repeated bool include = 3; // for between condition, define the value is included or not + float value = 2; + message Parameter { + string name = 1; + bool disabled = 2; + double value = 3; + bool hit = 4; // Whether this parameter is hit when checking tensor. + } + repeated Parameter params = 4; } message WatchNode { diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 72ef147496..09094f6caf 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -41,6 +41,7 @@ using debugger::TensorProto; using debugger::WatchCondition; using debugger::WatchCondition_Condition_inf; using debugger::WatchCondition_Condition_nan; +using debugger::WatchCondition_Parameter; using debugger::WatchNode; using debugger::WatchpointHit; @@ -67,7 +68,8 @@ Debugger::Debugger() is_dataset_graph_(false), partial_memory_(false), last_overflow_bin_(0), - overflow_bin_path_("") { + overflow_bin_path_(""), + initial_suspend_(true) { if (CheckDebuggerEnabled()) { // configure partial memory reuse partial_memory_ = CheckDebuggerPartialMemoryEnabled(); @@ -292,9 +294,9 @@ void Debugger::PostExecute() { } } -bool Debugger::ReadNodeDataRequired() { +bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) { if (debugger_enabled_ && !is_dataset_graph_) { - auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_); + auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel); // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) { return true; @@ -303,19 +305,19 @@ bool Debugger::ReadNodeDataRequired() { return false; } -void Debugger::PostExecuteNode() { +void Debugger::PostExecuteNode(const CNodePtr &kernel) { // access lock for public method std::lock_guard a_lock(access_lock_); if (pipeline::ExecutorPy::GetDebugTerminate()) { return; } if (debugger_enabled_ && !is_dataset_graph_) { - auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_); + auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel); // if kernel is watchpoint,and get hit. suspend. bool hit_empty_flag = true; if (is_watchpoint) { - auto hits = CheckWatchpoints(cur_name_); + auto hits = CheckWatchpoints(cur_name_, kernel); if (!hits.empty()) { SendWatchpoints(hits); CommandLoop(); @@ -477,6 +479,8 @@ void Debugger::CommandLoop() { MS_LOG(INFO) << "rechecking all watchpoints"; SendWatchpoints(CheckWatchpoints()); } else { + // no longer the initial suspension. + initial_suspend_ = false; // print run cmd content // get run_level and node_name run_level_ = GetRunLevel(reply); @@ -494,10 +498,17 @@ void Debugger::CommandLoop() { { // print set cmd content ProtoVector recieved_nodes = GetWatchnodes(reply); - for (auto node : recieved_nodes) { + for (const auto &node : recieved_nodes) { MS_LOG(INFO) << "node name: " << node.node_name(); MS_LOG(INFO) << "node type: " << node.node_type(); } + + ProtoVector parameters = GetParameters(reply); + for (const auto ¶meter : parameters) { + MS_LOG(INFO) << "parameter name: " << parameter.name(); + MS_LOG(INFO) << "parameter is disabled: " << parameter.disabled(); + MS_LOG(INFO) << "parameter value: " << parameter.value(); + } MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition(); MS_LOG(INFO) << "id: " << GetWatchpointID(reply); MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply); @@ -506,7 +517,7 @@ void Debugger::CommandLoop() { if (GetWatchpointDelete(reply)) { RemoveWatchpoint(GetWatchpointID(reply)); } else { - SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply)); + SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply)); } break; case DebuggerCommand::kViewCMD: @@ -558,13 +569,25 @@ void AddTensorProtoInfo(TensorProto *tensor_item, TensorProto tensor) { tensor_item->clear_dims(); } -void Debugger::SetWatchpoint(const ProtoVector &nodes, const WatchCondition &condition, const int32_t id) { +void Debugger::SetWatchpoint(const ProtoVector &nodes, const WatchCondition &condition, const int32_t id, + const ProtoVector ¶meters) { std::vector> check_node_list; + std::vector parameter_list; + std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list), - [](WatchNode node) -> std::tuple { + [](const WatchNode &node) -> std::tuple { return make_tuple(node.node_name(), node.node_type() == "scope"); }); - debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list); + + std::transform( + parameters.begin(), parameters.end(), std::back_inserter(parameter_list), + [](const WatchCondition_Parameter ¶meter) -> DebugServices::parameter_t { + return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()}; + }); + debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list); + if (initial_suspend_ && + static_cast(condition.condition()) == DebugServices::CONDITION_TYPE::INIT) + SendWatchpoints(CheckWatchpoints()); } void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); } @@ -637,12 +660,13 @@ void Debugger::Exit() { } } -std::list Debugger::CheckWatchpoints(const std::string &watchnode) { +std::list Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel) { std::vector name; std::vector slot; std::vector condition; std::vector watchpoint_id; std::vector overflow_ops; + std::vector> parameters; #ifdef ENABLE_D overflow_ops = CheckOpOverflow(); #endif @@ -652,12 +676,14 @@ std::list Debugger::CheckWatchpoints(const std::string &watchnode tensor_list = tensor_loader->GetTensor(); } else { tensor_list = tensor_loader->GetNodeTensorMap(watchnode); + debug_services_->AddWeightsBiasInputs(&tensor_list, kernel); } - - debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops, tensor_list); + debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, ¶meters, overflow_ops, tensor_list, + initial_suspend_); std::list hits; for (unsigned int i = 0; i < name.size(); i++) { WatchpointHit hit; + std::vector ¶meter = parameters[i]; hit.set_id(watchpoint_id[i]); // here TensorProto act as a tensor indicator, not sending tensor content @@ -668,7 +694,13 @@ std::list Debugger::CheckWatchpoints(const std::string &watchnode WatchCondition *condition_item = hit.mutable_watch_condition(); condition_item->set_condition(debugger::WatchCondition_Condition(condition[i])); - + for (const auto &p : parameter) { + auto x = condition_item->mutable_params()->Add(); + x->set_name(p.name); + x->set_disabled(p.disabled); + x->set_value(p.value); + x->set_hit(p.hit); + } hits.push_back(hit); } return hits; @@ -710,6 +742,14 @@ DebuggerCommand GetCommand(const EventReply &reply) { return cmd; } +ProtoVector GetParameters(const EventReply &reply) { + if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) { + MS_LOG(ERROR) << "Error: Can not get Parameters from command. Returning default value: ProtoVector()."; + return ProtoVector(); + } + return reply.set_cmd().watch_condition().params(); +} + ProtoVector GetWatchnodes(const EventReply &reply) { if (!reply.has_set_cmd()) { MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector()."; @@ -954,7 +994,7 @@ void Debugger::LoadGraphOutputs() { std::string kernel_name = node->fullname_with_scope(); auto output_size = AnfAlgo::GetOutputTensorNum(node); if (partial_memory_) { - if (!debug_services_->IsWatchPoint(kernel_name)) { + if (!debug_services_->IsWatchPoint(kernel_name, node)) { continue; } } diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index d092605a17..b9ceb4d083 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -33,6 +33,7 @@ using debugger::GraphProto; using debugger::ModelProto; using debugger::TensorProto; using debugger::WatchCondition; +using debugger::WatchCondition_Parameter; using debugger::WatchNode; using debugger::WatchpointHit; @@ -73,9 +74,9 @@ class Debugger : public std::enable_shared_from_this { // don't need a graph_ptr because it is saved during pre_execute void PostExecute(); - bool ReadNodeDataRequired(); + bool ReadNodeDataRequired(const CNodePtr &kernel); - void PostExecuteNode(); + void PostExecuteNode(const CNodePtr &kernel); // suspend the execution after a debug_op void PostDebugOp(); @@ -148,7 +149,8 @@ class Debugger : public std::enable_shared_from_this { void CommandLoop(); // set what nodes and conditions to watch - void SetWatchpoint(const ProtoVector &nodes, const WatchCondition &condition, const int32_t id); + void SetWatchpoint(const ProtoVector &nodes, const WatchCondition &condition, const int32_t id, + const ProtoVector ¶meters); // remove watchpoint with id void RemoveWatchpoint(const int32_t id); @@ -161,7 +163,8 @@ class Debugger : public std::enable_shared_from_this { // analyze tensors and check watchpoint conditions // return names of tensors and what condition they hit - std::list CheckWatchpoints(const std::string &watchnode = std::string()); + std::list CheckWatchpoints(const std::string &watchnode = std::string(), + const CNodePtr &kernel = NULL); // send watchpoints that hit void SendWatchpoints(const std::list &points); @@ -192,6 +195,8 @@ class Debugger : public std::enable_shared_from_this { std::map, std::string> stream_task_to_opname_; double last_overflow_bin_; std::string overflow_bin_path_; + // flag to keep track of the very first suspension of debugger + bool initial_suspend_; // singleton static std::mutex instance_lock_; static std::shared_ptr debugger_; @@ -210,6 +215,7 @@ DataType GetDebuggerNumberDataType(const TypePtr &type); DebuggerCommand GetCommand(const EventReply &reply); // parse other data out of EventReply +ProtoVector GetParameters(const EventReply &reply); ProtoVector GetWatchnodes(const EventReply &reply); std::string GetNodeName(const EventReply &reply); std::string GetRunLevel(const EventReply &reply); diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index 7de4f1221a..e8839eda79 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -56,6 +56,8 @@ class TensorLoader { std::map> GetTensorMap() { return tensor_list_map; } + std::shared_ptr GetPrevTensor(std::string tensor_name) { return tensor_list_map[tensor_name + ":prev"]; } + std::vector> GetNodeTensorMap(std::string node_name) { std::vector> tensors; for (auto itr = node_tensor_map.begin(); itr != node_tensor_map.end(); itr++) { diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 216ecb0eee..fb7fd4f602 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -113,7 +113,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, read_data = true; } } else if (debugger->debugger_enabled()) { - read_data = debugger->ReadNodeDataRequired(); + read_data = debugger->ReadNodeDataRequired(kernel); } if (!read_data) { return; @@ -168,7 +168,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, } } } - debugger->PostExecuteNode(); + debugger->PostExecuteNode(kernel); } } // namespace