Browse Source

support watchpoints on weights and bias, add support advanced watchpoints

tags/v1.1.0
Harshvardhan Gupta 5 years ago
parent
commit
727d424553
7 changed files with 384 additions and 68 deletions
  1. +221
    -23
      mindspore/ccsrc/debug/debug_services.cc
  2. +68
    -12
      mindspore/ccsrc/debug/debug_services.h
  3. +25
    -11
      mindspore/ccsrc/debug/debugger/debug_grpc.proto
  4. +56
    -16
      mindspore/ccsrc/debug/debugger/debugger.cc
  5. +10
    -4
      mindspore/ccsrc/debug/debugger/debugger.h
  6. +2
    -0
      mindspore/ccsrc/debug/tensor_load.h
  7. +2
    -2
      mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc

+ 221
- 23
mindspore/ccsrc/debug/debug_services.cc View File

@@ -14,6 +14,8 @@
* limitations under the License.
*/
#include <algorithm>
#include <map>
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/debug_services.h"
namespace mindspore {

@@ -39,17 +41,19 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
DebugServices::~DebugServices() { delete tensor_loader_; }

void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list) {
const std::vector<std::tuple<std::string, bool>> &check_node_list,
const std::vector<parameter_t> &parameter_list) {
std::lock_guard<std::mutex> lg(lock_);

watchpoint_t watchpoint_item;
watchpoint_item.id = id;
watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
watchpoint_item.condition.parameter = parameter;
if (watch_condition > 2)
// odd indices are greater than conditions and even indices are less than
if (watch_condition > 2 && watch_condition < 13)
// odd indices are greater than conditions and even indices are less than
watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT";
watchpoint_item.check_node_list = check_node_list;
watchpoint_item.parameter_list = parameter_list;
watchpoint_table[id] = watchpoint_item;
}

@@ -59,11 +63,22 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
}

template <typename T>
DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, unsigned int n, bool need_min_max,
bool need_mean_sd) {
DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, const T *start_prev, unsigned int n,
bool need_min_max, bool need_mean_sd,
bool need_zero_percentage,
bool need_tensor_update_ratio_mean, bool need_allclose) {
tensor_stats stats;
double zero_count = 0.0;
double rtol = 1.0e-5;
double atol = 1.0e-8;
double update_ratio_sum = 0.0;
double epsilon = 1.0e-9;
for (unsigned int i = 0; i < n; ++i) {
auto val = static_cast<double>(start[i]);
double val_prev = 0.0;
if (start_prev) {
val_prev = static_cast<double>(start_prev[i]);
}
stats.has_nan = stats.has_nan || std::isnan(val);
stats.has_inf = stats.has_inf || std::isinf(val);
if (stats.has_inf && stats.has_nan) {
@@ -81,15 +96,33 @@ DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, unsig
stats.mean += delta / (i + 1);
stats.m2 += delta * (val - stats.mean);
}

if (need_zero_percentage) {
if (val == 0) zero_count++;
}

if (need_tensor_update_ratio_mean && start_prev) {
update_ratio_sum += (std::abs(val) / (epsilon + std::abs(val_prev)));
}

if (need_allclose && start_prev) {
stats.allclose &= (std::abs(val - val_prev) <= (atol + rtol * std::abs(val_prev)));
}
}
if (need_tensor_update_ratio_mean && start_prev) {
stats.tensor_update_ratio_mean = (update_ratio_sum / n);
}
stats.zero_percentage = (zero_count / n) * 100;
stats.n = n;
return stats;
}

void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
std::vector<std::vector<parameter_t>> *parameters,
const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list) {
const std::vector<std::shared_ptr<TensorData>> &tensor_list,
const bool init_dbg_suspend) {
std::lock_guard<std::mutex> lg(lock_);
if (watchpoint_table.empty()) {
return;
@@ -102,79 +135,145 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
int tensor_dtype = tensor_ptr->data_type_c();
std::vector<unsigned int> hit_encountered;
std::vector<std::vector<bool>> hit_parms;
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
bool min_max_enabled = false;
bool mean_sd_enabled = false;
bool inf_nan_enabled = false;
bool zero_percentage_enabled = false;
bool tensor_update_ratio_mean_enabled = false;
bool allclose_enabled = false;
for (auto w_table_item : watchpoint_table) {
auto wp = std::get<1>(w_table_item);
if (wp.condition.type == INIT && !init_dbg_suspend) continue;
if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue;
if (wp.IsNodeIncluded(tensor_name_no_slot)) {
min_max_enabled |= wp.min_max_enabled();
mean_sd_enabled |= wp.mean_sd_enabled();
inf_nan_enabled |= wp.inf_nan_enabled();
zero_percentage_enabled |= wp.zero_percentage_enabled();
tensor_update_ratio_mean_enabled |= wp.tensor_update_ratio_mean_enabled();
allclose_enabled |= wp.allclose_enabled();
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
}
}
tensor_stats stats;
uint num_elements = tensor_ptr->DataSize();
if (min_max_enabled || mean_sd_enabled || inf_nan_enabled) {
if (min_max_enabled || mean_sd_enabled || inf_nan_enabled || zero_percentage_enabled ||
tensor_update_ratio_mean_enabled || allclose_enabled) {
bool need_prev = (tensor_update_ratio_mean_enabled || allclose_enabled);
bool have_prev = tensor_loader_->GetPrevTensor(tensor_name) != NULL;
switch (tensor_dtype) {
case kNumberTypeUInt8: {
auto start_addr = reinterpret_cast<uint8_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeInt8: {
auto start_addr = reinterpret_cast<int8_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeUInt16: {
auto start_addr = reinterpret_cast<uint16_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeInt16: {
auto start_addr = reinterpret_cast<int16_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeUInt32: {
auto start_addr = reinterpret_cast<uint32_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeInt32:
case kNumberTypeInt: {
auto start_addr = reinterpret_cast<int32_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeUInt64: {
auto start_addr = reinterpret_cast<uint64_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeInt64: {
auto start_addr = reinterpret_cast<int64_t *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeFloat16: {
auto start_addr = reinterpret_cast<float16 *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<float16 *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeFloat32:
case kNumberTypeFloat: {
auto start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<float *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
case kNumberTypeFloat64: {
auto start_addr = reinterpret_cast<double *>(tensor_ptr->data_c());
stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<double *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled);
break;
}
default:
@@ -185,31 +284,97 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector

for (auto &it : watchpoints_to_check_table) {
auto wp_id = it.second.id;
std::vector<bool> hit_p;
CONDITION_TYPE enabled_condition = it.second.condition.type;
bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) ||
(enabled_condition == GENERAL_OVERFLOW && (stats.has_nan || stats.has_inf)) ||
(enabled_condition == IS_OVERFLOW &&
std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());

if (enabled_condition > 2) {
if (enabled_condition > 2 && enabled_condition != GENERAL_OVERFLOW) {
if (stats.has_inf || stats.has_nan) {
MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check "
<< condition_label[enabled_condition] << " watchpoint.";
} else {
} else if (enabled_condition < 13) {
bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter;
bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter;
hit |= it.second.condition.comparison == "GT" ? gt : lt;
} else {
std::vector<parameter_t> parameter_list_item = it.second.parameter_list;
for (auto &p : parameter_list_item) {
if (p.disabled == false) {
bool p_hit = false;
if (p.name == "zero_percentage_ge") {
p_hit = stats.parmLookup(STAT_ZERO_PERCENTAGE) >= p.value;
} else if (p.name == "max_gt") {
p_hit = stats.parmLookup(STAT_MAX) > p.value;
} else if (p.name == "max_lt") {
p_hit = stats.parmLookup(STAT_MAX) < p.value;
} else if (p.name == "min_gt") {
p_hit = stats.parmLookup(STAT_MIN) > p.value;
} else if (p.name == "min_lt") {
p_hit = stats.parmLookup(STAT_MIN) < p.value;
} else if (p.name == "mean_gt") {
p_hit = stats.parmLookup(STAT_MEAN) > p.value;
} else if (p.name == "mean_lt") {
p_hit = stats.parmLookup(STAT_MEAN) < p.value;
} else if (p.name == "abs_mean_gt") {
p_hit = std::abs(stats.parmLookup(STAT_MEAN)) > p.value;
} else if (p.name == "abs_mean_lt") {
p_hit = std::abs(stats.parmLookup(STAT_MEAN)) < p.value;
} else if (p.name == "abs_update_ratio_mean_gt") {
p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) > p.value;
} else if (p.name == "abs_update_ratio_mean_lt") {
p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) < p.value;
}
hit |= p_hit;
hit_p.push_back(p_hit);
} else {
hit_p.push_back(false);
}
}

hit |= (enabled_condition == NOT_CHANGED && stats.parmLookup(STAT_ALLCLOSE));

if (hit) hit_parms.push_back(hit_p);
}
}
if (hit) hit_encountered.push_back(wp_id);
}

unsigned int index_parm_list = 0;
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
name->push_back(tensor_name_no_slot);
// return fully qualified name for weights and bias to MI
auto found_dot = tensor_name_no_slot.find_last_of('.');
if (found_dot != std::string::npos && (tensor_name_no_slot.substr(found_dot + 1) == "weight" ||
tensor_name_no_slot.substr(found_dot + 1) == "bias")) {
auto check_node_list = watchpoint_table.find(*it_hit_id)->second.check_node_list;
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
auto found_slash = w_name.find_last_of('/');
if (found_slash != std::string::npos && w_name.substr(found_slash + 1) == tensor_name_no_slot) {
name->push_back(w_name);
}
}
} else {
name->push_back(tensor_name_no_slot);
}

slot->push_back(tensor_slot);
int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type;
condition->push_back(condition_item);
watchpoint_id->push_back(*it_hit_id);
std::vector<parameter_t> parameter_list_item = watchpoint_table.find(*it_hit_id)->second.parameter_list;
if (condition_item >= 13) {
unsigned int index_hit_parm = 0;
for (auto &p : parameter_list_item) {
p.hit = hit_parms[index_parm_list][index_hit_parm];
index_hit_parm++;
}
index_parm_list++;
}
parameters->push_back(parameter_list_item);
}
watchpoints_to_check_table.erase(*it_hit_id);
}
@@ -234,7 +399,7 @@ void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<
}
}

bool DebugServices::IsWatchPoint(std::string kernel_name) {
bool DebugServices::IsWatchPoint(std::string kernel_name, const CNodePtr &kernel) {
bool ret = false;
for (auto w_table_item : watchpoint_table) {
auto check_node_list = std::get<1>(w_table_item).check_node_list;
@@ -243,7 +408,7 @@ bool DebugServices::IsWatchPoint(std::string kernel_name) {
bool w_type = std::get<1>(check_node);
if ((w_type == true &&
((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
(w_type == false && kernel_name == w_name)) {
(w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
ret = true;
return ret;
}
@@ -252,6 +417,39 @@ bool DebugServices::IsWatchPoint(std::string kernel_name) {
return ret;
}

bool DebugServices::IsWatchPointNodeInput(std::string w_name, const CNodePtr &kernel) {
if (kernel) {
auto input_size = AnfAlgo::GetInputTensorNum(kernel);
for (size_t j = 0; j < input_size; ++j) {
auto input_kernel = kernel->input(j + 1);
std::string input_kernel_name = input_kernel->fullname_with_scope();
auto found = w_name.find_last_of('/');
if (found != std::string::npos && w_name.substr(found + 1) == input_kernel_name) return true;
}
return false;
} else {
return false;
}
}

void DebugServices::AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list,
const CNodePtr &kernel) {
if (kernel) {
auto input_size = AnfAlgo::GetInputTensorNum(kernel);
for (size_t j = 0; j < input_size; ++j) {
auto input_kernel = kernel->input(j + 1);
std::string input_kernel_name = input_kernel->fullname_with_scope();
std::string locate_tensor = input_kernel_name + ":0";
std::map<std::string, std::shared_ptr<TensorData>> tensor_map = tensor_loader_->GetTensorMap();
std::map<std::string, std::shared_ptr<TensorData>>::iterator iter;
iter = tensor_map.find(locate_tensor);
if (iter != tensor_map.end()) {
tensor_list->push_back(iter->second);
}
}
}
}

TensorLoader *DebugServices::tensor_loader() const { return tensor_loader_; }
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
return watchpoint_table;


+ 68
- 12
mindspore/ccsrc/debug/debug_services.h View File

@@ -52,19 +52,37 @@ class DebugServices {
MEAN_GT,
MEAN_LT,
SD_GT,
SD_LT
SD_LT,
GENERAL_OVERFLOW,
INIT,
TOO_LARGE,
TOO_SMALL,
ALL_ZERO,
CHANGE_TOO_LARGE,
CHANGE_TOO_SMALL,
NOT_CHANGED
};

enum STAT_TYPE { STAT_MIN, STAT_MAX, STAT_MEAN, STAT_ZERO_PERCENTAGE, STAT_TENSOR_UPDATE_RATIO_MEAN, STAT_ALLCLOSE };

typedef struct condition {
CONDITION_TYPE type;
float parameter = 0;
std::string comparison;
} condition_t;

typedef struct parameter {
std::string name;
bool disabled;
double_t value;
bool hit;
} parameter_t;

typedef struct watchpoint {
unsigned int id;
condition_t condition;
std::vector<std::tuple<std::string, bool>> check_node_list;
std::vector<parameter_t> parameter_list;
size_t location = 0;

bool IsNodeIncluded(const std::string &tensor_name) {
@@ -72,6 +90,8 @@ class DebugServices {
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
bool w_type = std::get<1>(check_node);
auto found = w_name.find_last_of('/');
if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true;
if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) {
return true;
}
@@ -81,15 +101,27 @@ class DebugServices {

bool min_max_enabled() {
return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT ||
condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT;
condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT ||
(condition.type == INIT && (!parameter_list[1].disabled || !parameter_list[2].disabled)) ||
(condition.type == TOO_LARGE && (!parameter_list[1].disabled || !parameter_list[2].disabled)) ||
(condition.type == TOO_SMALL && (!parameter_list[1].disabled || !parameter_list[2].disabled));
}
// inf or nan related condition set
bool inf_nan_enabled() { return condition.type == HAS_INF || condition.type == HAS_NAN; }
bool inf_nan_enabled() {
return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW;
}
// mean or sd related condition set
bool mean_sd_enabled() {
return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
condition.type == SD_GT;
condition.type == SD_GT ||
(condition.type == TOO_LARGE && (!parameter_list[0].disabled || !parameter_list[3].disabled)) ||
(condition.type == TOO_SMALL && (!parameter_list[0].disabled || !parameter_list[3].disabled));
}
bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; }
bool tensor_update_ratio_mean_enabled() {
return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
}
bool allclose_enabled() { return condition.type == NOT_CHANGED; }
} watchpoint_t;

struct tensor_stats {
@@ -100,6 +132,9 @@ class DebugServices {
unsigned int n = 0;
double mean = 0.0;
double m2 = 0.0;
double zero_percentage = 0.0;
double tensor_update_ratio_mean = -1;
bool allclose = false;

double statLookup(CONDITION_TYPE type) const {
if (type == MAX_GT || type == MAX_LT) return max;
@@ -110,6 +145,16 @@ class DebugServices {
return std::numeric_limits<double>::quiet_NaN();
}

double parmLookup(STAT_TYPE type) const {
if (type == STAT_MAX) return max;
if (type == STAT_MIN) return min;
if (type == STAT_MEAN) return mean;
if (type == STAT_ZERO_PERCENTAGE) return zero_percentage;
if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean;
if (type == STAT_ALLCLOSE) return allclose;
return std::numeric_limits<double>::quiet_NaN();
}

double getMean() const { return mean; }

double getVariance() const {
@@ -124,19 +169,25 @@ class DebugServices {
};

void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list);
const std::vector<std::tuple<std::string, bool>> &check_node_list,
const std::vector<parameter_t> &parameter_list);

void RemoveWatchpoint(unsigned int id);

void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list);
std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend);

void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape);

bool IsWatchPoint(std::string kernel_name);
bool IsWatchPoint(std::string kernel_name, const CNodePtr &kernel = nullptr);

bool IsWatchPointNodeInput(std::string w_name, const CNodePtr &kernel);

void AddWeightsBiasInputs(std::vector<std::shared_ptr<TensorData>> *tensor_list, const CNodePtr &kernel);

TensorLoader *tensor_loader() const;

@@ -146,14 +197,19 @@ class DebugServices {
std::mutex lock_;

std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
std::vector<std::string> condition_label = {"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT",
"MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT",
"MEAN_LT", "SD_GT", "SD_LT"};
std::vector<std::string> condition_label = {
"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT",
"MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT",
"MEAN_LT", "SD_GT", "SD_LT", "GENERAL_OVERFLOW", "INIT",
"TOO_LARGE", "TOO_SMALL", "ALL_ZERO", "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL",
"NOT_CHANGED"};

TensorLoader *tensor_loader_;

template <typename T>
static tensor_stats SummarizeTensor(const T *start, unsigned int n, bool need_min_max, bool need_mean_sd);
static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max,
bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean,
bool need_allclose);
};
} // namespace mindspore



+ 25
- 11
mindspore/ccsrc/debug/debugger/debug_grpc.proto View File

@@ -36,11 +36,11 @@ message Metadata {
// the full name of current node
string cur_node = 4;
// check if training is done.
bool training_done = 5;
bool training_done = 5;
}

message Chunk {
bytes buffer = 1;
bytes buffer = 1;
}

message EventReply {
@@ -61,13 +61,13 @@ message EventReply {
}

message RunCMD {
// step level or node level. "step" or "node"
string run_level = 1;
oneof cmd {
int32 run_steps = 2;
// the next node full name
string node_name = 3;
}
// step level or node level. "step" or "node"
string run_level = 1;
oneof cmd {
int32 run_steps = 2;
// the next node full name
string node_name = 3;
}
}

message SetCMD {
@@ -96,10 +96,24 @@ message WatchCondition {
mean_lt = 10;
sd_gt = 11;
sd_lt = 12;
tensor_general_overflow = 13;
tensor_initialization = 14;
tensor_too_large = 15;
tensor_too_small = 16;
tensor_all_zero = 17;
tensor_change_too_large = 18;
tensor_change_too_small = 19;
tensor_not_changed = 20;
}
Condition condition = 1;
float value = 2; // for between condition, there will be two values
repeated bool include = 3; // for between condition, define the value is included or not
float value = 2;
message Parameter {
string name = 1;
bool disabled = 2;
double value = 3;
bool hit = 4; // Whether this parameter is hit when checking tensor.
}
repeated Parameter params = 4;
}

message WatchNode {


+ 56
- 16
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -41,6 +41,7 @@ using debugger::TensorProto;
using debugger::WatchCondition;
using debugger::WatchCondition_Condition_inf;
using debugger::WatchCondition_Condition_nan;
using debugger::WatchCondition_Parameter;
using debugger::WatchNode;
using debugger::WatchpointHit;

@@ -67,7 +68,8 @@ Debugger::Debugger()
is_dataset_graph_(false),
partial_memory_(false),
last_overflow_bin_(0),
overflow_bin_path_("") {
overflow_bin_path_(""),
initial_suspend_(true) {
if (CheckDebuggerEnabled()) {
// configure partial memory reuse
partial_memory_ = CheckDebuggerPartialMemoryEnabled();
@@ -292,9 +294,9 @@ void Debugger::PostExecute() {
}
}

bool Debugger::ReadNodeDataRequired() {
bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) {
if (debugger_enabled_ && !is_dataset_graph_) {
auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_);
auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);
// if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data
if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) {
return true;
@@ -303,19 +305,19 @@ bool Debugger::ReadNodeDataRequired() {
return false;
}

void Debugger::PostExecuteNode() {
void Debugger::PostExecuteNode(const CNodePtr &kernel) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
if (pipeline::ExecutorPy::GetDebugTerminate()) {
return;
}
if (debugger_enabled_ && !is_dataset_graph_) {
auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_);
auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);

// if kernel is watchpoint,and get hit. suspend.
bool hit_empty_flag = true;
if (is_watchpoint) {
auto hits = CheckWatchpoints(cur_name_);
auto hits = CheckWatchpoints(cur_name_, kernel);
if (!hits.empty()) {
SendWatchpoints(hits);
CommandLoop();
@@ -477,6 +479,8 @@ void Debugger::CommandLoop() {
MS_LOG(INFO) << "rechecking all watchpoints";
SendWatchpoints(CheckWatchpoints());
} else {
// no longer the initial suspension.
initial_suspend_ = false;
// print run cmd content
// get run_level and node_name
run_level_ = GetRunLevel(reply);
@@ -494,10 +498,17 @@ void Debugger::CommandLoop() {
{
// print set cmd content
ProtoVector<WatchNode> recieved_nodes = GetWatchnodes(reply);
for (auto node : recieved_nodes) {
for (const auto &node : recieved_nodes) {
MS_LOG(INFO) << "node name: " << node.node_name();
MS_LOG(INFO) << "node type: " << node.node_type();
}

ProtoVector<WatchCondition_Parameter> parameters = GetParameters(reply);
for (const auto &parameter : parameters) {
MS_LOG(INFO) << "parameter name: " << parameter.name();
MS_LOG(INFO) << "parameter is disabled: " << parameter.disabled();
MS_LOG(INFO) << "parameter value: " << parameter.value();
}
MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition();
MS_LOG(INFO) << "id: " << GetWatchpointID(reply);
MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply);
@@ -506,7 +517,7 @@ void Debugger::CommandLoop() {
if (GetWatchpointDelete(reply)) {
RemoveWatchpoint(GetWatchpointID(reply));
} else {
SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply));
SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply));
}
break;
case DebuggerCommand::kViewCMD:
@@ -558,13 +569,25 @@ void AddTensorProtoInfo(TensorProto *tensor_item, TensorProto tensor) {
tensor_item->clear_dims();
}

void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id) {
void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
const ProtoVector<WatchCondition_Parameter> &parameters) {
std::vector<std::tuple<std::string, bool>> check_node_list;
std::vector<DebugServices::parameter_t> parameter_list;

std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list),
[](WatchNode node) -> std::tuple<std::string, bool> {
[](const WatchNode &node) -> std::tuple<std::string, bool> {
return make_tuple(node.node_name(), node.node_type() == "scope");
});
debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list);

std::transform(
parameters.begin(), parameters.end(), std::back_inserter(parameter_list),
[](const WatchCondition_Parameter &parameter) -> DebugServices::parameter_t {
return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
});
debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list);
if (initial_suspend_ &&
static_cast<DebugServices::CONDITION_TYPE>(condition.condition()) == DebugServices::CONDITION_TYPE::INIT)
SendWatchpoints(CheckWatchpoints());
}

void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }
@@ -637,12 +660,13 @@ void Debugger::Exit() {
}
}

std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode) {
std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel) {
std::vector<std::string> name;
std::vector<std::string> slot;
std::vector<int> condition;
std::vector<unsigned int> watchpoint_id;
std::vector<std::string> overflow_ops;
std::vector<std::vector<DebugServices::parameter_t>> parameters;
#ifdef ENABLE_D
overflow_ops = CheckOpOverflow();
#endif
@@ -652,12 +676,14 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
tensor_list = tensor_loader->GetTensor();
} else {
tensor_list = tensor_loader->GetNodeTensorMap(watchnode);
debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
}
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops, tensor_list);
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, overflow_ops, tensor_list,
initial_suspend_);
std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit;
std::vector<DebugServices::parameter_t> &parameter = parameters[i];
hit.set_id(watchpoint_id[i]);

// here TensorProto act as a tensor indicator, not sending tensor content
@@ -668,7 +694,13 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode

WatchCondition *condition_item = hit.mutable_watch_condition();
condition_item->set_condition(debugger::WatchCondition_Condition(condition[i]));

for (const auto &p : parameter) {
auto x = condition_item->mutable_params()->Add();
x->set_name(p.name);
x->set_disabled(p.disabled);
x->set_value(p.value);
x->set_hit(p.hit);
}
hits.push_back(hit);
}
return hits;
@@ -710,6 +742,14 @@ DebuggerCommand GetCommand(const EventReply &reply) {
return cmd;
}

// Extract the watch-condition parameter list carried by a SetCMD reply.
// Logs an error and returns an empty ProtoVector when the reply carries no
// set_cmd, or its set_cmd carries no watch_condition.
ProtoVector<WatchCondition_Parameter> GetParameters(const EventReply &reply) {
  const bool has_condition = reply.has_set_cmd() && reply.set_cmd().has_watch_condition();
  if (has_condition) {
    return reply.set_cmd().watch_condition().params();
  }
  MS_LOG(ERROR) << "Error: Can not get Parameters from command. Returning default value: ProtoVector<Parameter>().";
  return ProtoVector<WatchCondition_Parameter>();
}

ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply) {
if (!reply.has_set_cmd()) {
MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>().";
@@ -954,7 +994,7 @@ void Debugger::LoadGraphOutputs() {
std::string kernel_name = node->fullname_with_scope();
auto output_size = AnfAlgo::GetOutputTensorNum(node);
if (partial_memory_) {
if (!debug_services_->IsWatchPoint(kernel_name)) {
if (!debug_services_->IsWatchPoint(kernel_name, node)) {
continue;
}
}


+ 10
- 4
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -33,6 +33,7 @@ using debugger::GraphProto;
using debugger::ModelProto;
using debugger::TensorProto;
using debugger::WatchCondition;
using debugger::WatchCondition_Parameter;
using debugger::WatchNode;
using debugger::WatchpointHit;

@@ -73,9 +74,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// don't need a graph_ptr because it is saved during pre_execute
void PostExecute();

bool ReadNodeDataRequired();
bool ReadNodeDataRequired(const CNodePtr &kernel);

void PostExecuteNode();
void PostExecuteNode(const CNodePtr &kernel);

// suspend the execution after a debug_op
void PostDebugOp();
@@ -148,7 +149,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void CommandLoop();

// set what nodes and conditions to watch
void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id);
void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
const ProtoVector<WatchCondition_Parameter> &parameters);

// remove watchpoint with id
void RemoveWatchpoint(const int32_t id);
@@ -161,7 +163,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string());
std::list<WatchpointHit> CheckWatchpoints(const std::string &watchnode = std::string(),
const CNodePtr &kernel = NULL);

// send watchpoints that hit
void SendWatchpoints(const std::list<WatchpointHit> &points);
@@ -192,6 +195,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname_;
double last_overflow_bin_;
std::string overflow_bin_path_;
// flag to keep track of the very first suspension of debugger
bool initial_suspend_;
// singleton
static std::mutex instance_lock_;
static std::shared_ptr<Debugger> debugger_;
@@ -210,6 +215,7 @@ DataType GetDebuggerNumberDataType(const TypePtr &type);
DebuggerCommand GetCommand(const EventReply &reply);

// parse other data out of EventReply
ProtoVector<WatchCondition_Parameter> GetParameters(const EventReply &reply);
ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply);
std::string GetNodeName(const EventReply &reply);
std::string GetRunLevel(const EventReply &reply);


+ 2
- 0
mindspore/ccsrc/debug/tensor_load.h View File

@@ -56,6 +56,8 @@ class TensorLoader {

// Return a by-value snapshot of the tensor-name -> TensorData map (callers get a copy).
std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }

// Fetch the previous-iteration copy of a tensor ("<name>:prev"), or nullptr if none was stored.
// Uses find() instead of operator[], which would silently default-insert a null
// entry into tensor_list_map on every miss and grow the cache on a read path.
std::shared_ptr<TensorData> GetPrevTensor(const std::string &tensor_name) {
  auto iter = tensor_list_map.find(tensor_name + ":prev");
  return iter == tensor_list_map.end() ? nullptr : iter->second;
}

std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) {
std::vector<std::shared_ptr<TensorData>> tensors;
for (auto itr = node_tensor_map.begin(); itr != node_tensor_map.end(); itr++) {


+ 2
- 2
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc View File

@@ -113,7 +113,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
read_data = true;
}
} else if (debugger->debugger_enabled()) {
read_data = debugger->ReadNodeDataRequired();
read_data = debugger->ReadNodeDataRequired(kernel);
}
if (!read_data) {
return;
@@ -168,7 +168,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
}
}
}
debugger->PostExecuteNode();
debugger->PostExecuteNode(kernel);
}
} // namespace



Loading…
Cancel
Save