
init

tags/v1.1.0
Harshvardhan Gupta committed 5 years ago
parent commit 561f9082e9
8 changed files with 510 additions and 362 deletions
  1. mindspore/ccsrc/debug/CMakeLists.txt (+1, -0)
  2. mindspore/ccsrc/debug/debug_services.cc (+59, -278)
  3. mindspore/ccsrc/debug/debug_services.h (+45, -77)
  4. mindspore/ccsrc/debug/debugger/debug_grpc.proto (+6, -3)
  5. mindspore/ccsrc/debug/debugger/debugger.cc (+5, -3)
  6. mindspore/ccsrc/debug/debugger/tensor_summary.cc (+268, -0)
  7. mindspore/ccsrc/debug/debugger/tensor_summary.h (+120, -0)
  8. mindspore/ccsrc/debug/tensor_load.h (+6, -1)

mindspore/ccsrc/debug/CMakeLists.txt (+1, -0)

@@ -12,6 +12,7 @@ if (ENABLE_DEBUGGER)
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
)
endif (ENABLE_DEBUGGER)


mindspore/ccsrc/debug/debug_services.cc (+59, -278)

@@ -17,6 +17,8 @@
#include <map>
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/debug_services.h"
#include "debug/debugger/tensor_summary.h"

namespace mindspore {

DebugServices::DebugServices() {
@@ -49,9 +51,6 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
watchpoint_item.id = id;
watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
watchpoint_item.condition.parameter = parameter;
if (watch_condition > 2 && watch_condition < 13)
// odd condition codes map to "greater than" comparisons, even codes to "less than"
watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT";
watchpoint_item.check_node_list = check_node_list;
watchpoint_item.parameter_list = parameter_list;
watchpoint_table[id] = watchpoint_item;
@@ -62,77 +61,14 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
watchpoint_table.erase(id);
}

template <typename T>
DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, const T *start_prev, unsigned int n,
bool need_min_max, bool need_mean_sd,
bool need_zero_percentage,
bool need_tensor_update_ratio_mean, bool need_allclose,
bool need_abs_mean) {
tensor_stats stats;
double zero_count = 0.0;
double rtol = 1.0e-5;
double atol = 1.0e-8;
double update_ratio_sum = 0.0;
double epsilon = 1.0e-9;
for (unsigned int i = 0; i < n; ++i) {
auto val = static_cast<double>(start[i]);
double val_prev = 0.0;
if (start_prev) {
val_prev = static_cast<double>(start_prev[i]);
}
stats.has_nan = stats.has_nan || std::isnan(val);
stats.has_inf = stats.has_inf || std::isinf(val);
if (stats.has_inf && stats.has_nan) {
// other statistics don't make sense in this case
break;
}

if (need_min_max) {
stats.min = std::min(stats.min, val);
stats.max = std::max(stats.max, val);
}

if (need_mean_sd) {
double delta = val - stats.mean;
stats.mean += delta / (i + 1);
stats.m2 += delta * (val - stats.mean);
}

if (need_abs_mean) {
double delta = std::abs(val) - stats.abs_mean;
stats.abs_mean += delta / (i + 1);
}

if (need_zero_percentage) {
if (val == 0) zero_count++;
}

if (need_tensor_update_ratio_mean && start_prev) {
update_ratio_sum += (std::abs(val - val_prev) / (epsilon + std::abs(val_prev)));
}

if (need_allclose && start_prev) {
stats.allclose &= (std::abs(val - val_prev) <= (atol + rtol * std::abs(val_prev)));
}
}
if (need_tensor_update_ratio_mean && start_prev) {
stats.tensor_update_ratio_mean = (update_ratio_sum / n);
}
stats.zero_percentage = (zero_count / n) * 100;
stats.n = n;
return stats;
}

void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
std::vector<std::vector<parameter_t>> *parameters,
const std::vector<std::string> &op_overflows,
std::vector<int32_t> *error_codes, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list,
const bool init_dbg_suspend) {
std::lock_guard<std::mutex> lg(lock_);
if (watchpoint_table.empty()) {
return;
}
if (watchpoint_table.empty()) return;

for (const auto &tensor : tensor_list) {
const auto tensor_name = tensor->GetName();
@@ -140,268 +76,113 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
const auto tensor_slot = std::to_string(tensor->GetSlot());
mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
int tensor_dtype = tensor_ptr->data_type_c();
std::vector<unsigned int> hit_encountered;
std::vector<std::vector<bool>> hit_parms;
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
bool min_max_enabled = false;
bool mean_sd_enabled = false;
bool inf_nan_enabled = false;
bool zero_percentage_enabled = false;
bool tensor_update_ratio_mean_enabled = false;
bool allclose_enabled = false;
bool abs_mean_enabled = false;
std::vector<watchpoint_t> watchpoints_to_check;
std::string qualified_tensor_name;
for (auto w_table_item : watchpoint_table) {
auto wp = std::get<1>(w_table_item);
if (wp.condition.type == INIT && !init_dbg_suspend) continue;
if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue;
if (wp.IsNodeIncluded(tensor_name_no_slot)) {
min_max_enabled |= wp.min_max_enabled();
mean_sd_enabled |= wp.mean_sd_enabled();
inf_nan_enabled |= wp.inf_nan_enabled();
zero_percentage_enabled |= wp.zero_percentage_enabled();
tensor_update_ratio_mean_enabled |= wp.tensor_update_ratio_mean_enabled();
allclose_enabled |= wp.allclose_enabled();
abs_mean_enabled |= wp.abs_mean_enabled();
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
if (!found.empty()) {
qualified_tensor_name = found;
watchpoints_to_check.push_back(w_table_item.second);
}
}
tensor_stats stats;
uint num_elements = tensor_ptr->DataSize();
if (min_max_enabled || mean_sd_enabled || inf_nan_enabled || zero_percentage_enabled ||
tensor_update_ratio_mean_enabled || allclose_enabled || abs_mean_enabled) {
bool need_prev = (tensor_update_ratio_mean_enabled || allclose_enabled);
bool have_prev = tensor_loader_->GetPrevTensor(tensor_name) != NULL;
// no watchpoint is set on the current tensor
if (watchpoints_to_check.empty()) continue;

uint32_t num_elements = tensor_ptr->DataSize();
void *previous_tensor_ptr = tensor_loader_->GetPrevTensor(tensor_name)
? tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()
: nullptr;
std::unique_ptr<ITensorSummary> base_summary_ptr;
if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
switch (tensor_dtype) {
case kNumberTypeUInt8: {
auto start_addr = reinterpret_cast<uint8_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<uint8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt8: {
auto start_addr = reinterpret_cast<int8_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int8_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<int8_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeUInt16: {
auto start_addr = reinterpret_cast<uint16_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<uint16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt16: {
auto start_addr = reinterpret_cast<int16_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int16_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<int16_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeUInt32: {
auto start_addr = reinterpret_cast<uint32_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<uint32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt32:
case kNumberTypeInt: {
auto start_addr = reinterpret_cast<int32_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int32_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<int32_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeUInt64: {
auto start_addr = reinterpret_cast<uint64_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<uint64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<uint64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeInt64: {
auto start_addr = reinterpret_cast<int64_t *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<int64_t *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<int64_t>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeFloat16: {
auto start_addr = reinterpret_cast<float16 *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<float16 *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<float16>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeFloat32:
case kNumberTypeFloat: {
auto start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<float *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<float>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
case kNumberTypeFloat64: {
auto start_addr = reinterpret_cast<double *>(tensor_ptr->data_c());
auto start_addr_prev =
(need_prev && have_prev
? reinterpret_cast<double *>(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c())
: NULL);
stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled,
zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled,
abs_mean_enabled);
base_summary_ptr =
std::make_unique<TensorSummary<double>>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements);
break;
}
default:
MS_LOG(INFO) << "Unsupported tensor type";
break;
}
base_summary_ptr->SummarizeTensor(watchpoints_to_check);
}

for (auto &it : watchpoints_to_check_table) {
auto wp_id = it.second.id;
std::vector<bool> hit_p;
CONDITION_TYPE enabled_condition = it.second.condition.type;
bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) ||
(enabled_condition == GENERAL_OVERFLOW && (stats.has_nan || stats.has_inf)) ||
(enabled_condition == IS_OVERFLOW &&
std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());

if (enabled_condition > 2 && enabled_condition != GENERAL_OVERFLOW) {
if (stats.has_inf || stats.has_nan) {
MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check "
<< condition_label[enabled_condition] << " watchpoint.";
} else if (enabled_condition < 13) {
bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter;
bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter;
hit |= it.second.condition.comparison == "GT" ? gt : lt;
} else {
std::vector<parameter_t> parameter_list_item = it.second.parameter_list;
for (auto &p : parameter_list_item) {
if (p.disabled == false) {
bool p_hit = false;
if (p.name == "zero_percentage_ge") {
p_hit = stats.parmLookup(STAT_ZERO_PERCENTAGE) >= p.value;
} else if (p.name == "max_gt") {
p_hit = stats.parmLookup(STAT_MAX) > p.value;
} else if (p.name == "max_lt") {
p_hit = stats.parmLookup(STAT_MAX) < p.value;
} else if (p.name == "min_gt") {
p_hit = stats.parmLookup(STAT_MIN) > p.value;
} else if (p.name == "min_lt") {
p_hit = stats.parmLookup(STAT_MIN) < p.value;
} else if (p.name == "mean_gt") {
p_hit = stats.parmLookup(STAT_MEAN) > p.value;
} else if (p.name == "mean_lt") {
p_hit = stats.parmLookup(STAT_MEAN) < p.value;
} else if (p.name == "abs_mean_gt") {
p_hit = stats.parmLookup(STAT_ABS_MEAN) > p.value;
} else if (p.name == "abs_mean_lt") {
p_hit = stats.parmLookup(STAT_ABS_MEAN) < p.value;
} else if (p.name == "abs_update_ratio_mean_gt") {
p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) > p.value;
} else if (p.name == "abs_update_ratio_mean_lt") {
p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) < p.value;
}
hit |= p_hit;
hit_p.push_back(p_hit);
} else {
hit_p.push_back(false);
}
}

hit |= (enabled_condition == NOT_CHANGED && stats.parmLookup(STAT_ALLCLOSE));

if (hit) hit_parms.push_back(hit_p);
}
for (auto &wp : watchpoints_to_check) {
bool is_hit = false;
int error_code = 0;
std::vector<parameter_t> parameter_list = {};
if (wp.condition.type == IS_OVERFLOW) {
is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
} else {
auto item = base_summary_ptr->IsWatchpointHit(wp);
is_hit = std::get<0>(item);
error_code = std::get<1>(item);
parameter_list = std::get<2>(item);
}
if (hit) hit_encountered.push_back(wp_id);
}

unsigned int index_parm_list = 0;
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
// return fully qualified name for weights and bias to MI
auto found_dot = tensor_name_no_slot.find_last_of('.');
if (found_dot != std::string::npos && (tensor_name_no_slot.substr(found_dot + 1) == "weight" ||
tensor_name_no_slot.substr(found_dot + 1) == "bias")) {
auto check_node_list = watchpoint_table.find(*it_hit_id)->second.check_node_list;
bool found_match = false;
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
auto found_slash = w_name.find_last_of('/');
if (found_slash != std::string::npos && w_name.substr(found_slash + 1) == tensor_name_no_slot) {
name->push_back(w_name);
found_match = true;
break;
}
}
if (!found_match) {
name->push_back(tensor_name_no_slot);
}
} else {
name->push_back(tensor_name_no_slot);
}

if (is_hit || error_code) {
name->push_back(qualified_tensor_name);
slot->push_back(tensor_slot);
int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type;
condition->push_back(condition_item);
watchpoint_id->push_back(*it_hit_id);
std::vector<parameter_t> parameter_list_item = watchpoint_table.find(*it_hit_id)->second.parameter_list;
if (condition_item >= 13) {
unsigned int index_hit_parm = 0;
for (auto &p : parameter_list_item) {
p.hit = hit_parms[index_parm_list][index_hit_parm];
index_hit_parm++;
}
index_parm_list++;
}
parameters->push_back(parameter_list_item);
condition->push_back(wp.condition.type);
watchpoint_id->push_back(wp.id);
parameters->push_back(parameter_list);
error_codes->push_back(error_code);
}
watchpoints_to_check_table.erase(*it_hit_id);
}
}
}
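Note: the switch above replaces eleven near-identical SummarizeTensor invocations with a single TensorSummary<T> construction per dtype. A minimal sketch of the same dispatch, factored into a hypothetical MakeSummary helper (illustrative only, not part of this commit):

std::unique_ptr<ITensorSummary> MakeSummary(int dtype, void *cur, void *prev, uint32_t n) {
  switch (dtype) {
    case kNumberTypeFloat32:
    case kNumberTypeFloat:
      return std::make_unique<TensorSummary<float>>(cur, prev, n);
    case kNumberTypeFloat64:
      return std::make_unique<TensorSummary<double>>(cur, prev, n);
    // ... the remaining integer and float16 cases follow the same shape
    default:
      MS_LOG(INFO) << "Unsupported tensor type";
      return nullptr;
  }
}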


mindspore/ccsrc/debug/debug_services.h (+45, -77)

@@ -23,6 +23,7 @@
#include <tuple>
#include <unordered_map>
#include <mutex>
#include <map>
#include <limits>
#include "debug/tensor_load.h"
#include "debug/tensor_data.h"
@@ -60,23 +61,13 @@ class DebugServices {
ALL_ZERO,
CHANGE_TOO_LARGE,
CHANGE_TOO_SMALL,
NOT_CHANGED
};

enum STAT_TYPE {
STAT_MIN,
STAT_MAX,
STAT_MEAN,
STAT_ZERO_PERCENTAGE,
STAT_TENSOR_UPDATE_RATIO_MEAN,
STAT_ALLCLOSE,
STAT_ABS_MEAN
NOT_CHANGED,
RANGE
};

typedef struct condition {
CONDITION_TYPE type;
float parameter = 0;
std::string comparison;
} condition_t;

typedef struct parameter {
@@ -84,6 +75,25 @@ class DebugServices {
bool disabled;
double_t value;
bool hit;
double_t actual_value;
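// When inequality_type is empty, Evaluate() infers the comparison from the trailing token of the
// parameter name, e.g. "zero_percentage_ge" evaluates ">=" and "max_lt" evaluates "<".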
void Evaluate(double_t actualValue, std::string inequality_type) {
if (std::isnan(actualValue)) return;

actual_value = actualValue;
if (inequality_type.empty()) {
auto pos = name.find_last_of('_');
if (pos != std::string::npos) {
inequality_type = name.substr(pos + 1);
}
}

std::map<std::string, bool> condition_check{{"gt", actual_value > value},
{"lt", actual_value < value},
{"ge", actual_value >= value},
{"le", actual_value <= value}};

hit = condition_check[inequality_type];
}
} parameter_t;

typedef struct watchpoint {
@@ -93,18 +103,28 @@ class DebugServices {
std::vector<parameter_t> parameter_list;
size_t location = 0;

bool IsNodeIncluded(const std::string &tensor_name) {
std::string FindQualifiedTensorName(const std::string &tensor_name) {
std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':'));
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
bool w_type = std::get<1>(check_node);
auto found = w_name.find_last_of('/');
if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true;
if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return w_name;
if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) {
return true;
return w_name;
}
}
return false;
return {};
}

bool is_gt_wp() {
return condition.type == MAX_GT || condition.type == MIN_GT || condition.type == MEAN_GT ||
condition.type == SD_GT || condition.type == MAX_MIN_GT;
}

bool is_lt_wp() {
return condition.type == MAX_LT || condition.type == MIN_LT || condition.type == MEAN_LT ||
condition.type == SD_LT || condition.type == MAX_MIN_LT;
}

bool min_max_enabled() {
@@ -119,67 +139,26 @@ class DebugServices {
return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW;
}
// mean or sd related condition set
bool mean_sd_enabled() {
bool mean_sd_enabled() const {
return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) ||
(condition.type == TOO_SMALL && !parameter_list[3].disabled);
}
bool abs_mean_enabled() {
bool abs_mean_enabled() const {
return (condition.type == TOO_LARGE && !parameter_list[0].disabled) ||
(condition.type == TOO_SMALL && !parameter_list[0].disabled);
}
bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; }
bool tensor_update_ratio_mean_enabled() {
return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
}
bool allclose_enabled() { return condition.type == NOT_CHANGED; }
} watchpoint_t;

struct tensor_stats {
double min = std::numeric_limits<double>::max();
double max = std::numeric_limits<double>::lowest();
bool has_inf = false;
bool has_nan = false;
unsigned int n = 0;
double mean = 0.0;
double m2 = 0.0;
double zero_percentage = 0.0;
double tensor_update_ratio_mean = -1;
bool allclose = false;
double abs_mean = 0.0;

double statLookup(CONDITION_TYPE type) const {
if (type == MAX_GT || type == MAX_LT) return max;
if (type == MIN_GT || type == MIN_LT) return min;
if (type == MAX_MIN_GT || type == MAX_MIN_LT) return (max - min);
if (type == MEAN_GT || type == MEAN_LT) return mean;
if (type == SD_GT || type == SD_LT) return getStandardDeviation();
return std::numeric_limits<double>::quiet_NaN();
bool tensor_update_ratio_mean_enabled() const {
return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
}
bool allclose_enabled() const { return condition.type == NOT_CHANGED; }

double parmLookup(STAT_TYPE type) const {
if (type == STAT_MAX) return max;
if (type == STAT_MIN) return min;
if (type == STAT_MEAN) return mean;
if (type == STAT_ZERO_PERCENTAGE) return zero_percentage;
if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean;
if (type == STAT_ALLCLOSE) return allclose;
if (type == STAT_ABS_MEAN) return abs_mean;
return std::numeric_limits<double>::quiet_NaN();
bool range_enabled() const {
return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
}

double getMean() const { return mean; }

double getVariance() const {
if (n > 1) {
return m2 / (n - 1);
} else {
return 0.0;
}
}

double getStandardDeviation() const { return sqrt(getVariance()); }
};
} watchpoint_t;

void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list,
@@ -189,7 +168,7 @@ class DebugServices {

void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
const std::vector<std::string> &op_overflows,
std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend);

void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
@@ -210,19 +189,8 @@ class DebugServices {
std::mutex lock_;

std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
std::vector<std::string> condition_label = {
"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT",
"MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT",
"MEAN_LT", "SD_GT", "SD_LT", "GENERAL_OVERFLOW", "INIT",
"TOO_LARGE", "TOO_SMALL", "ALL_ZERO", "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL",
"NOT_CHANGED"};

TensorLoader *tensor_loader_;

template <typename T>
static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max,
bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean,
bool need_allclose, bool need_abs_mean_sd);
};
} // namespace mindspore



mindspore/ccsrc/debug/debugger/debug_grpc.proto (+6, -3)

@@ -37,14 +37,14 @@ message Metadata {
// the full name of current node
string cur_node = 4;
// check if training is done.
bool training_done = 5;
bool training_done = 5;
// the number of total graphs
int32 graph_num = 6;
}

message Chunk {
bytes buffer = 1;
bool finished = 2;
bytes buffer = 1;
bool finished = 2;
}

message EventReply {
@@ -108,6 +108,7 @@ message WatchCondition {
tensor_change_too_large = 18;
tensor_change_too_small = 19;
tensor_not_changed = 20;
tensor_range = 21;
}
Condition condition = 1;
float value = 2;
@@ -116,6 +117,7 @@ message WatchCondition {
bool disabled = 2;
double value = 3;
bool hit = 4; // Whether this parameter is hit when checking tensor.
double actual_value = 5;
}
repeated Parameter params = 4;
}
@@ -129,4 +131,5 @@ message WatchpointHit {
TensorProto tensor = 1;
WatchCondition watch_condition = 2;
int32 id = 3;
int32 error_code = 4;
}
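The two new fields carry per-hit diagnostics back to the client. Assuming the standard protobuf C++ codegen for this .proto (accessors named after the lowercased fields; the snippet is illustrative, not part of the commit), a receiver could read them as:

#include <iostream>
// Generated header for debug_grpc.proto is assumed to be included.
void PrintHit(const WatchpointHit &hit) {
  std::cout << "watchpoint " << hit.id() << " error_code=" << hit.error_code() << "\n";
  for (const auto &p : hit.watch_condition().params()) {
    std::cout << "  value=" << p.value() << " actual_value=" << p.actual_value()
              << " hit=" << p.hit() << "\n";
  }
}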

mindspore/ccsrc/debug/debugger/debugger.cc (+5, -3)

@@ -757,6 +757,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
std::vector<unsigned int> watchpoint_id;
std::vector<std::string> overflow_ops;
std::vector<std::vector<DebugServices::parameter_t>> parameters;
std::vector<int32_t> error_codes;
#ifdef ENABLE_D
overflow_ops = CheckOpOverflow();
#endif
@@ -768,14 +769,14 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
tensor_list = tensor_loader->GetNodeTensorMap(watchnode);
debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
}
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, overflow_ops, tensor_list,
initial_suspend_);
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
tensor_list, initial_suspend_);
std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit;
std::vector<DebugServices::parameter_t> &parameter = parameters[i];
hit.set_id(watchpoint_id[i]);
hit.set_error_code(error_codes[i]);
// here TensorProto act as a tensor indicator, not sending tensor content
TensorProto *tensor_item = hit.mutable_tensor();
tensor_item->set_node_name(name[i]);
@@ -790,6 +791,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
x->set_disabled(p.disabled);
x->set_value(p.value);
x->set_hit(p.hit);
x->set_actual_value(p.actual_value);
}
hits.push_back(hit);
}


mindspore/ccsrc/debug/debugger/tensor_summary.cc (+268, -0)

@@ -0,0 +1,268 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <math.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <bitset>
#include <tuple>
#include "debug/debugger/tensor_summary.h"

namespace mindspore {
using CONDITION_TYPE = DebugServices::CONDITION_TYPE;

RangeCountCalculator::RangeCountCalculator()
: range_start_inclusive(-std::numeric_limits<double>::infinity()),
range_end_inclusive(std::numeric_limits<double>::infinity()),
count(0),
total(0) {}

void RangeCountCalculator::ProcessElement(double element) {
count += (element >= range_start_inclusive && element <= range_end_inclusive);
total += 1;
}

double RangeCountCalculator::GetPercentInRange() {
if (total == 0) {
return 0.0;
}
return 100.0 * count / total;
}

AllCloseCalculator::AllCloseCalculator() : atol(1.0e-8), rtol(1.0e-5), result(true) {}

void AllCloseCalculator::ProcessElement(double current, double previous) {
result &= (std::abs(current - previous) <= (atol + rtol * std::abs(previous)));
}

bool AllCloseCalculator::IsAllClose() { return result; }
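// AllCloseCalculator mirrors the numpy.isclose criterion: an element passes when
// |current - previous| <= atol + rtol * |previous|, with defaults atol = 1e-8 and rtol = 1e-5.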

MeanCalculator::MeanCalculator() : mean(0.0), count(0) {}

void MeanCalculator::ProcessElement(double value) {
count += 1;
double delta = value - mean;
mean += delta / count;
}

double MeanCalculator::GetMean() { return mean; }

VarianceAndMeanCalculator::VarianceAndMeanCalculator() : mean(0.0), count(0), m2(0.0) {}

void VarianceAndMeanCalculator::ProcessElement(double value) {
count += 1;
double delta = value - mean;
mean += delta / count;
m2 += delta * (value - mean);
}

double VarianceAndMeanCalculator::GetMean() { return mean; }

double VarianceAndMeanCalculator::GetVariance() {
if (count > 1) {
return m2 / (count - 1);
} else {
return 0.0;
}
}

double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVariance()); }
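// VarianceAndMeanCalculator is Welford's online algorithm: each element updates the running mean by
// delta / count and accumulates m2 += (x - old_mean) * (x - new_mean), so GetVariance() yields the
// unbiased sample variance m2 / (count - 1) in a single pass.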

template <typename T>
TensorSummary<T>::TensorSummary(void *current_tensor_ptr, void *previous_tensor_ptr, uint32_t num_elements)
: current_tensor_ptr(reinterpret_cast<T *>(current_tensor_ptr)),
prev_tensor_ptr(reinterpret_cast<T *>(previous_tensor_ptr)),
num_elements(num_elements),
min(std::numeric_limits<double>::max()),
max(std::numeric_limits<double>::lowest()),
inf_count(0),
nan_count(0),
zero_count(0),
epsilon(1.0e-9),
mean_sd_cal_enabled(false) {}

template <typename T>
void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &wps) {
InitCalculators(wps);
for (size_t i = 0; i < num_elements; ++i) {
auto current_value = static_cast<double>(current_tensor_ptr[i]);
double previous_value =
prev_tensor_ptr ? static_cast<double>(prev_tensor_ptr[i]) : std::numeric_limits<double>::quiet_NaN();
inf_count += std::isinf(current_value);
nan_count += std::isnan(current_value);
zero_count += (current_value == 0);
max = std::max(max, current_value);
min = std::min(min, current_value);
if (mean_sd_cal_enabled) {
current_mean_variance.ProcessElement(current_value);
}
for (auto &it : all_close) {
it.second->ProcessElement(current_value, previous_value);
}
for (auto &range_count : range_counts) {
range_count.second->ProcessElement(current_value);
}
for (auto &mean : means) {
if (mean.first == "curr_prev_diff_mean") {
mean.second->ProcessElement(std::abs(current_value - previous_value));
} else if (mean.first == "abs_prev_mean") {
mean.second->ProcessElement(std::abs(previous_value));
} else if (mean.first == "abs_current_mean") {
mean.second->ProcessElement(std::abs(current_value));
}
}
}
}

template <typename T>
std::tuple<bool, int, std::vector<DebugServices::parameter_t>> TensorSummary<T>::IsWatchpointHit(
DebugServices::watchpoint_t wp) {
auto parameter_list = wp.parameter_list;
bool hit = false;
std::bitset<32> error_code;
CONDITION_TYPE type = wp.condition.type;

error_code.set(0, nan_count > 0);
error_code.set(1, inf_count > 0);
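// error_code bit 0 flags NaN values, bit 1 flags Inf values; the NaN/Inf/overflow conditions below
// clear the bits (they report via `hit`), while any set bit suppresses the statistics-based
// parameter checks.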

if (type == CONDITION_TYPE::HAS_NAN) {
error_code.reset();
hit = nan_count > 0;
} else if (type == CONDITION_TYPE::HAS_INF) {
error_code.reset();
hit = inf_count > 0;
} else if (type == CONDITION_TYPE::GENERAL_OVERFLOW) {
error_code.reset();
hit = (nan_count + inf_count) > 0;
} else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr && error_code.none()) {
hit = all_close[wp.id]->IsAllClose();
}

for (auto &parameter : parameter_list) {
if (parameter.disabled || error_code.any()) {
continue;
}
std::string inequality_type;
if (wp.is_gt_wp()) {
inequality_type = "gt";
} else if (wp.is_lt_wp()) {
inequality_type = "lt";
}
parameter.Evaluate(StatLookup(parameter.name, wp), inequality_type);
hit |= parameter.hit;
}
return std::make_tuple(hit, static_cast<int32_t>(error_code.to_ulong()), parameter_list);
}

template <typename T>
double_t TensorSummary<T>::StatLookup(const std::string &parameter_name, const DebugServices::watchpoint_t &wp) {
if (parameter_name == "param") return StatLookup(wp);
std::string param_type;
auto pos = parameter_name.find_last_of('_');
if (pos != std::string::npos) {
param_type = parameter_name.substr(0, pos);
}

if (param_type == "max") {
return max;
} else if (param_type == "min") {
return min;
} else if (param_type == "max_min") {
return max - min;
} else if (param_type == "mean") {
return current_mean_variance.GetMean();
} else if (param_type == "sd") {
return current_mean_variance.GetStandardDeviation();
} else if (param_type == "abs_mean") {
return means["abs_current_mean"]->GetMean();
} else if (param_type == "abs_mean_update_ratio") {
return means["curr_prev_diff_mean"]->GetMean() / (means["abs_prev_mean"]->GetMean() + epsilon);
} else if (param_type == "range_percentage") {
return range_counts[wp.id]->GetPercentInRange();
} else if (param_type == "zero_percentage") {
return GetZeroValPercent();
}
return std::numeric_limits<double_t>::quiet_NaN();
}

template <typename T>
double_t TensorSummary<T>::StatLookup(const DebugServices::watchpoint_t &wp) {
CONDITION_TYPE type = wp.condition.type;
if (type == CONDITION_TYPE::MAX_LT || type == CONDITION_TYPE::MAX_GT) {
return max;
} else if (type == CONDITION_TYPE::MIN_LT || type == CONDITION_TYPE::MIN_GT) {
return min;
} else if (type == CONDITION_TYPE::MEAN_LT || type == CONDITION_TYPE::MEAN_GT) {
return current_mean_variance.GetMean();
} else if (type == CONDITION_TYPE::SD_LT || type == CONDITION_TYPE::SD_GT) {
return current_mean_variance.GetStandardDeviation();
} else if (type == CONDITION_TYPE::MAX_MIN_GT || type == CONDITION_TYPE::MAX_MIN_LT) {
return max - min;
}
return std::numeric_limits<double_t>::quiet_NaN();
}

template <typename T>
double_t TensorSummary<T>::GetZeroValPercent() {
if (num_elements == 0) {
return 0;
}

return (zero_count * 100.0) / num_elements;
}

template <typename T>
void TensorSummary<T>::InitCalculators(const std::vector<DebugServices::watchpoint_t> &wps) {
for (auto &wp : wps) {
auto wp_id = wp.id;
mean_sd_cal_enabled |= wp.mean_sd_enabled();
if (wp.allclose_enabled() && prev_tensor_ptr) {
all_close[wp_id] = std::make_unique<AllCloseCalculator>();
if (!wp.parameter_list[0].disabled) {
all_close[wp_id]->set_atol(wp.parameter_list[0].value);
}
if (!wp.parameter_list[1].disabled) {
all_close[wp_id]->set_rtol(wp.parameter_list[1].value);
}
} else if (wp.range_enabled()) {
range_counts[wp_id] = std::make_unique<RangeCountCalculator>();
if (!wp.parameter_list[0].disabled) {
range_counts[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value);
}
if (!wp.parameter_list[1].disabled) {
range_counts[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
}
} else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr) {
means.insert({"curr_prev_diff_mean", std::make_unique<MeanCalculator>()});
means.insert({"abs_prev_mean", std::make_unique<MeanCalculator>()});
} else if (wp.abs_mean_enabled()) {
means.insert({"abs_current_mean", std::make_unique<MeanCalculator>()});
}
}
}
template class TensorSummary<uint8_t>;
template class TensorSummary<int8_t>;
template class TensorSummary<uint16_t>;
template class TensorSummary<int16_t>;
template class TensorSummary<uint32_t>;
template class TensorSummary<int32_t>;
template class TensorSummary<uint64_t>;
template class TensorSummary<int64_t>;
template class TensorSummary<float16>;
template class TensorSummary<float>;
template class TensorSummary<double>;
} // namespace mindspore

mindspore/ccsrc/debug/debugger/tensor_summary.h (+120, -0)

@@ -0,0 +1,120 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_TENSOR_SUMMARY_H
#define MINDSPORE_TENSOR_SUMMARY_H

#include <vector>
#include <unordered_map>
#include <tuple>
#include <memory>
#include <string>

#include "debug/debug_services.h"

namespace mindspore {
class RangeCountCalculator {
public:
RangeCountCalculator();
void ProcessElement(double element);
double GetPercentInRange();
void set_range_start_inclusive(double value) { range_start_inclusive = value; }
void set_range_end_inclusive(double value) { range_end_inclusive = value; }

private:
double range_start_inclusive;
double range_end_inclusive;
int count;
int total;
};

class AllCloseCalculator {
public:
AllCloseCalculator();
void ProcessElement(double current, double previous);
bool IsAllClose();
void set_atol(double value) { atol = value; }
void set_rtol(double value) { rtol = value; }

private:
double atol;
double rtol;
bool result;
};

class MeanCalculator {
public:
MeanCalculator();
void ProcessElement(double value);
double GetMean();

protected:
double mean;
int count;
};

class VarianceAndMeanCalculator {
public:
VarianceAndMeanCalculator();
void ProcessElement(double value);
double GetStandardDeviation();
double GetVariance();
double GetMean();

private:
double mean;
int count;
double m2;
};

class ITensorSummary {
public:
virtual ~ITensorSummary() = default;
virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
DebugServices::watchpoint_t) = 0;
};

template <typename T>
class TensorSummary : public ITensorSummary {
public:
TensorSummary() = default;
TensorSummary(void *, void *, uint32_t);
void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) override;
// returns hit, error_code, parameter_list
std::tuple<bool, int, std::vector<DebugServices::parameter_t>> IsWatchpointHit(DebugServices::watchpoint_t) override;

private:
T *current_tensor_ptr;
T *prev_tensor_ptr;
uint32_t num_elements;
double min;
double max;
uint32_t inf_count;
uint32_t nan_count;
uint32_t zero_count;
double epsilon;
bool mean_sd_cal_enabled;
VarianceAndMeanCalculator current_mean_variance;
std::unordered_map<std::string, std::unique_ptr<MeanCalculator>> means;
std::unordered_map<uint32_t, std::unique_ptr<AllCloseCalculator>> all_close;
std::unordered_map<uint32_t, std::unique_ptr<RangeCountCalculator>> range_counts;
double_t StatLookup(const DebugServices::watchpoint_t &);
double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &);
double_t GetZeroValPercent();
void InitCalculators(const std::vector<DebugServices::watchpoint_t> &);
};
} // namespace mindspore
#endif // MINDSPORE_TENSOR_SUMMARY_H
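A minimal usage sketch of this interface (illustrative; cur_data, prev_data, num_elements and the watchpoint vector wps are assumed to be set up by the caller):

std::unique_ptr<ITensorSummary> summary =
    std::make_unique<TensorSummary<float>>(cur_data, prev_data, num_elements);
summary->SummarizeTensor(wps);                   // single pass over all elements
auto result = summary->IsWatchpointHit(wps[0]);  // returns (hit, error_code, parameter_list)
bool hit = std::get<0>(result);
int32_t error_code = std::get<1>(result);
auto checked_params = std::get<2>(result);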

mindspore/ccsrc/debug/tensor_load.h (+6, -1)

@@ -56,7 +56,12 @@ class TensorLoader {

std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }

std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) { return tensor_list_map[tensor_name + ":prev"]; }
std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) {
if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) {
return tensor_list_map[tensor_name + ":prev"];
}
return nullptr;
}
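// Using find() first avoids std::map::operator[] default-inserting an empty entry when no ":prev"
// snapshot exists; callers such as CheckWatchpoints can now null-check the returned pointer.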

std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) {
std::vector<std::shared_ptr<TensorData>> tensors;

