You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 11 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <algorithm>
  17. #include "debug/debug_services.h"
  18. namespace mindspore {
  19. DebugServices::DebugServices() {
  20. tensor_loader_ = new TensorLoader();
  21. uint32_t iter_num = -1;
  22. tensor_loader_->set_iter_num(iter_num);
  23. }
// Copy constructor: copies the watchpoint table and shares the TensorLoader
// pointer with `other`.
// NOTE(review): this is a shallow copy of an owning raw pointer while the
// destructor unconditionally deletes tensor_loader_ — destroying both the
// original and the copy would double-delete. Confirm the intended ownership
// model (Rule of Three violation as written).
DebugServices::DebugServices(const DebugServices &other) {
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table = other.watchpoint_table;
}
// Copy assignment: copies the watchpoint table and shares the TensorLoader
// pointer with `other`. Self-assignment safe.
// NOTE(review): the previously owned tensor_loader_ is overwritten without
// being freed (leak), and the pointer is then aliased with `other` — combined
// with the deleting destructor this risks a double delete. Verify intended
// ownership semantics before relying on copies of DebugServices.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table = other.watchpoint_table;
  }
  return *this;
}
  35. DebugServices::~DebugServices() { delete tensor_loader_; }
  36. void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
  37. const std::vector<std::tuple<std::string, bool>> &check_node_list) {
  38. std::lock_guard<std::mutex> lg(lock_);
  39. watchpoint_t watchpoint_item;
  40. watchpoint_item.id = id;
  41. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  42. watchpoint_item.condition.parameter = parameter;
  43. if (watch_condition > 2)
  44. // odd indices are greater than conditions and even indicies are less than
  45. watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? "LT" : "GT";
  46. watchpoint_item.check_node_list = check_node_list;
  47. watchpoint_table[id] = watchpoint_item;
  48. }
// Unregisters the watchpoint with the given id (no-op if the id is unknown).
void DebugServices::RemoveWatchpoint(unsigned int id) {
  std::lock_guard<std::mutex> lg(lock_);
  watchpoint_table.erase(id);
}
  53. template <typename T>
  54. DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, unsigned int n, bool need_min_max,
  55. bool need_mean_sd) {
  56. tensor_stats stats;
  57. for (unsigned int i = 0; i < n; ++i) {
  58. auto val = static_cast<double>(start[i]);
  59. stats.has_nan = stats.has_nan || std::isnan(val);
  60. stats.has_inf = stats.has_inf || std::isinf(val);
  61. if (stats.has_inf && stats.has_nan) {
  62. // other statistics don't make sense in this case
  63. break;
  64. }
  65. if (need_min_max) {
  66. stats.min = std::min(stats.min, val);
  67. stats.max = std::max(stats.max, val);
  68. }
  69. if (need_mean_sd) {
  70. double delta = val - stats.mean;
  71. stats.mean += delta / (i + 1);
  72. stats.m2 += delta * (val - stats.mean);
  73. }
  74. }
  75. stats.n = n;
  76. return stats;
  77. }
  78. void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
  79. std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
  80. const std::vector<std::string> &op_overflows,
  81. const std::vector<std::shared_ptr<TensorData>> &tensor_list) {
  82. std::lock_guard<std::mutex> lg(lock_);
  83. if (watchpoint_table.empty()) {
  84. return;
  85. }
  86. for (const auto &tensor : tensor_list) {
  87. const auto tensor_name = tensor->GetName();
  88. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  89. const auto tensor_slot = std::to_string(tensor->GetSlot());
  90. mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor();
  91. int tensor_dtype = tensor_ptr->data_type_c();
  92. std::vector<unsigned int> hit_encountered;
  93. std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
  94. bool min_max_enabled = false;
  95. bool mean_sd_enabled = false;
  96. bool inf_nan_enabled = false;
  97. for (auto w_table_item : watchpoint_table) {
  98. auto wp = std::get<1>(w_table_item);
  99. if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue;
  100. if (wp.IsNodeIncluded(tensor_name_no_slot)) {
  101. min_max_enabled |= wp.min_max_enabled();
  102. mean_sd_enabled |= wp.mean_sd_enabled();
  103. inf_nan_enabled |= wp.inf_nan_enabled();
  104. watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
  105. }
  106. }
  107. tensor_stats stats;
  108. uint num_elements = tensor_ptr->DataSize();
  109. if (min_max_enabled || mean_sd_enabled || inf_nan_enabled) {
  110. switch (tensor_dtype) {
  111. case kNumberTypeUInt8: {
  112. auto start_addr = reinterpret_cast<uint8_t *>(tensor_ptr->data_c());
  113. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  114. break;
  115. }
  116. case kNumberTypeInt8: {
  117. auto start_addr = reinterpret_cast<int8_t *>(tensor_ptr->data_c());
  118. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  119. break;
  120. }
  121. case kNumberTypeUInt16: {
  122. auto start_addr = reinterpret_cast<uint16_t *>(tensor_ptr->data_c());
  123. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  124. break;
  125. }
  126. case kNumberTypeInt16: {
  127. auto start_addr = reinterpret_cast<int16_t *>(tensor_ptr->data_c());
  128. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  129. break;
  130. }
  131. case kNumberTypeUInt32: {
  132. auto start_addr = reinterpret_cast<uint32_t *>(tensor_ptr->data_c());
  133. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  134. break;
  135. }
  136. case kNumberTypeInt32:
  137. case kNumberTypeInt: {
  138. auto start_addr = reinterpret_cast<int32_t *>(tensor_ptr->data_c());
  139. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  140. break;
  141. }
  142. case kNumberTypeUInt64: {
  143. auto start_addr = reinterpret_cast<uint64_t *>(tensor_ptr->data_c());
  144. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  145. break;
  146. }
  147. case kNumberTypeInt64: {
  148. auto start_addr = reinterpret_cast<int64_t *>(tensor_ptr->data_c());
  149. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  150. break;
  151. }
  152. case kNumberTypeFloat16: {
  153. auto start_addr = reinterpret_cast<float16 *>(tensor_ptr->data_c());
  154. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  155. break;
  156. }
  157. case kNumberTypeFloat32:
  158. case kNumberTypeFloat: {
  159. auto start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
  160. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  161. break;
  162. }
  163. case kNumberTypeFloat64: {
  164. auto start_addr = reinterpret_cast<double *>(tensor_ptr->data_c());
  165. stats = SummarizeTensor(start_addr, num_elements, min_max_enabled, mean_sd_enabled);
  166. break;
  167. }
  168. default:
  169. MS_LOG(INFO) << "Unsupported tensor type";
  170. break;
  171. }
  172. }
  173. for (auto &it : watchpoints_to_check_table) {
  174. auto wp_id = it.second.id;
  175. CONDITION_TYPE enabled_condition = it.second.condition.type;
  176. bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) ||
  177. (enabled_condition == IS_OVERFLOW &&
  178. std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
  179. if (enabled_condition > 2) {
  180. if (stats.has_inf || stats.has_nan) {
  181. MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check "
  182. << condition_label[enabled_condition] << " watchpoint.";
  183. } else {
  184. bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter;
  185. bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter;
  186. hit |= it.second.condition.comparison == "GT" ? gt : lt;
  187. }
  188. }
  189. if (hit) hit_encountered.push_back(wp_id);
  190. }
  191. for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
  192. if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
  193. name->push_back(tensor_name_no_slot);
  194. slot->push_back(tensor_slot);
  195. int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type;
  196. condition->push_back(condition_item);
  197. watchpoint_id->push_back(*it_hit_id);
  198. }
  199. watchpoints_to_check_table.erase(*it_hit_id);
  200. }
  201. }
  202. }
  203. void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
  204. std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
  205. std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {
  206. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  207. tensor_loader_->SearchTensors(name, &result_list);
  208. for (auto result : result_list) {
  209. if (!std::get<1>(result)) {
  210. continue;
  211. }
  212. ret_name->push_back(std::get<0>(result));
  213. data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetTensor()->data_c()));
  214. data_size->push_back(std::get<1>(result)->GetTensor()->data().nbytes());
  215. dtype->push_back(std::get<1>(result)->GetTensor()->Dtype());
  216. shape->push_back(std::get<1>(result)->GetTensor()->shape());
  217. }
  218. }
  219. bool DebugServices::IsWatchPoint(std::string kernel_name) {
  220. bool ret = false;
  221. for (auto w_table_item : watchpoint_table) {
  222. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  223. for (auto check_node : check_node_list) {
  224. std::string w_name = std::get<0>(check_node);
  225. bool w_type = std::get<1>(check_node);
  226. if ((w_type == true &&
  227. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  228. (w_type == false && kernel_name == w_name)) {
  229. ret = true;
  230. return ret;
  231. }
  232. }
  233. }
  234. return ret;
  235. }
  236. TensorLoader *DebugServices::tensor_loader() const { return tensor_loader_; }
// Returns the watchpoint table by value (a full copy).
// NOTE(review): unlike Add/RemoveWatchpoint, this reads watchpoint_table
// without taking lock_ — confirm it is only called from the mutating thread.
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table;
}
  240. } // namespace mindspore