You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
#include "debug/debug_services.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
  17. namespace mindspore {
  18. DebugServices::DebugServices() {
  19. tensor_loader_ = new TensorLoader();
  20. uint32_t iter_num = -1;
  21. tensor_loader_->set_iter_num(iter_num);
  22. }
// Copy constructor.
// NOTE(review): this copies the raw tensor_loader_ pointer, so the copy and
// the original alias the same TensorLoader while the destructor does a
// `delete` — looks like a double-free hazard if both instances are destroyed;
// confirm the intended ownership model.
DebugServices::DebugServices(const DebugServices &other) {
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table = other.watchpoint_table;
}
// Copy assignment with self-assignment guard.
// NOTE(review): like the copy constructor, this aliases the raw
// tensor_loader_ pointer and also drops this object's previous loader without
// deleting it — potential leak plus double-free hazard; confirm ownership.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table = other.watchpoint_table;
  }
  return *this;
}
// Destructor: releases the TensorLoader allocated in the default constructor.
// NOTE(review): instances created via copy construction/assignment share this
// pointer, so destroying both would delete it twice — confirm ownership.
DebugServices::~DebugServices() { delete tensor_loader_; }
  35. void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
  36. const std::vector<std::tuple<std::string, bool>> &check_node_list) {
  37. std::lock_guard<std::mutex> lg(lock_);
  38. watchpoint_t watchpoint_item;
  39. watchpoint_item.id = id;
  40. if (watch_condition == 0) {
  41. watchpoint_item.conditions.nan.enabled = true;
  42. } else if (watch_condition == 1) {
  43. watchpoint_item.conditions.inf.enabled = true;
  44. watchpoint_item.conditions.neg_inf.enabled = true;
  45. } else if (watch_condition == 2) {
  46. watchpoint_item.conditions.overflow.enabled = true;
  47. }
  48. watchpoint_item.check_node_list = check_node_list;
  49. watchpoint_table[id] = watchpoint_item;
  50. }
  51. void DebugServices::RemoveWatchpoint(unsigned int id) {
  52. std::lock_guard<std::mutex> lg(lock_);
  53. watchpoint_table.erase(id);
  54. }
  55. void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
  56. std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
  57. const std::vector<std::string> &op_overflows) {
  58. std::lock_guard<std::mutex> lg(lock_);
  59. std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
  60. std::string current_tensor_name;
  61. std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
  62. const size_t location = 0;
  63. for (std::size_t i = 0; i < tensor_list.size(); i++) {
  64. current_tensor_name = tensor_list[i]->GetName();
  65. std::string tensor_slot = std::to_string(tensor_list[i]->GetSlot());
  66. mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
  67. int tensor_data_type = tensor_ptr->data_type_c();
  68. // check if we need to analyze this node and for which watchpoints we will check
  69. // create a list of watchpoints to check
  70. watchpoints_to_check_table.clear();
  71. for (auto w_table_item : watchpoint_table) {
  72. // if the watchpoint is checking for a nan or inf and the current tensor is not of a float type, then
  73. // don't check the watchpoint for this tensor
  74. if (std::get<1>(w_table_item).conditions.inf.enabled || std::get<1>(w_table_item).conditions.neg_inf.enabled ||
  75. std::get<1>(w_table_item).conditions.nan.enabled) {
  76. if (tensor_data_type != kNumberTypeFloat16 && tensor_data_type != kNumberTypeFloat &&
  77. tensor_data_type != kNumberTypeFloat32 && tensor_data_type != kNumberTypeFloat64) {
  78. continue;
  79. }
  80. }
  81. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  82. for (auto check_node : check_node_list) {
  83. std::string w_name = std::get<0>(check_node);
  84. bool w_type = std::get<1>(check_node);
  85. // check if the current node tensor name is included the watchpoint
  86. std::string current_node_name = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
  87. if ((w_type == true && (current_tensor_name.find(w_name) == location || w_name == "*")) ||
  88. (w_type == false && current_node_name == w_name)) {
  89. watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
  90. break;
  91. }
  92. }
  93. }
  94. std::vector<unsigned int> hit_encountered;
  95. // handle watchpoint conditions that do not require per element checks
  96. for (auto it_w_table_check = watchpoints_to_check_table.begin();
  97. it_w_table_check != watchpoints_to_check_table.end(); ++it_w_table_check) {
  98. if (it_w_table_check->second.conditions.overflow.enabled) {
  99. std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
  100. if (std::find(op_overflows.begin(), op_overflows.end(), name_no_slot) != op_overflows.end()) {
  101. hit_encountered.push_back(it_w_table_check->second.id);
  102. }
  103. }
  104. }
  105. if (hit_encountered.size()) {
  106. HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
  107. &watchpoints_to_check_table, tensor_slot);
  108. hit_encountered.clear();
  109. }
  110. // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
  111. if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
  112. continue;
  113. }
  114. // check if no watchpoints are remaining
  115. if (watchpoints_to_check_table.empty()) {
  116. continue;
  117. }
  118. float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
  119. unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
  120. std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
  121. for (unsigned int index = 0; index < num_elements; index++) {
  122. float x = start_addr[index];
  123. it_w_table_check = watchpoints_to_check_table.begin();
  124. while (it_w_table_check != watchpoints_to_check_table.end()) {
  125. if ((it_w_table_check->second.conditions.inf.enabled || it_w_table_check->second.conditions.neg_inf.enabled) &&
  126. isinf(x)) {
  127. hit_encountered.push_back(it_w_table_check->second.id);
  128. } else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
  129. hit_encountered.push_back(it_w_table_check->second.id);
  130. }
  131. ++it_w_table_check;
  132. }
  133. if (hit_encountered.size()) {
  134. HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
  135. &watchpoints_to_check_table, tensor_slot);
  136. hit_encountered.clear();
  137. }
  138. if (watchpoints_to_check_table.empty()) {
  139. break;
  140. }
  141. }
  142. }
  143. }
  144. void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered,
  145. std::vector<std::string> *name, std::vector<std::string> *slot,
  146. std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
  147. std::string current_tensor_name,
  148. std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
  149. std::string tensor_slot) {
  150. for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
  151. if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) {
  152. std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
  153. name->push_back(name_no_slot);
  154. slot->push_back(tensor_slot);
  155. int condition_item = -1;
  156. if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
  157. condition_item = 0;
  158. } else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
  159. watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
  160. condition_item = 1;
  161. } else if (watchpoint_table[*it_hit_id].conditions.overflow.enabled) {
  162. condition_item = 2;
  163. }
  164. condition->push_back(condition_item);
  165. watchpoint_id->push_back(*it_hit_id);
  166. }
  167. watchpoints_to_check_table->erase(*it_hit_id);
  168. }
  169. }
// Checks one tensor against the registered watchpoints; on a NaN/Inf hit the
// output parameters are filled with the hit details.
// NOTE(review): if several watchpoints match this tensor, the search below
// keeps the LAST match — the inner `break` only exits the check-node loop
// while the outer loop keeps overwriting watchpoint_to_check. Confirm whether
// first-match semantics were intended.
// NOTE(review): if nothing matches, watchpoint_to_check stays
// default-constructed yet the element scan still runs; presumably all
// condition flags default to disabled so nothing is reported — verify.
// NOTE(review): "wacthpoint_id" is a typo for "watchpoint_id", but renaming
// requires touching the header declaration.
void DebugServices::CheckSingleWatchpoint(std::shared_ptr<TensorData> watchtensor, std::string *name, std::string *slot,
                                          char **data_ptr, unsigned int *data_size, int *condition,
                                          unsigned int *wacthpoint_id) {
  std::lock_guard<std::mutex> lg(lock_);
  std::string current_watchtensor_name;
  current_watchtensor_name = watchtensor->GetName();
  mindspore::tensor::TensorPtr tensor_ptr = watchtensor->GetTensor();
  int tensor_data_type = tensor_ptr->data_type_c();
  watchpoint_t watchpoint_to_check;
  for (auto w_table_item : watchpoint_table) {
    auto check_node_list = std::get<1>(w_table_item).check_node_list;
    for (auto check_node : check_node_list) {
      std::string w_name = std::get<0>(check_node);
      bool w_type = std::get<1>(check_node);
      // get current the full info including condition, id..., for current watchtensor
      std::string current_node_name = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
      if ((w_type == true && (current_watchtensor_name.find(w_name) != string::npos || w_name == "*")) ||
          (w_type == false && current_node_name == w_name)) {
        watchpoint_to_check = w_table_item.second;
        // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
        if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
          return;
        }
        break;
      }
    }
  }
  // Scan every element; the outputs are overwritten on each hit, so the values
  // reported correspond to the last hit element.
  float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
  unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
  for (unsigned int index = 0; index < num_elements; index++) {
    float x = start_addr[index];
    if (((watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) && isinf(x)) ||
        (watchpoint_to_check.conditions.nan.enabled && isnan(x))) {
      std::string name_no_slot = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
      *name = name_no_slot;
      *slot = std::to_string(watchtensor->GetSlot());
      *data_ptr = reinterpret_cast<char *>(tensor_ptr->data_c());
      *data_size = tensor_ptr->data().nbytes();
      int condition_item = -1;
      if (watchpoint_to_check.conditions.nan.enabled) {
        condition_item = 0;
      } else if (watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) {
        condition_item = 1;
      }
      *condition = condition_item;
      *wacthpoint_id = watchpoint_to_check.id;
    }
  }
}
  219. void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
  220. std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
  221. std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {
  222. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  223. tensor_loader_->SearchTensors(name, &result_list);
  224. for (auto result : result_list) {
  225. if (!std::get<1>(result)) {
  226. continue;
  227. }
  228. ret_name->push_back(std::get<0>(result));
  229. data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetTensor()->data_c()));
  230. data_size->push_back(std::get<1>(result)->GetTensor()->data().nbytes());
  231. dtype->push_back(std::get<1>(result)->GetTensor()->Dtype());
  232. shape->push_back(std::get<1>(result)->GetTensor()->shape());
  233. }
  234. }
  235. bool DebugServices::IsWatchPoint(std::string kernel_name,
  236. std::unordered_map<unsigned int, watchpoint_t> watchpoint_table) {
  237. bool ret = false;
  238. for (auto w_table_item : watchpoint_table) {
  239. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  240. for (auto check_node : check_node_list) {
  241. std::string w_name = std::get<0>(check_node);
  242. bool w_type = std::get<1>(check_node);
  243. if ((w_type == true &&
  244. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  245. (w_type == false && kernel_name == w_name)) {
  246. ret = true;
  247. return ret;
  248. }
  249. }
  250. }
  251. return ret;
  252. }
// Accessor for the owned TensorLoader; the caller must not delete it.
TensorLoader *DebugServices::tensor_loader() const { return tensor_loader_; }
// Returns a snapshot COPY of the watchpoint table; mutations on the returned
// map do not affect this object.
// NOTE(review): the read is not guarded by lock_, unlike the mutators above —
// confirm callers only use this from the same thread as Add/RemoveWatchpoint.
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table;
}
  257. } // namespace mindspore