
debug_services.cc

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "debug/debug_services.h"
#include <dirent.h>
#include <fstream>
#include <algorithm>
#include <map>
#include <unordered_set>
#ifdef ONLINE_DBG_MODE
#include "backend/session/anf_runtime_algorithm.h"
#endif
#include "debug/debugger/tensor_summary.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
DebugServices::DebugServices() {
  tensor_loader_ = new TensorLoader();
  // -1 wraps around to UINT32_MAX, used as a "not yet set" sentinel for the iteration number
  uint32_t iter_num = -1;
  tensor_loader_->set_iter_num(iter_num);
}
DebugServices::DebugServices(const DebugServices &other) {
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table = other.watchpoint_table;
}
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table = other.watchpoint_table;
  }
  return *this;
}
DebugServices::~DebugServices() { delete tensor_loader_; }
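// Registers a watchpoint in watchpoint_table under the lock. The numeric watch_condition is cast
// directly to CONDITION_TYPE, so callers must pass a valid enumerator value. A minimal hypothetical
// call, assuming the condition value maps to a valid CONDITION_TYPE and the node name exists in the
// monitored graph:
//   debug_services.AddWatchpoint(1, 0, 0.0f, {{"Default/network/conv1", false}}, {}, nullptr, nullptr);
// The device and graph lists are optional and are only copied in when non-null.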
void DebugServices::AddWatchpoint(
  unsigned int id, unsigned int watch_condition, float parameter,
  const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  std::lock_guard<std::mutex> lg(lock_);
  watchpoint_t watchpoint_item;
  watchpoint_item.id = id;
  watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  watchpoint_item.condition.parameter = parameter;
  watchpoint_item.check_node_list = check_node_list;
  if (check_node_device_list != nullptr) {
    watchpoint_item.check_node_device_list = *check_node_device_list;
  }
  if (check_node_graph_list != nullptr) {
    watchpoint_item.check_node_graph_list = *check_node_graph_list;
  }
  watchpoint_item.parameter_list = parameter_list;
  watchpoint_table[id] = watchpoint_item;
}
void DebugServices::RemoveWatchpoint(unsigned int id) {
  std::lock_guard<std::mutex> lg(lock_);
  watchpoint_table.erase(id);
}
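// Factory that dispatches on the debugger data type and instantiates a TensorSummary for the
// matching C++ element type, letting the watchpoint checks below iterate typed views over raw,
// untyped tensor buffers. Unsupported dtypes yield a null pointer, which callers must check.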
std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor, void *previous_tensor_ptr,
                                              uint32_t num_elements, int tensor_dtype) {
  switch (tensor_dtype) {
    case DbgDataType::DT_UINT8: {
      return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT8: {
      return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT16: {
      return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT16: {
      return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT32: {
      return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT32:
    case DbgDataType::DT_BASE_INT: {
      return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT64: {
      return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT64: {
      return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT16: {
      return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT32:
    case DbgDataType::DT_BASE_FLOAT: {
      return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT64: {
      return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_BOOL: {
      return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    default:
      MS_LOG(INFO) << "Unsupported tensor type";
      // return a null pointer
      return std::unique_ptr<TensorSummary<int32_t>>{};
  }
}
#ifdef OFFLINE_DBG_MODE
void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
  void *previous_tensor_ptr = nullptr;
  std::shared_ptr<TensorData> tensor_prev;
  if (previous_iter_tensor_needed && tensor->GetIteration() > 1) {
    // read data in offline mode
    std::vector<std::shared_ptr<TensorData>> result_list_prev;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration() - 1},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, &result_list_prev);
    tensor_prev = result_list_prev[0];
    if (!tensor_prev->GetByteSize()) {
      tensor_prev.reset();
    } else {
      previous_tensor_ptr = tensor_prev->GetDataPtr();
    }
  }
  return previous_tensor_ptr;
}
#endif
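// Collects the watchpoints that apply to a single tensor. A watchpoint is skipped when its
// condition does not match the current phase (INIT conditions only during the initial suspend,
// change conditions only at step end), or when this tensor/watchpoint pair is already in the
// analysis cache and this is not a recheck.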
void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
                                          const std::string &tensor_name, const std::string &tensor_name_no_slot,
                                          bool *previous_iter_tensor_needed, std::string *qualified_tensor_name,
                                          std::vector<watchpoint_t> *watchpoints_to_check) {
  for (auto w_table_item : watchpoint_table) {
    auto wp = std::get<1>(w_table_item);
    // check ONLY init conditions on initial suspended state.
    // skip other conditions on initial suspended state
    if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
    // skip init condition if not init suspend
    if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
    // check change conditions only on step end.
    if (wp.change_condition() && !step_end) continue;
    // if recheck, ignore the cached results and reanalyze everything.
    // if not a recheck, check only unanalyzed tensors
    if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
    std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
    if (!found.empty()) {
      *qualified_tensor_name = found;
      watchpoints_to_check->push_back(w_table_item.second);
#ifdef OFFLINE_DBG_MODE
      if (wp.change_condition()) {
        *previous_iter_tensor_needed = true;
      }
#endif
    }
  }
}
void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
                                             const std::string &tensor_name) {
  // add analyzed tensor to cache
  if (!recheck) {
    wp_id_cache[tensor_name].insert(id);
  }
}
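// Core checking loop. For each tensor in tensor_list: read its dump from disk (offline mode only),
// gather the applicable watchpoints, run one typed summary pass over the data, then evaluate every
// watchpoint against that summary. Hits and evaluation errors are appended to the parallel output
// vectors (name, slot, condition, watchpoint_id, parameters, error_codes, ...).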
void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
                                     std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
                                     std::vector<std::vector<parameter_t>> *parameters,
                                     std::vector<int32_t> *error_codes, const std::vector<std::string> &op_overflows,
                                     std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
                                     const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
                                     std::vector<unsigned int> *root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  if (watchpoint_table.empty()) return;
  for (auto &tensor : *tensor_list) {
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, &result_list);
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      tensor.reset();
      continue;
    }
#endif
    const auto tensor_name = tensor->GetName();
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    // no elements to analyze
    if (tensor->GetByteSize() == 0) continue;
    int tensor_dtype = tensor->GetType();
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    // no-op use of the variable, so it does not trigger an unused-variable warning when offline debug is off
    (void)previous_iter_tensor_needed;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
                          &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) continue;
    uint32_t num_elements = tensor->GetNumElements();
#ifdef OFFLINE_DBG_MODE
    void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
#else
    void *previous_tensor_ptr =
      tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<0>(item);
        error_code = std::get<1>(item);
        parameter_list = std::get<2>(item);
      }
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        name->push_back(qualified_tensor_name);
        slot->push_back(tensor_slot);
        condition->push_back(wp.condition.type);
        watchpoint_id->push_back(wp.id);
        if (device_id != nullptr) {
          device_id->push_back(tensor->GetDeviceId());
        }
        if (root_graph_id != nullptr) {
          root_graph_id->push_back(tensor->GetRootGraphId());
        }
        parameters->push_back(parameter_list);
        error_codes->push_back(error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    // in offline mode the data is no longer needed; release it
    tensor.reset();
#endif
  }
}
#ifdef OFFLINE_DBG_MODE
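// Recovers the output slot number(s) encoded in a dump file name. In sync mode the slot is parsed
// directly from the name; in async mode the device-format dump is first converted with the Ascend
// msaccucmp tool and the slots are parsed from the converted host-format file names.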
void DebugServices::GetSlotInfo(const std::string &file_name, const std::string &dump_name,
                                const std::string &specific_dump_dir, std::vector<size_t> *slot_list) {
  if (is_sync_mode) {
    // get the slot from the name
    std::string delimiter = "_";
    unsigned int start_pos = dump_name.length();
    unsigned int end_pos = file_name.find(delimiter, start_pos);
    std::string item = file_name.substr(start_pos, end_pos - start_pos);
    slot_list->push_back(std::stoul(item));
  } else {
    std::string out_dir = "/tmp/" + file_name;
    std::string input_file = specific_dump_dir + "/" + file_name;
    std::string log_enabled = DbgLogger::verbose ? "" : "> /dev/null";
    std::string convert_command =
      "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " + input_file + " -out " +
      out_dir + " -t bin " + log_enabled;
    // the "+ 1" consumes the return value of system(), silencing unused-result warnings
    (void)(system(convert_command.c_str()) + 1);
    convert_command = "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " +
                      input_file + " -out " + out_dir + " -f NCHW -t bin " + log_enabled;
    (void)(system(convert_command.c_str()) + 1);
    std::string prefix_converted_dump_file_name = file_name + ".output.";
    DIR *convert_dir_ptr = opendir(out_dir.c_str());
    if (convert_dir_ptr != nullptr) {
      struct dirent *convert_dir_contents = nullptr;
      while ((convert_dir_contents = readdir(convert_dir_ptr)) != nullptr) {
        if (convert_dir_contents->d_type == DT_REG) {
          std::string converted_file_name = convert_dir_contents->d_name;
          std::size_t nd_file = converted_file_name.rfind(".ND.bin");
          std::size_t fractal_z_file = converted_file_name.rfind(".FRACTAL_Z.bin");
          std::size_t nchw_file = converted_file_name.rfind(".NCHW.bin");
          if (nd_file == std::string::npos && nchw_file == std::string::npos && fractal_z_file == std::string::npos) {
            continue;
          }
          std::size_t found_c = converted_file_name.find(prefix_converted_dump_file_name);
          if (found_c != 0) {
            continue;
          }
          std::size_t slot_start_pos = prefix_converted_dump_file_name.length();
          std::size_t slot_end_pos = converted_file_name.find(".", slot_start_pos) - 1;
          std::string slot_item = converted_file_name.substr(slot_start_pos, slot_end_pos - slot_start_pos + 1);
          slot_list->push_back(std::stoul(slot_item));
        }
      }
      // only close the stream when opendir succeeded; closedir(nullptr) is undefined behavior
      closedir(convert_dir_ptr);
    } else {
      MS_LOG(INFO) << out_dir << " directory does not exist!";
    }
    // std::string delete_cmd = "rm -rf " + out_dir;
    // system(delete_cmd.c_str());
  }
}
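// Extracts the tensor shape and element type for a candidate dump file, returning 0 on a prefix
// match and non-zero otherwise. Sync-mode file names embed "_shape_<d0>_<d1>..._<type>", which is
// parsed in place; async-mode files are converted via msaccucmp first, after which the shape and
// type are taken from fixed dot-separated fields (the 8th and 9th) of the converted file name.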
std::size_t DebugServices::GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
                                            const std::string &prefix_dump_file_name, std::string *file_name,
                                            std::string *type_name, std::string *out_dir, std::vector<int64_t> *shape) {
  std::size_t found = 0;
  if (is_sync_mode) {
    found = file_name->rfind(prefix_dump_file_name, 0);
  } else {
    std::string file_name_w_o_prefix = file_name->substr(file_name->find('.') + 1);
    found = file_name_w_o_prefix.rfind(prefix_dump_file_name, 0);
  }
  if (found != 0) {
    return found;
  }
  if (is_sync_mode) {
    // found a file, now get the shape and type
    // find "_shape_" in the filename
    std::string shape_delimiter = "_shape_";
    unsigned int str_pos = file_name->find(shape_delimiter) + shape_delimiter.length();
    // read numbers separated by '_' until a non-number token is read; that token is the type name
    bool number_found = true;
    std::string delimiter = "_";
    while (number_found) {
      unsigned int end_pos = file_name->find(delimiter, str_pos);
      std::string item = file_name->substr(str_pos, end_pos - str_pos);
      bool is_number = !item.empty() && std::find_if(item.begin(), item.end(),
                                                     [](unsigned char c) { return !std::isdigit(c); }) == item.end();
      if (is_number) {
        shape->push_back(std::stoul(item));
        str_pos = end_pos + 1;
      } else {
        *type_name = item;
        number_found = false;
      }
    }
  } else {
    *out_dir = "/tmp/" + *file_name;
    std::string input_file = specific_dump_dir + "/" + *file_name;
    std::string log_enabled = DbgLogger::verbose ? "" : "> /dev/null";
    std::string convert_command =
      "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " + input_file + " -out " +
      *out_dir + " -t bin " + log_enabled;
    (void)(system(convert_command.c_str()) + 1);
    convert_command = "python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/msaccucmp.pyc convert -d " +
                      input_file + " -out " + *out_dir + " -f NCHW -t bin " + log_enabled;
    (void)(system(convert_command.c_str()) + 1);
    std::string prefix_converted_dump_file_name = *file_name + ".output." + std::to_string(slot);
    *file_name = "";
    DIR *convert_dir_ptr = opendir(out_dir->c_str());
    if (convert_dir_ptr != nullptr) {
      struct dirent *convert_dir_contents = nullptr;
      while ((convert_dir_contents = readdir(convert_dir_ptr)) != nullptr) {
        if (convert_dir_contents->d_type == DT_REG) {
          std::string converted_file_name = convert_dir_contents->d_name;
          std::size_t nd_file = converted_file_name.rfind(".ND.bin");
          std::size_t fractal_z_file = converted_file_name.rfind(".FRACTAL_Z.bin");
          std::size_t nchw_file = converted_file_name.rfind(".NCHW.bin");
          if (nd_file == std::string::npos && nchw_file == std::string::npos && fractal_z_file == std::string::npos) {
            continue;
          }
          std::size_t found_c = converted_file_name.rfind(prefix_converted_dump_file_name, 0);
          if (found_c != 0) {
            continue;
          }
          *file_name = converted_file_name;
        }
      }
      closedir(convert_dir_ptr);
    } else {
      MS_LOG(INFO) << *out_dir << " directory does not exist!";
    }
    if (*file_name == "") {
      MS_LOG(WARNING) << *out_dir << ": no valid files found post msaccucmp exec";
      return 1;
    }
    // std::string delete_cmd = "rm -rf " + out_dir;
    // system(delete_cmd.c_str());
    // found a file, now get the shape and type
    std::stringstream check_filename(*file_name);
    std::vector<std::string> tokens;
    std::string intermediate;
    while (getline(check_filename, intermediate, '.')) {
      tokens.push_back(intermediate);
    }
    *type_name = tokens[8];
    std::string shape_str = tokens[7];
    std::stringstream check_shape(shape_str);
    while (getline(check_shape, intermediate, '_')) {
      shape->push_back(std::stoul(intermediate));
    }
  }
  return 0;
}
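// Reads one dumped tensor per entry of the parallel input vectors. The graph node name is rewritten
// into the dump-file prefix ('/' becomes "--" in sync mode, '_' in async mode), the per-device dump
// directory is scanned for a matching file, and the raw bytes plus shape/type metadata are wrapped
// in a TensorData. An entry is appended to result_list even when no file is found, with a byte size
// of zero so callers can detect the miss.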
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id,
                                     std::vector<std::shared_ptr<TensorData>> *result_list) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    const std::string strsrc = "/";
    std::string strdst;
    if (is_sync_mode) {
      strdst = "--";
    } else {
      strdst = "_";
    }
    std::string::size_type pos = 0;
    std::string::size_type srclen = strsrc.size();
    std::string::size_type dstlen = strdst.size();
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    while ((pos = dump_style_kernel_name.find(strsrc, pos)) != std::string::npos) {
      dump_style_kernel_name.replace(pos, srclen, strdst);
      pos += dstlen;
    }
    std::string prefix_dump_file_name = dump_style_kernel_name;
    if (is_sync_mode) {
      prefix_dump_file_name += "_output_" + std::to_string(slot[i]) + "_";
    }
    std::string specific_dump_dir;
    if (is_sync_mode) {
      specific_dump_dir =
        dump_dir + "/device_" + std::to_string(device_id[i]) + "/iteration_" + std::to_string(iteration[i]);
    } else {
      specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id[i]) + "/" + net_name + "_graph_" +
                          std::to_string(root_graph_id[i]) + "/" + std::to_string(root_graph_id[i]) + "/" +
                          std::to_string(iteration[i]);
    }
    // search files in dir for the one that meets the filename prefix and read the file into memory
    DIR *d;
    d = opendir(specific_dump_dir.c_str());
    std::vector<char> *buffer = nullptr;
    std::string type_name = "";
    std::vector<int64_t> shape;
    uint64_t data_size = 0;
    if (d != nullptr) {
      struct dirent *dir = nullptr;
      while ((dir = readdir(d)) != nullptr) {
        if (dir->d_type == DT_REG) {
          std::string file_name = dir->d_name;
          std::string out_dir;
          std::size_t found = GetShapeTypeInfo(specific_dump_dir, slot[i], prefix_dump_file_name, &file_name,
                                               &type_name, &out_dir, &shape);
          if (found != 0) {
            continue;
          }
          // read the tensor data from the file
          std::string file_path;
          if (is_sync_mode) {
            file_path = specific_dump_dir + "/" + file_name;
          } else {
            file_path = out_dir + "/" + file_name;
          }
          std::ifstream infile;
          infile.open(file_path.c_str(), std::ios::binary | std::ios::ate);
          if (!infile.is_open()) {
            MS_LOG(ERROR) << "Failed to open bin file " << file_name;
            break;
          }
          uint64_t file_size = infile.tellg();
          infile.seekg(0, std::ios::beg);
          buffer = new std::vector<char>(file_size);
          if (!infile.read(buffer->data(), file_size)) {
            MS_LOG(ERROR) << "Failed to read in bin file " << file_name;
            break;
          }
          data_size = file_size;
          infile.close();
        }
      }
      closedir(d);
    } else {
      MS_LOG(INFO) << "directory does not exist!";
    }
    // call LoadNewTensor to store tensor in internal cache
    auto tensor_data = std::make_shared<TensorData>();
    tensor_data->SetName(backend_name[i]);
    tensor_data->SetExecutionOrder(0);
    tensor_data->SetSlot(slot[i]);
    tensor_data->SetIteration(iteration[i]);
    tensor_data->SetDeviceId(device_id[i]);
    tensor_data->SetRootGraphId(root_graph_id[i]);
    if (data_size) {
      tensor_data->SetDataPtr(buffer->data());
    } else {
      tensor_data->SetDataPtr(nullptr);
    }
    tensor_data->SetByteSize(data_size);
    tensor_data->SetType(type_name);
    tensor_data->SetShape(shape);
    if (data_size) {
      tensor_loader_->LoadNewTensor(tensor_data, false);
    }
    // add to result_list
    result_list->push_back(tensor_data);
  }
}
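// Rewrites every '/' in a graph node name with the dump-file separator ("--" for sync dumps, "_"
// for async dumps), mirroring the renaming applied when the dump files were written.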
void ReplaceSrcFileName(const bool is_sync_mode, std::string *dump_style_name) {
  const std::string strsrc = "/";
  std::string strdst;
  if (is_sync_mode) {
    strdst = "--";
  } else {
    strdst = "_";
  }
  std::string::size_type pos = 0;
  std::string::size_type srclen = strsrc.size();
  std::string::size_type dstlen = strdst.size();
  while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
    dump_style_name->replace(pos, srclen, strdst);
    pos += dstlen;
  }
}
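// Builds the list of tensors the registered watchpoints need for the given iteration. Watched node
// names are grouped per (device, root graph) pair, each corresponding dump directory is scanned for
// matching files, and an empty TensorData placeholder is created per discovered slot; the actual
// data is read later by CheckWatchpoints.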
std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(unsigned int iteration) {
  // get a list of nodes and the devices they are on to monitor
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  std::map<std::tuple<uint32_t, uint32_t>, std::unordered_set<std::string>> device_and_graph_to_nodes;
  for (auto w_table_item : watchpoint_table) {
    auto wp = std::get<1>(w_table_item);
    // index walks the device/graph lists in step with check_node_list, so it must live outside the loop
    unsigned int index = 0;
    for (auto check_node : wp.check_node_list) {
      std::string w_name = std::get<0>(check_node);
      bool w_is_param = std::get<1>(check_node);
      std::string node_name = w_name;
      if (w_is_param) {
        std::size_t found = node_name.find_last_of("/");
        node_name = node_name.substr(found + 1);
      }
      std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
      for (auto device : devices) {
        for (auto graph : graphs) {
          std::tuple<uint32_t, uint32_t> key(device, graph);
          device_and_graph_to_nodes[key].insert(node_name);
        }
      }
      index++;
    }
  }
  // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  // as they are found
  for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
    std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
    uint32_t device_id = std::get<0>(device_and_graph);
    uint32_t root_graph_id = std::get<1>(device_and_graph);
    std::unordered_set<std::string> wp_nodes = device_and_graph_item.second;
    std::vector<std::tuple<std::string, std::string>> proto_to_dump;
    std::string specific_dump_dir;
    if (is_sync_mode) {
      specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/iteration_" + std::to_string(iteration);
    } else {
      specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/" + net_name + "_graph_" +
                          std::to_string(root_graph_id) + "/" + std::to_string(root_graph_id) + "/" +
                          std::to_string(iteration);
    }
    // convert node names to dump style
    for (auto node : wp_nodes) {
      std::string orig_name = node;
      std::string dump_style_name = node;
      ReplaceSrcFileName(is_sync_mode, &dump_style_name);
      if (is_sync_mode) {
        dump_style_name.append("_output_");
      }
      proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
    }
    // search files in dir for the one that meets the filename prefix and read the file into memory
    DIR *d;
    d = opendir(specific_dump_dir.c_str());
    if (d != nullptr) {
      struct dirent *dir = nullptr;
      while ((dir = readdir(d)) != nullptr) {
        if (dir->d_type == DT_REG) {
          std::string file_name = dir->d_name;
          for (auto &node : proto_to_dump) {
            std::string dump_name = std::get<1>(node);
            std::size_t found = 0;
            if (is_sync_mode) {
              found = file_name.rfind(dump_name, 0);
            } else {
              std::string file_name_w_o_prefix = file_name.substr(file_name.find('.') + 1);
              found = file_name_w_o_prefix.rfind(dump_name, 0);
            }
            if (found == 0) {
              std::vector<size_t> slot_list;
              GetSlotInfo(file_name, dump_name, specific_dump_dir, &slot_list);
              for (auto slot : slot_list) {
                // add a TensorData entry (data will be read when needed)
                std::vector<int64_t> shape;
                std::string orig_name = std::get<0>(node);
                auto tensor_data = std::make_shared<TensorData>();
                tensor_data->SetName(orig_name);
                tensor_data->SetExecutionOrder(0);
                tensor_data->SetSlot(slot);
                tensor_data->SetIteration(iteration);
                tensor_data->SetDeviceId(device_id);
                tensor_data->SetRootGraphId(root_graph_id);
                tensor_data->SetDataPtr(nullptr);
                tensor_data->SetByteSize(0);
                tensor_data->SetType("");
                tensor_data->SetShape(shape);
                tensor_list.push_back(tensor_data);
              }
              break;
            }
          }
        }
      }
      // release the directory stream once the scan is complete
      closedir(d);
    }
  }
  return tensor_list;
}
#endif
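// Looks up already-loaded tensors by name in the tensor loader and returns their data pointers,
// byte sizes, dtypes, and shapes through the parallel output vectors; names without a match are
// skipped.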
void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
                                     std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
                                     std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *shape) {
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  tensor_loader_->SearchTensors(name, &result_list);
  for (auto result : result_list) {
    if (!std::get<1>(result)) {
      continue;
    }
    ret_name->push_back(std::get<0>(result));
    data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
    data_size->push_back(std::get<1>(result)->GetByteSize());
    dtype->push_back(std::get<1>(result)->GetType());
    shape->push_back(std::get<1>(result)->GetShape());
  }
}
#ifdef ONLINE_DBG_MODE
bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  bool ret = false;
  for (auto w_table_item : watchpoint_table) {
    auto check_node_list = std::get<1>(w_table_item).check_node_list;
    for (auto check_node : check_node_list) {
      std::string w_name = std::get<0>(check_node);
      bool w_type = std::get<1>(check_node);
      if ((w_type == true &&
           ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
          (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
        ret = true;
        return ret;
      }
    }
  }
  return ret;
}
bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  if (kernel) {
    auto input_size = AnfAlgo::GetInputTensorNum(kernel);
    for (size_t j = 0; j < input_size; ++j) {
      auto input_kernel = kernel->input(j + 1);
      std::string input_kernel_name = input_kernel->fullname_with_scope();
      auto found = w_name.find_last_of('/');
      if (found != std::string::npos && w_name.substr(found + 1) == input_kernel_name) return true;
    }
    return false;
  } else {
    return false;
  }
}
#endif
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensorMap(const std::string &node_name) const {
  return tensor_loader_->GetNodeTensorMap(node_name);
}
uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }
void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }
void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
#ifdef ONLINE_DBG_MODE
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId addr_type_id, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          addr_type_id, addr_format, slot);
}
#endif
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table;
}
void DebugServices::ResetLoadedTensors() {
  wp_id_cache.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
}
#ifdef ONLINE_DBG_MODE
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  std::vector<std::shared_ptr<TensorData>> result;
  auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  auto kernel_name = kernel->fullname_with_scope();
  for (size_t j = 0; j < output_size; ++j) {
    auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
    auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
    if (tensor) result.push_back(tensor);
  }
  return result;
}
#endif
bool DebugServices::TensorExistsInCurrent(std::string tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
void DebugServices::MoveTensorCurrentToPrev(std::string tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }
std::string DebugServices::GetNetName() { return net_name; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode; }
#ifdef ONLINE_DBG_MODE
}  // namespace mindspore
#endif