You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 42 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <iterator>
  22. #include <map>
  23. #include <numeric>
  24. #include <unordered_set>
  25. #ifdef ONLINE_DBG_MODE
  26. #include "backend/session/anf_runtime_algorithm.h"
  27. #endif
  28. #include "debug/debugger/tensor_summary.h"
  29. #ifdef ONLINE_DBG_MODE
  30. namespace mindspore {
  31. #endif
  32. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  33. DebugServices::DebugServices(const DebugServices &other) {
  34. tensor_loader_ = other.tensor_loader_;
  35. watchpoint_table = other.watchpoint_table;
  36. }
  37. DebugServices &DebugServices::operator=(const DebugServices &other) {
  38. if (this != &other) {
  39. tensor_loader_ = other.tensor_loader_;
  40. watchpoint_table = other.watchpoint_table;
  41. }
  42. return *this;
  43. }
  44. void DebugServices::AddWatchpoint(
  45. unsigned int id, unsigned int watch_condition, float parameter,
  46. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  47. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  48. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  49. std::lock_guard<std::mutex> lg(lock_);
  50. watchpoint_t watchpoint_item;
  51. watchpoint_item.id = id;
  52. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  53. watchpoint_item.condition.parameter = parameter;
  54. watchpoint_item.check_node_list = check_node_list;
  55. if (check_node_device_list != nullptr) {
  56. watchpoint_item.check_node_device_list = *check_node_device_list;
  57. }
  58. if (check_node_graph_list != nullptr) {
  59. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  60. }
  61. watchpoint_item.parameter_list = parameter_list;
  62. watchpoint_table[id] = watchpoint_item;
  63. }
  64. void DebugServices::RemoveWatchpoint(unsigned int id) {
  65. std::lock_guard<std::mutex> lg(lock_);
  66. watchpoint_table.erase(id);
  67. }
  68. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  69. void *const previous_tensor_ptr, uint32_t num_elements,
  70. int tensor_dtype) {
  71. switch (tensor_dtype) {
  72. case DbgDataType::DT_UINT8: {
  73. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  74. }
  75. case DbgDataType::DT_INT8: {
  76. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  77. }
  78. case DbgDataType::DT_UINT16: {
  79. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  80. }
  81. case DbgDataType::DT_INT16: {
  82. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  83. }
  84. case DbgDataType::DT_UINT32: {
  85. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  86. }
  87. case DbgDataType::DT_INT32:
  88. case DbgDataType::DT_BASE_INT: {
  89. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  90. }
  91. case DbgDataType::DT_UINT64: {
  92. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  93. }
  94. case DbgDataType::DT_INT64: {
  95. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  96. }
  97. case DbgDataType::DT_FLOAT16: {
  98. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  99. }
  100. case DbgDataType::DT_FLOAT32:
  101. case DbgDataType::DT_BASE_FLOAT: {
  102. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  103. }
  104. case DbgDataType::DT_FLOAT64: {
  105. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  106. }
  107. case DbgDataType::DT_BOOL: {
  108. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  109. }
  110. default:
  111. MS_LOG(INFO) << "Unsupported tensor type";
  112. // return a null pointer
  113. return std::unique_ptr<TensorSummary<int32_t>>{};
  114. }
  115. }
  116. #ifdef OFFLINE_DBG_MODE
  117. void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
  118. void *previous_tensor_ptr = nullptr;
  119. std::shared_ptr<TensorData> tensor_prev;
  120. if (previous_iter_tensor_needed && tensor->GetIteration() > 1) {
  121. // read data in offline mode
  122. std::vector<std::string> file_paths;
  123. if (!is_sync_mode) {
  124. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  125. std::vector<unsigned int>{tensor->GetDeviceId()},
  126. std::vector<unsigned int>{tensor->GetIteration() - 1},
  127. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  128. }
  129. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  130. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  131. std::vector<unsigned int>{tensor->GetDeviceId()},
  132. std::vector<unsigned int>{tensor->GetIteration() - 1},
  133. std::vector<unsigned int>{tensor->GetRootGraphId()}, file_paths, &result_list_prev);
  134. tensor_prev = result_list_prev[0];
  135. if (!tensor_prev->GetByteSize()) {
  136. tensor_prev.reset();
  137. } else {
  138. previous_tensor_ptr = tensor_prev->GetDataPtr();
  139. }
  140. }
  141. return previous_tensor_ptr;
  142. }
  143. #endif
  144. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  145. const std::string &tensor_name, const std::string &tensor_name_no_slot,
  146. bool *previous_iter_tensor_needed, std::string *const qualified_tensor_name,
  147. std::vector<watchpoint_t> *const watchpoints_to_check) {
  148. for (auto w_table_item : watchpoint_table) {
  149. auto wp = std::get<1>(w_table_item);
  150. // check ONLY init conditions on initial suspended state.
  151. // skip other conditions on initial suspended state
  152. if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
  153. // skip init condition if not init suspend
  154. if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
  155. // check change conditions only on step end.
  156. if (wp.change_condition() && !step_end) continue;
  157. // if recheck, ignore the cache results and reanalyze everything.
  158. // if not a recheck, check only unanalyzed tensors
  159. if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
  160. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
  161. if (!found.empty()) {
  162. *qualified_tensor_name = found;
  163. watchpoints_to_check->push_back(w_table_item.second);
  164. #ifdef OFFLINE_DBG_MODE
  165. if (wp.change_condition()) {
  166. *previous_iter_tensor_needed = true;
  167. }
  168. #endif
  169. }
  170. }
  171. }
  172. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  173. const std::string &tensor_name) {
  174. // add analyzed tensor to cache
  175. if (!recheck) {
  176. wp_id_cache[tensor_name].insert(id);
  177. }
  178. }
// Evaluate every registered watchpoint against every tensor in *tensor_list
// and report hits through the parallel output vectors (name, slot, condition,
// watchpoint_id, parameters, error_codes and, when non-null, device_id /
// root_graph_id). All output vectors receive entries at the same index, kept
// sorted by the hit tensor's execution order.
//   op_overflows:    node names flagged as overflowed (IS_OVERFLOW checks).
//   async_file_pool: converted dump file paths (offline async mode).
//   init_dbg_suspend / step_end / recheck: phase flags forwarded to
//                    AddWatchPointsToCheck to select applicable watchpoints.
void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
                                     std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
                                     std::vector<std::vector<parameter_t>> *const parameters,
                                     std::vector<int32_t> *const error_codes,
                                     const std::vector<std::string> &op_overflows,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
                                     const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
                                     std::vector<unsigned int> *root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  // nothing to do when no watchpoints are registered
  if (watchpoint_table.empty()) return;
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  for (auto &tensor : *tensor_list) {
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode; the list entry is replaced by the freshly
    // read tensor. NOTE(review): assumes ReadDumpedTensor always appends at
    // least one element — confirm (it appends an empty record when no file is
    // found).
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, async_file_pool, &result_list);
    tensor = result_list[0];
    // empty read: nothing to analyze for this entry
    if (!tensor->GetByteSize()) {
      tensor.reset();
      continue;
    }
#endif
    const auto tensor_name = tensor->GetName();
    // node name is everything before the ":<slot>" suffix
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    // no elements to analyze
    if (tensor->GetByteSize() == 0) continue;
    int tensor_dtype = tensor->GetType();
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    // Add do nothing line in case offline debug is off, prevent unused var warning
    (void)previous_iter_tensor_needed;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
                          &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) continue;
    uint32_t num_elements = tensor->GetNumElements();
#ifdef OFFLINE_DBG_MODE
    void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
#else
    void *previous_tensor_ptr =
      tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
#endif
    // A full-tensor summary pass is only needed for non-overflow conditions;
    // skip it when the sole watchpoint is an overflow check.
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        // overflow hits come from the externally supplied op_overflows list
        is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<0>(item);
        error_code = std::get<1>(item);
        parameter_list = std::get<2>(item);
      }
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      // record both genuine hits and evaluation errors
      if (is_hit || error_code) {
        std::vector<int>::iterator iter;
        // if the execution order is repeated,inserts the new one before the others with same execution order.
        iter = std::lower_bound(exec_order.begin(), exec_order.end(), tensor->GetExecutionOrder());
        int position = iter - exec_order.begin();
        exec_order.insert(iter, tensor->GetExecutionOrder());
        // every output vector is inserted at the same position to stay aligned
        name->insert(name->begin() + position, qualified_tensor_name);
        slot->insert(slot->begin() + position, tensor_slot);
        condition->insert(condition->begin() + position, wp.condition.type);
        watchpoint_id->insert(watchpoint_id->begin() + position, wp.id);
        if (device_id != nullptr) {
          device_id->insert(device_id->begin() + position, tensor->GetDeviceId());
        }
        if (root_graph_id != nullptr) {
          root_graph_id->insert(root_graph_id->begin() + position, tensor->GetRootGraphId());
        }
        parameters->insert(parameters->begin() + position, parameter_list);
        error_codes->insert(error_codes->begin() + position, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
  274. #ifdef OFFLINE_DBG_MODE
  275. void DebugServices::GetSlotInfo(const std::string &file_name, const std::string &dump_name,
  276. const std::string &specific_dump_dir, std::vector<size_t> *slot_list) {
  277. // get the slot from the name
  278. std::string delimiter = "_";
  279. unsigned int start_pos = dump_name.length();
  280. unsigned int end_pos = file_name.find(delimiter, start_pos);
  281. std::string item = file_name.substr(start_pos, end_pos - start_pos);
  282. slot_list->push_back(std::stoul(item));
  283. }
  284. void DebugServices::ReadTensorFromNpy(const std::string &file_name, std::string *tensor_type, std::size_t *size,
  285. std::vector<int64_t> *shape, std::vector<char> **data_buffer) {
  286. std::ifstream infile;
  287. std::string file_path = file_name;
  288. MS_LOG(INFO) << "Reading in file: " << file_path;
  289. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  290. if (!infile.is_open()) {
  291. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path;
  292. return;
  293. }
  294. uint64_t file_size = infile.tellg();
  295. infile.seekg(0, std::ios::beg);
  296. std::unique_ptr<std::vector<char>> buffer(new std::vector<char>(file_size));
  297. if (!infile.read(buffer->data(), file_size)) {
  298. MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path;
  299. return;
  300. }
  301. uint16_t header_len = *reinterpret_cast<uint16_t *>(buffer->data() + 8);
  302. std::string header(buffer->data() + 9, header_len);
  303. std::size_t type_i = header.find("descr") + 10;
  304. *tensor_type = header.substr(type_i, 2);
  305. std::size_t shape_i_open = header.find("(");
  306. std::size_t shape_i_close = header.find(")");
  307. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  308. std::string intermediate;
  309. std::stringstream check_shape(shape_str);
  310. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  311. while (getline(check_shape, intermediate, ',')) {
  312. shape->push_back(std::stoi(intermediate));
  313. }
  314. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  315. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  316. std::size_t data_size = data_len * word_size;
  317. infile.seekg(header_len + 10);
  318. *data_buffer = new std::vector<char>(data_size);
  319. if (!infile.read((*data_buffer)->data(), data_size)) {
  320. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  321. }
  322. *size = data_size;
  323. }
  324. void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
  325. std::vector<std::string> *result_list) {
  326. std::string file_format = "npy";
  327. for (auto const &d : dir_to_files_map) {
  328. std::vector<std::string> files_to_convert_in_dir;
  329. std::string dump_key = d.first;
  330. for (auto const &file_name : d.second) {
  331. bool already_converted = false;
  332. for (std::string &file_found : *result_list) {
  333. if (file_found.find(file_name) != std::string::npos) {
  334. already_converted = true;
  335. }
  336. }
  337. if (!already_converted) {
  338. files_to_convert_in_dir.push_back(dump_key + "/" + file_name);
  339. }
  340. }
  341. std::string current_working_dir(__FILE__);
  342. std::size_t pos = current_working_dir.find_last_of("\\/");
  343. current_working_dir = (std::string::npos == pos) ? "" : current_working_dir.substr(0, pos);
  344. MS_LOG(INFO) << current_working_dir;
  345. std::ostringstream input_file_o;
  346. const char *const delim = " ";
  347. std::copy(files_to_convert_in_dir.begin(), files_to_convert_in_dir.end(),
  348. std::ostream_iterator<std::string>(input_file_o, delim));
  349. std::string input_files = input_file_o.str();
  350. MS_LOG(INFO) << "Ops to convert: " << input_files;
  351. if (input_files != "") {
  352. std::string convert_command = "python " + current_working_dir + "/convert_async.py -out " + dump_key + " -t " +
  353. file_format + " -d " + dump_key + " -f NCHW -l " + input_files;
  354. (void)(system(convert_command.c_str()) + 1);
  355. DIR *d_handle;
  356. d_handle = opendir(dump_key.c_str());
  357. if (d_handle != nullptr) {
  358. struct dirent *dir = nullptr;
  359. while ((dir = readdir(d_handle)) != NULL) {
  360. if (dir->d_type == DT_REG) {
  361. std::string candidate = dir->d_name;
  362. for (const std::string &file_to_find : files_to_convert_in_dir) {
  363. std::string file_n = file_to_find.substr(file_to_find.find_last_of("\\/") + 1);
  364. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  365. // we found a converted file for this op
  366. result_list->push_back(dump_key + "/" + candidate);
  367. }
  368. }
  369. }
  370. }
  371. }
  372. }
  373. }
  374. }
  375. void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
  376. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  377. std::vector<unsigned int> root_graph_id, std::vector<std::string> *result_list) {
  378. std::string file_format = "npy";
  379. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  380. for (unsigned int i = 0; i < backend_name.size(); i++) {
  381. // form prefix of the tensor file to read from graph pb node name
  382. std::string dump_style_kernel_name = backend_name[i];
  383. const std::string strsrc = "/";
  384. std::string strdst = "_";
  385. std::string::size_type pos = 0;
  386. std::string::size_type srclen = strsrc.size();
  387. std::string::size_type dstlen = strdst.size();
  388. // remove slot from name
  389. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  390. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  391. while ((pos = dump_style_kernel_name.find(strsrc, pos)) != std::string::npos) {
  392. dump_style_kernel_name.replace(pos, srclen, strdst);
  393. pos += dstlen;
  394. }
  395. std::string prefix_dump_file_name = dump_style_kernel_name;
  396. std::string specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id[i]) + "/" + net_name + "_graph_" +
  397. std::to_string(root_graph_id[i]) + "/" + std::to_string(root_graph_id[i]) + "/" +
  398. std::to_string(iteration[i]);
  399. // search files in dir for the one that meets the filename prefix and read the file into memory
  400. DIR *d;
  401. d = opendir(specific_dump_dir.c_str());
  402. if (d != nullptr) {
  403. struct dirent *dir = nullptr;
  404. while ((dir = readdir(d)) != NULL) {
  405. if (dir->d_type == DT_REG) {
  406. std::string file_name = dir->d_name;
  407. std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
  408. if (file_name_w_o_perfix.rfind(prefix_dump_file_name, 0) == 0 &&
  409. file_name.rfind(file_format) == std::string::npos) {
  410. // if file matches prefix and is in device format add to candidate files to convert.
  411. dir_to_files_map[specific_dump_dir].push_back(file_name);
  412. } else if (file_name_w_o_perfix.rfind(prefix_dump_file_name, 0) == 0 &&
  413. file_name.rfind(file_format) != std::string::npos) {
  414. // otherwise, if file matches prefix and already has been converted to host format
  415. // add to result of converted files.
  416. result_list->push_back(specific_dump_dir + "/" + file_name);
  417. }
  418. }
  419. }
  420. }
  421. closedir(d);
  422. }
  423. ConvertToHostFormat(dir_to_files_map, result_list);
  424. }
  425. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  426. const std::string &specific_dump_dir,
  427. std::vector<std::string> *result_list) {
  428. std::string file_format = "npy";
  429. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  430. for (const auto &node : proto_dump) {
  431. std::string dump_name = std::get<1>(node);
  432. // search files in dir for the one that meets the filename prefix and read the file into memory
  433. DIR *d;
  434. d = opendir(specific_dump_dir.c_str());
  435. if (d != nullptr) {
  436. struct dirent *dir = nullptr;
  437. while ((dir = readdir(d)) != NULL) {
  438. if (dir->d_type == DT_REG) {
  439. std::string file_name = dir->d_name;
  440. std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
  441. if (file_name_w_o_perfix.rfind(dump_name, 0) == 0 && file_name.rfind(file_format) == std::string::npos) {
  442. // if file matches prefix and is in device format add to candidate files to convert.
  443. dir_to_files_map[specific_dump_dir].push_back(file_name);
  444. } else if (file_name_w_o_perfix.rfind(dump_name, 0) == 0 &&
  445. file_name.rfind(file_format) != std::string::npos) {
  446. // otherwise, if file matches prefix and already has been converted to host format
  447. // add to result of converted files.
  448. result_list->push_back(specific_dump_dir + "/" + file_name);
  449. }
  450. }
  451. }
  452. }
  453. closedir(d);
  454. }
  455. ConvertToHostFormat(dir_to_files_map, result_list);
  456. }
  457. void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  458. uint32_t iteration, uint32_t device_id, uint32_t root_graph_id,
  459. const std::vector<std::string> &async_file_pool,
  460. std::vector<std::shared_ptr<TensorData>> *tensor_list) {
  461. for (auto &node : proto_dump) {
  462. std::vector<size_t> slot_list;
  463. for (const std::string &file_name : async_file_pool) {
  464. std::string dump_name = std::get<1>(node);
  465. std::size_t found = file_name.find(dump_name);
  466. std::size_t found_out = file_name.find("output");
  467. std::size_t found_dot_start = file_name.find(".", found_out);
  468. std::size_t found_dot_end = file_name.find(".", found_dot_start);
  469. if (found != std::string::npos && found_out != std::string::npos) {
  470. slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
  471. }
  472. }
  473. for (auto slot : slot_list) {
  474. // add a TensorData entry (data will be read when needed)
  475. std::vector<int64_t> shape;
  476. std::string orig_name = std::get<0>(node);
  477. auto tensor_data = std::make_shared<TensorData>();
  478. tensor_data->SetName(orig_name);
  479. tensor_data->SetExecutionOrder(0);
  480. tensor_data->SetSlot(slot);
  481. tensor_data->SetIteration(iteration);
  482. tensor_data->SetDeviceId(device_id);
  483. tensor_data->SetRootGraphId(root_graph_id);
  484. tensor_data->SetDataPtr(NULL);
  485. tensor_data->SetByteSize(0);
  486. tensor_data->SetType("");
  487. tensor_data->SetShape(shape);
  488. tensor_list->push_back(tensor_data);
  489. }
  490. }
  491. }
  492. std::size_t DebugServices::GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
  493. const std::string &prefix_dump_file_name, std::string *file_name,
  494. std::string *type_name, std::string *out_dir, std::vector<int64_t> *shape) {
  495. std::size_t found = 0;
  496. found = file_name->rfind(prefix_dump_file_name, 0);
  497. if (found != 0) {
  498. return found;
  499. }
  500. // found a file, now get the shape and type
  501. // find "_shape_" in the filename
  502. std::string shape_delimiter = "_shape_";
  503. unsigned int str_pos = file_name->find(shape_delimiter) + shape_delimiter.length();
  504. // read numbers with '_' delimter until you read a non-number, that will be the type name
  505. bool number_found = true;
  506. std::string delimiter = "_";
  507. while (number_found) {
  508. unsigned int end_pos = file_name->find(delimiter, str_pos);
  509. std::string item = file_name->substr(str_pos, end_pos - str_pos);
  510. bool is_number = !item.empty() && std::find_if(item.begin(), item.end(),
  511. [](unsigned char c) { return !std::isdigit(c); }) == item.end();
  512. if (is_number) {
  513. shape->push_back(std::stoul(item));
  514. str_pos = end_pos + 1;
  515. } else {
  516. *type_name = item;
  517. number_found = false;
  518. }
  519. }
  520. return 0;
  521. }
  522. void DebugServices::AddToTensorData(const std::string &backend_name, const std::size_t slot,
  523. const unsigned int iteration, const unsigned int device_id,
  524. const unsigned int root_graph_id, const std::size_t data_size,
  525. const std::string &type_name, const std::vector<int64_t> &shape,
  526. std::vector<char> *buffer, std::vector<std::shared_ptr<TensorData>> *result_list) {
  527. // call LoadNewTensor to store tensor in internal cache
  528. auto tensor_data = std::make_shared<TensorData>();
  529. tensor_data->SetName(backend_name);
  530. tensor_data->SetExecutionOrder(0);
  531. tensor_data->SetSlot(slot);
  532. tensor_data->SetIteration(iteration);
  533. tensor_data->SetDeviceId(device_id);
  534. tensor_data->SetRootGraphId(root_graph_id);
  535. if (data_size) {
  536. tensor_data->SetDataPtr(buffer->data());
  537. } else {
  538. tensor_data->SetDataPtr(NULL);
  539. }
  540. tensor_data->SetByteSize(data_size);
  541. tensor_data->SetType(type_name);
  542. tensor_data->SetShape(shape);
  543. if (data_size) {
  544. tensor_loader_->LoadNewTensor(tensor_data, false);
  545. }
  546. // add to result_list
  547. result_list->push_back(tensor_data);
  548. }
  549. void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
  550. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  551. std::vector<unsigned int> root_graph_id,
  552. const std::vector<std::string> &async_file_pool,
  553. std::vector<std::shared_ptr<TensorData>> *result_list) {
  554. for (unsigned int i = 0; i < backend_name.size(); i++) {
  555. // form prefix of the tensor file to read from graph pb node name
  556. std::string dump_style_kernel_name = backend_name[i];
  557. const std::string strsrc = "/";
  558. std::string strdst;
  559. if (is_sync_mode) {
  560. strdst = "--";
  561. } else {
  562. strdst = "_";
  563. }
  564. std::string::size_type pos = 0;
  565. std::string::size_type srclen = strsrc.size();
  566. std::string::size_type dstlen = strdst.size();
  567. // remove slot from name
  568. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  569. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  570. while ((pos = dump_style_kernel_name.find(strsrc, pos)) != std::string::npos) {
  571. dump_style_kernel_name.replace(pos, srclen, strdst);
  572. pos += dstlen;
  573. }
  574. std::string prefix_dump_file_name = dump_style_kernel_name;
  575. if (is_sync_mode) {
  576. prefix_dump_file_name += "_output_" + std::to_string(slot[i]) + "_";
  577. }
  578. std::string specific_dump_dir =
  579. dump_dir + "/device_" + std::to_string(device_id[i]) + "/iteration_" + std::to_string(iteration[i]);
  580. // search files in dir for the one that meets the filename prefix and read the file into memory
  581. std::vector<char> *buffer = NULL;
  582. std::string type_name = "";
  583. std::vector<int64_t> shape;
  584. uint64_t data_size = 0;
  585. if (is_sync_mode) {
  586. DIR *d;
  587. d = opendir(specific_dump_dir.c_str());
  588. if (d != nullptr) {
  589. struct dirent *dir = nullptr;
  590. while ((dir = readdir(d)) != NULL) {
  591. if (dir->d_type == DT_REG) {
  592. std::string file_name = dir->d_name;
  593. std::string out_dir;
  594. std::size_t found = GetShapeTypeInfo(specific_dump_dir, slot[i], prefix_dump_file_name, &file_name,
  595. &type_name, &out_dir, &shape);
  596. if (found != 0) {
  597. continue;
  598. }
  599. // read the tensor data from the file
  600. std::string file_path = specific_dump_dir + "/" + file_name;
  601. std::ifstream infile;
  602. infile.open(file_path.c_str(), std::ios::binary | std::ios::ate);
  603. if (!infile.is_open()) {
  604. MS_LOG(ERROR) << "Failed to open bin file " << file_name;
  605. break;
  606. }
  607. uint64_t file_size = infile.tellg();
  608. infile.seekg(0, std::ios::beg);
  609. buffer = new std::vector<char>(file_size);
  610. if (!infile.read(buffer->data(), file_size)) {
  611. MS_LOG(ERROR) << "Failed to read in bin file " << file_name;
  612. break;
  613. }
  614. data_size = file_size;
  615. infile.close();
  616. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], data_size,
  617. type_name, shape, buffer, result_list);
  618. }
  619. }
  620. } else {
  621. MS_LOG(INFO) << "directory does not exist!";
  622. }
  623. closedir(d);
  624. } else {
  625. bool found = false;
  626. // if async mode
  627. for (const std::string &file_path : async_file_pool) {
  628. if (file_path.find(prefix_dump_file_name) != std::string::npos &&
  629. file_path.find(".output." + std::to_string(slot[i])) != std::string::npos) {
  630. found = true;
  631. shape.clear();
  632. ReadTensorFromNpy(file_path, &type_name, &data_size, &shape, &buffer);
  633. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], data_size, type_name,
  634. shape, buffer, result_list);
  635. }
  636. }
  637. // If no npy file is found, add empty tensor data.
  638. if (!found) {
  639. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], 0, type_name, shape,
  640. buffer, result_list);
  641. }
  642. }
  643. }
  644. }
  645. void ReplaceSrcFileName(const bool is_sync_mode, std::string *dump_style_name) {
  646. const std::string strsrc = "/";
  647. std::string strdst;
  648. if (is_sync_mode) {
  649. strdst = "--";
  650. } else {
  651. strdst = "_";
  652. }
  653. std::string::size_type pos = 0;
  654. std::string::size_type srclen = strsrc.size();
  655. std::string::size_type dstlen = strdst.size();
  656. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  657. dump_style_name->replace(pos, srclen, strdst);
  658. pos += dstlen;
  659. }
  660. }
  661. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  662. unsigned int iteration, std::vector<std::string> *async_file_pool) {
  663. // get a list of nodes and the devices they are on to monitor
  664. std::vector<std::shared_ptr<TensorData>> tensor_list;
  665. std::map<std::tuple<uint32_t, uint32_t>, std::unordered_set<std::string>> device_and_graph_to_nodes;
  666. for (auto w_table_item : watchpoint_table) {
  667. auto wp = std::get<1>(w_table_item);
  668. for (auto check_node : wp.check_node_list) {
  669. unsigned int index = 0;
  670. std::string w_name = std::get<0>(check_node);
  671. bool w_is_param = std::get<1>(check_node);
  672. std::string node_name = w_name;
  673. if (w_is_param) {
  674. std::size_t found = node_name.find_last_of("/");
  675. node_name = node_name.substr(found + 1);
  676. }
  677. std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
  678. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  679. for (auto device : devices) {
  680. for (auto graph : graphs) {
  681. std::tuple<uint32_t, uint32_t> key(device, graph);
  682. device_and_graph_to_nodes[key].insert(node_name);
  683. }
  684. }
  685. index++;
  686. }
  687. }
  688. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  689. // as they are found
  690. for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
  691. std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
  692. uint32_t device_id = std::get<0>(device_and_graph);
  693. uint32_t root_graph_id = std::get<1>(device_and_graph);
  694. std::unordered_set<std::string> wp_nodes = device_and_graph_item.second;
  695. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  696. std::string specific_dump_dir;
  697. if (is_sync_mode) {
  698. specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/iteration_" + std::to_string(iteration);
  699. } else {
  700. specific_dump_dir = dump_dir + "/device_" + std::to_string(device_id) + "/" + net_name + "_graph_" +
  701. std::to_string(root_graph_id) + "/" + std::to_string(root_graph_id) + "/" +
  702. std::to_string(iteration);
  703. }
  704. // convert node names to dump style
  705. for (auto node : wp_nodes) {
  706. std::string orig_name = node;
  707. std::string dump_style_name = node;
  708. ReplaceSrcFileName(is_sync_mode, &dump_style_name);
  709. if (is_sync_mode) {
  710. dump_style_name.append("_output_");
  711. }
  712. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  713. }
  714. if (!is_sync_mode) {
  715. // convert all files in proto_to_dump to npy and add to pool of async file names
  716. ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
  717. }
  718. if (is_sync_mode) {
  719. // search files in dir for the one that meets the filename prefix and read the file into memory
  720. DIR *d;
  721. d = opendir(specific_dump_dir.c_str());
  722. if (d != nullptr) {
  723. struct dirent *dir = nullptr;
  724. while ((dir = readdir(d)) != NULL) {
  725. if (dir->d_type == DT_REG) {
  726. std::string file_name = dir->d_name;
  727. for (auto &node : proto_to_dump) {
  728. std::string dump_name = std::get<1>(node);
  729. std::size_t found = 0;
  730. found = file_name.rfind(dump_name, 0);
  731. if (found == 0) {
  732. std::vector<size_t> slot_list;
  733. GetSlotInfo(file_name, dump_name, specific_dump_dir, &slot_list);
  734. for (auto slot : slot_list) {
  735. // add a TensorData entry (data will be read when needed)
  736. std::vector<int64_t> shape;
  737. std::string orig_name = std::get<0>(node);
  738. auto tensor_data = std::make_shared<TensorData>();
  739. tensor_data->SetName(orig_name);
  740. tensor_data->SetExecutionOrder(0);
  741. tensor_data->SetSlot(slot);
  742. tensor_data->SetIteration(iteration);
  743. tensor_data->SetDeviceId(device_id);
  744. tensor_data->SetRootGraphId(root_graph_id);
  745. tensor_data->SetDataPtr(NULL);
  746. tensor_data->SetByteSize(0);
  747. tensor_data->SetType("");
  748. tensor_data->SetShape(shape);
  749. tensor_list.push_back(tensor_data);
  750. }
  751. break;
  752. }
  753. }
  754. }
  755. }
  756. }
  757. } else {
  758. GetTensorDataInfoAsync(proto_to_dump, iteration, device_id, root_graph_id, *async_file_pool, &tensor_list);
  759. }
  760. }
  761. return tensor_list;
  762. }
  763. #endif
  764. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  765. std::vector<char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  766. std::vector<unsigned int> *const dtype,
  767. std::vector<std::vector<int64_t>> *const shape) {
  768. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  769. tensor_loader_->SearchTensors(name, &result_list);
  770. for (auto result : result_list) {
  771. if (!std::get<1>(result)) {
  772. continue;
  773. }
  774. ret_name->push_back(std::get<0>(result));
  775. data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
  776. data_size->push_back(std::get<1>(result)->GetByteSize());
  777. dtype->push_back(std::get<1>(result)->GetType());
  778. shape->push_back(std::get<1>(result)->GetShape());
  779. }
  780. }
  781. #ifdef ONLINE_DBG_MODE
  782. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  783. bool ret = false;
  784. for (auto w_table_item : watchpoint_table) {
  785. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  786. for (auto check_node : check_node_list) {
  787. std::string w_name = std::get<0>(check_node);
  788. bool w_type = std::get<1>(check_node);
  789. if ((w_type == true &&
  790. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  791. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  792. ret = true;
  793. return ret;
  794. }
  795. }
  796. }
  797. return ret;
  798. }
  799. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  800. if (kernel) {
  801. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  802. for (size_t j = 0; j < input_size; ++j) {
  803. auto input_kernel = kernel->input(j + 1);
  804. std::string input_kernel_name = input_kernel->fullname_with_scope();
  805. auto found = w_name.find_last_of('/');
  806. if (found != std::string::npos && w_name.substr(found + 1) == input_kernel_name) return true;
  807. }
  808. return false;
  809. } else {
  810. return false;
  811. }
  812. }
  813. #endif
// --- Thin delegation accessors over the underlying tensor loader. ---

// Clear all tensors held by the tensor loader.
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
// Return every tensor currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Return the tensors the loader has recorded for the given node name.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensorMap(const std::string &node_name) const {
  return tensor_loader_->GetNodeTensorMap(node_name);
}
// Get / set the iteration number tracked by the tensor loader.
uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }
void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }
// Drop the previous-iteration / current-iteration tensors respectively.
void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  823. #ifdef ONLINE_DBG_MODE
// Dump the named tensor to a file, forwarding all format/shape/type arguments
// unchanged to the tensor loader. Returns the loader's success flag.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  831. #endif
// Hand a freshly read tensor to the tensor loader; keep_prev controls whether
// the previous iteration's copy is retained. Returns the loader's result.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
// Return a copy of the watchpoint table (watchpoint id -> watchpoint entry).
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table;
}
// Reset the loader state between runs while keeping parameter tensors alive:
// parameters are first stashed in the previous map, everything else is cleared,
// then the maps are swapped so the parameters end up back in "current".
void DebugServices::ResetLoadedTensors() {
  // Forget which watchpoints have already been reported.
  wp_id_cache.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
}
  846. #ifdef ONLINE_DBG_MODE
  847. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  848. MS_EXCEPTION_IF_NULL(kernel);
  849. std::vector<std::shared_ptr<TensorData>> result;
  850. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  851. auto kernel_name = kernel->fullname_with_scope();
  852. for (size_t j = 0; j < output_size; ++j) {
  853. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  854. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  855. if (tensor) result.push_back(tensor);
  856. }
  857. return result;
  858. }
  859. #endif
// Ask the tensor loader whether the tensor exists in the current iteration.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Demote a tensor from the loader's current map to its previous-iteration map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// --- Simple getters/setters for the dump configuration members. ---
void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }
std::string DebugServices::GetNetName() { return net_name; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode; }
  872. #ifdef ONLINE_DBG_MODE
  873. } // namespace mindspore
  874. #endif