You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 48 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include "pybind11/embed.h"
  28. #ifdef ONLINE_DBG_MODE
  29. #include "debug/anf_ir_utils.h"
  30. #include "backend/session/anf_runtime_algorithm.h"
  31. #endif
  32. #include "debug/debugger/tensor_summary.h"
  33. #ifdef ONLINE_DBG_MODE
  34. namespace mindspore {
  35. #endif
  36. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  37. DebugServices::DebugServices(const DebugServices &other) {
  38. tensor_loader_ = other.tensor_loader_;
  39. watchpoint_table = other.watchpoint_table;
  40. }
  41. DebugServices &DebugServices::operator=(const DebugServices &other) {
  42. if (this != &other) {
  43. tensor_loader_ = other.tensor_loader_;
  44. watchpoint_table = other.watchpoint_table;
  45. }
  46. return *this;
  47. }
  48. void DebugServices::AddWatchpoint(
  49. unsigned int id, unsigned int watch_condition, float parameter,
  50. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  51. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  52. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  53. std::lock_guard<std::mutex> lg(lock_);
  54. watchpoint_t watchpoint_item;
  55. watchpoint_item.id = id;
  56. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  57. watchpoint_item.condition.parameter = parameter;
  58. watchpoint_item.check_node_list = check_node_list;
  59. if (check_node_device_list != nullptr) {
  60. watchpoint_item.check_node_device_list = *check_node_device_list;
  61. }
  62. if (check_node_graph_list != nullptr) {
  63. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  64. }
  65. watchpoint_item.parameter_list = parameter_list;
  66. watchpoint_table[id] = watchpoint_item;
  67. }
  68. void DebugServices::RemoveWatchpoint(unsigned int id) {
  69. std::lock_guard<std::mutex> lg(lock_);
  70. watchpoint_table.erase(id);
  71. }
  72. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  73. void *const previous_tensor_ptr, uint32_t num_elements,
  74. int tensor_dtype) {
  75. switch (tensor_dtype) {
  76. case DbgDataType::DT_UINT8: {
  77. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  78. }
  79. case DbgDataType::DT_INT8: {
  80. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  81. }
  82. case DbgDataType::DT_UINT16: {
  83. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  84. }
  85. case DbgDataType::DT_INT16: {
  86. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  87. }
  88. case DbgDataType::DT_UINT32: {
  89. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  90. }
  91. case DbgDataType::DT_INT32:
  92. case DbgDataType::DT_BASE_INT: {
  93. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  94. }
  95. case DbgDataType::DT_UINT64: {
  96. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  97. }
  98. case DbgDataType::DT_INT64: {
  99. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  100. }
  101. case DbgDataType::DT_FLOAT16: {
  102. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  103. }
  104. case DbgDataType::DT_FLOAT32:
  105. case DbgDataType::DT_BASE_FLOAT: {
  106. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  107. }
  108. case DbgDataType::DT_FLOAT64: {
  109. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  110. }
  111. case DbgDataType::DT_BOOL: {
  112. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  113. }
  114. default:
  115. MS_LOG(INFO) << "Unsupported tensor type";
  116. // return a null pointer
  117. return std::unique_ptr<TensorSummary<int32_t>>{};
  118. }
  119. }
  120. #ifdef OFFLINE_DBG_MODE
  121. void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
  122. void *previous_tensor_ptr = nullptr;
  123. std::shared_ptr<TensorData> tensor_prev;
  124. if (previous_iter_tensor_needed && tensor->GetIteration() > 1) {
  125. // read data in offline mode
  126. std::vector<std::string> file_paths;
  127. if (!is_sync_mode) {
  128. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  129. std::vector<unsigned int>{tensor->GetDeviceId()},
  130. std::vector<unsigned int>{tensor->GetIteration() - 1},
  131. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  132. }
  133. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  134. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  135. std::vector<unsigned int>{tensor->GetDeviceId()},
  136. std::vector<unsigned int>{tensor->GetIteration() - 1},
  137. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  138. file_paths, &result_list_prev);
  139. tensor_prev = result_list_prev[0];
  140. if (!tensor_prev->GetByteSize()) {
  141. tensor_prev.reset();
  142. } else {
  143. previous_tensor_ptr = tensor_prev->GetDataPtr();
  144. }
  145. }
  146. return previous_tensor_ptr;
  147. }
  148. #endif
  149. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  150. const std::string &tensor_name, const std::string &tensor_name_no_slot,
  151. bool *previous_iter_tensor_needed, std::string *const qualified_tensor_name,
  152. std::vector<watchpoint_t> *const watchpoints_to_check) {
  153. for (auto w_table_item : watchpoint_table) {
  154. auto wp = std::get<1>(w_table_item);
  155. // check ONLY init conditions on initial suspended state.
  156. // skip other conditions on initial suspended state
  157. if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
  158. // skip init condition if not init suspend
  159. if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
  160. // check change conditions only on step end.
  161. if (wp.change_condition() && !step_end) continue;
  162. // if recheck, ignore the cache results and reanalyze everything.
  163. // if not a recheck, check only unanalyzed tensors
  164. if (!recheck && wp_id_cache[tensor_name].count(wp.id)) continue;
  165. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
  166. if (!found.empty()) {
  167. *qualified_tensor_name = found;
  168. watchpoints_to_check->push_back(w_table_item.second);
  169. #ifdef OFFLINE_DBG_MODE
  170. if (wp.change_condition()) {
  171. *previous_iter_tensor_needed = true;
  172. }
  173. #endif
  174. }
  175. }
  176. }
  177. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  178. const std::string &tensor_name) {
  179. // add analyzed tensor to cache
  180. if (!recheck) {
  181. wp_id_cache[tensor_name].insert(id);
  182. }
  183. }
// Worker body for CheckWatchpoints: scans tensors [begin, end) of *tensor_list
// against the registered watchpoints and appends every hit into the chunk_*
// output vectors at index chunk_id. Each worker owns a distinct chunk_id slot
// (assigned by CheckWatchpoints), so the chunk vectors need no locking here.
// In offline mode the tensor data is (re)read from dump files per tensor and
// released again at the end of each iteration.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *chunk_names, partitioned_names *chunk_slots, partitioned_numbers *chunk_conditions,
  partitioned_id *const chunk_watchpoint_id, partitioned_parameters *chunk_parameters,
  partitioned_error_code *chunk_error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, partitioned_numbers *chunk_exec_orders,
  std::vector<std::shared_ptr<TensorData>> *tensor_list, int begin, int end, int chunk_id, const bool init_dbg_suspend,
  const bool step_end, const bool recheck, partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
  std::vector<uint64_t> *chunk_tensor_byte_size, std::vector<unsigned int> *device_id,
  std::vector<unsigned int> *root_graph_id) {
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list);
    tensor = result_list[0];
    // A zero byte size means the dump could not be read; skip this tensor.
    if (!tensor->GetByteSize()) {
      tensor.reset();
      continue;
    }
#endif
    const auto tensor_name = tensor->GetName();
    // Node name is everything before the ":<slot>" suffix.
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    // no elements to analyze
    if (tensor->GetByteSize() == 0) continue;
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    // Add do nothing line in case offline debug is off, prevent unused var warning
    (void)previous_iter_tensor_needed;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
                          &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) continue;
    uint32_t num_elements = tensor->GetNumElements();
#ifdef OFFLINE_DBG_MODE
    void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
#else
    // Online mode: previous data, if any, comes from the tensor loader cache.
    void *previous_tensor_ptr =
      tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // A summary is only computed when at least one watchpoint is not a pure
    // overflow check (overflow hits do not need tensor statistics).
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        // Overflow hits are detected by node-name membership in op_overflows.
        is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        // Record the hit (or error) in this worker's chunk, one entry per
        // parallel vector; they are merged/sorted by CheckWatchpoints.
        (*chunk_exec_orders)[chunk_id].push_back(tensor->GetExecutionOrder());
        (*chunk_names)[chunk_id].push_back(qualified_tensor_name);
        (*chunk_slots)[chunk_id].push_back(tensor_slot);
        (*chunk_conditions)[chunk_id].push_back(wp.condition.type);
        (*chunk_watchpoint_id)[chunk_id].push_back(wp.id);
        if (device_id != nullptr) {
          (*chunk_device_id)[chunk_id].push_back(tensor->GetDeviceId());
        }
        if (root_graph_id != nullptr) {
          (*chunk_root_graph_id)[chunk_id].push_back(tensor->GetRootGraphId());
        }
        (*chunk_parameters)[chunk_id].push_back(parameter_list);
        (*chunk_error_codes)[chunk_id].push_back(error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
  274. void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
  275. std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
  276. std::vector<std::vector<parameter_t>> *const parameters,
  277. std::vector<int32_t> *const error_codes,
  278. const std::vector<std::string> &op_overflows,
  279. const std::vector<std::string> &async_file_pool,
  280. std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
  281. const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
  282. std::vector<unsigned int> *root_graph_id) {
  283. std::lock_guard<std::mutex> lg(lock_);
  284. auto t1 = std::chrono::high_resolution_clock::now();
  285. if (watchpoint_table.empty()) return;
  286. // vector to store execution order of tensors hit
  287. std::vector<int> exec_order;
  288. int tensor_list_size = tensor_list->size();
  289. uint64_t tensor_list_byte_size = 0;
  290. MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  291. if (tensor_list_size == 0) return;
  292. // default value for number of threads
  293. int max_thread_num = 32;
  294. auto thread_num = getenv("MS_dbg_num_thread");
  295. if (thread_num != nullptr) {
  296. max_thread_num = std::stoi(thread_num);
  297. }
  298. if (max_thread_num > tensor_list_size) {
  299. max_thread_num = tensor_list_size;
  300. }
  301. MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  302. int chunk_size = tensor_list_size / max_thread_num;
  303. int remainder = tensor_list_size % max_thread_num;
  304. partitioned_numbers chunk_exec_orders(max_thread_num);
  305. partitioned_names chunk_names(max_thread_num);
  306. partitioned_names chunk_slots(max_thread_num);
  307. partitioned_numbers chunk_conditions(max_thread_num);
  308. partitioned_id chunk_watchpoint_id(max_thread_num);
  309. partitioned_parameters chunk_parameters(max_thread_num);
  310. partitioned_error_code chunk_error_codes(max_thread_num);
  311. partitioned_id chunk_device_id(max_thread_num);
  312. partitioned_id chunk_root_graph_id(max_thread_num);
  313. std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  314. std::vector<std::future<void>> tensor_future_vec;
  315. int begin = 0;
  316. int end = begin;
  317. for (int i = 0; i < max_thread_num; i++) {
  318. end += chunk_size;
  319. if (remainder > 0) {
  320. end++;
  321. remainder--;
  322. }
  323. tensor_future_vec.push_back(
  324. std::async(std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
  325. &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows,
  326. async_file_pool, &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck,
  327. &chunk_device_id, &chunk_root_graph_id, &chunk_tensor_byte_size, device_id, root_graph_id));
  328. begin = end;
  329. }
  330. for (unsigned int i = 0; i < tensor_future_vec.size(); i++) {
  331. tensor_future_vec[i].wait();
  332. tensor_future_vec[i].get();
  333. for (unsigned int j = 0; j < chunk_exec_orders[i].size(); j++) {
  334. std::vector<int>::iterator iter;
  335. iter = std::lower_bound(exec_order.begin(), exec_order.end(), chunk_exec_orders[i][j]);
  336. // if the execution order is repeated,inserts the new one before the others with same execution order.
  337. int position = iter - exec_order.begin();
  338. exec_order.insert(iter, chunk_exec_orders[i][j]);
  339. name->insert(name->begin() + position, chunk_names[i][j]);
  340. slot->insert(slot->begin() + position, chunk_slots[i][j]);
  341. condition->insert(condition->begin() + position, chunk_conditions[i][j]);
  342. watchpoint_id->insert(watchpoint_id->begin() + position, chunk_watchpoint_id[i][j]);
  343. if (device_id != nullptr) {
  344. device_id->insert(device_id->begin() + position, chunk_device_id[i][j]);
  345. }
  346. if (root_graph_id != nullptr) {
  347. root_graph_id->insert(root_graph_id->begin() + position, chunk_root_graph_id[i][j]);
  348. }
  349. parameters->insert(parameters->begin() + position, chunk_parameters[i][j]);
  350. error_codes->insert(error_codes->begin() + position, chunk_error_codes[i][j]);
  351. }
  352. // free the memory for used vectors
  353. std::vector<int>().swap(chunk_exec_orders[i]);
  354. std::vector<std::string>().swap(chunk_names[i]);
  355. std::vector<std::string>().swap(chunk_slots[i]);
  356. std::vector<int>().swap(chunk_conditions[i]);
  357. std::vector<unsigned int>().swap(chunk_watchpoint_id[i]);
  358. std::vector<std::vector<parameter_t>>().swap(chunk_parameters[i]);
  359. std::vector<int32_t>().swap(chunk_error_codes[i]);
  360. std::vector<unsigned int>().swap(chunk_device_id[i]);
  361. std::vector<unsigned int>().swap(chunk_root_graph_id[i]);
  362. tensor_list_byte_size += chunk_tensor_byte_size[i];
  363. }
  364. auto t2 = std::chrono::high_resolution_clock::now();
  365. std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  366. MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  367. MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
  368. }
  369. #ifdef OFFLINE_DBG_MODE
  370. void DebugServices::ReadTensorFromNpy(const std::string &file_name, std::string *tensor_type, std::size_t *size,
  371. std::vector<int64_t> *shape, std::vector<char> **data_buffer) {
  372. std::ifstream infile;
  373. std::string file_path = file_name;
  374. MS_LOG(INFO) << "Reading in file: " << file_path;
  375. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  376. if (!infile.is_open()) {
  377. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path;
  378. return;
  379. }
  380. uint64_t file_size = infile.tellg();
  381. infile.seekg(0, std::ios::beg);
  382. auto buffer = std::make_unique<std::vector<char>>(file_size);
  383. if (!infile.read(buffer->data(), file_size)) {
  384. MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path;
  385. return;
  386. }
  387. uint16_t header_len = *reinterpret_cast<uint16_t *>(buffer->data() + 8);
  388. std::string header(buffer->data() + 9, header_len);
  389. std::size_t type_i = header.find("descr") + 10;
  390. *tensor_type = header.substr(type_i, 2);
  391. std::size_t shape_i_open = header.find("(");
  392. std::size_t shape_i_close = header.find(")");
  393. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  394. std::string intermediate;
  395. std::stringstream check_shape(shape_str);
  396. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  397. while (getline(check_shape, intermediate, ',')) {
  398. shape->push_back(std::stoi(intermediate));
  399. }
  400. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  401. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  402. std::size_t data_size = data_len * word_size;
  403. infile.seekg(header_len + 10);
  404. *data_buffer = new std::vector<char>(data_size);
  405. if (data_buffer == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  406. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  407. }
  408. *size = data_size;
  409. }
  410. void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
  411. std::vector<std::string> *result_list) {
  412. std::string file_format = "npy";
  413. for (auto const &d : dir_to_files_map) {
  414. std::vector<std::string> files_to_convert_in_dir;
  415. std::string dump_key = d.first;
  416. for (auto const &file_name : d.second) {
  417. bool already_converted = false;
  418. for (std::string &file_found : *result_list) {
  419. if (file_found.find(file_name) != std::string::npos) {
  420. already_converted = true;
  421. }
  422. }
  423. if (!already_converted) {
  424. files_to_convert_in_dir.push_back(dump_key + "/" + file_name);
  425. }
  426. }
  427. std::ostringstream input_file_o;
  428. const char *const delim = " ";
  429. std::copy(files_to_convert_in_dir.begin(), files_to_convert_in_dir.end(),
  430. std::ostream_iterator<std::string>(input_file_o, delim));
  431. std::string input_files = input_file_o.str();
  432. MS_LOG(INFO) << "Ops to convert: " << input_files;
  433. if (input_files != "") {
  434. // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
  435. // later task.
  436. try {
  437. auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
  438. std::string convert_pkg_path = pkg.attr("__file__").cast<std::string>();
  439. MS_LOG(INFO) << "The file for converting async dump data is in " << convert_pkg_path;
  440. std::string convert_command = "python " + convert_pkg_path + " -out " + dump_key + " -t " + file_format +
  441. " -d " + dump_key + " -f NCHW -l " + input_files;
  442. (void)(system(convert_command.c_str()) + 1);
  443. } catch (pybind11::error_already_set &e) {
  444. MS_LOG(EXCEPTION) << "Can't find package mindspore.offline_debug.convert_async";
  445. }
  446. DIR *d_handle;
  447. d_handle = opendir(dump_key.c_str());
  448. if (d_handle != nullptr) {
  449. struct dirent *dir = nullptr;
  450. while ((dir = readdir(d_handle)) != NULL) {
  451. if (dir->d_type == DT_REG) {
  452. std::string candidate = dir->d_name;
  453. for (const std::string &file_to_find : files_to_convert_in_dir) {
  454. std::string file_n = file_to_find.substr(file_to_find.find_last_of("\\/") + 1);
  455. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  456. // we found a converted file for this op
  457. std::string found_file = dump_key + "/" + candidate;
  458. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  459. result_list->push_back(found_file);
  460. }
  461. }
  462. }
  463. }
  464. }
  465. }
  466. }
  467. }
  468. }
  469. void GetNodeNameWithoutScope(std::string *dump_style_name) {
  470. if (dump_style_name == nullptr) {
  471. return;
  472. }
  473. std::string node_name_without_scope = *dump_style_name;
  474. std::size_t last_scope_marker;
  475. std::string delim = "/";
  476. last_scope_marker = node_name_without_scope.rfind(delim);
  477. if (last_scope_marker != std::string::npos) {
  478. node_name_without_scope = node_name_without_scope.substr(last_scope_marker + delim.size());
  479. }
  480. *dump_style_name = node_name_without_scope;
  481. }
  482. void ReplaceSrcFileName(std::string *dump_style_name) {
  483. if (dump_style_name == nullptr) {
  484. return;
  485. }
  486. const std::string strsrc = "/";
  487. std::string strdst = "_";
  488. std::string::size_type pos = 0;
  489. std::string::size_type srclen = strsrc.size();
  490. std::string::size_type dstlen = strdst.size();
  491. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  492. dump_style_name->replace(pos, srclen, strdst);
  493. pos += dstlen;
  494. }
  495. }
  496. void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
  497. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  498. std::vector<unsigned int> root_graph_id, std::vector<std::string> *result_list) {
  499. std::string file_format = "npy";
  500. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  501. for (unsigned int i = 0; i < backend_name.size(); i++) {
  502. // form prefix of the tensor file to read from graph pb node name
  503. std::string dump_style_kernel_name = backend_name[i];
  504. ReplaceSrcFileName(&dump_style_kernel_name);
  505. // remove slot from name
  506. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  507. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  508. std::string prefix_dump_file_name = dump_style_kernel_name;
  509. std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
  510. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  511. // search files in dir for the one that meets the filename prefix and read the file into memory
  512. DIR *d;
  513. d = opendir(specific_dump_dir.c_str());
  514. if (d != nullptr) {
  515. struct dirent *dir = nullptr;
  516. while ((dir = readdir(d)) != NULL) {
  517. if (dir->d_type == DT_REG) {
  518. std::string file_name = dir->d_name;
  519. std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
  520. if (file_name_w_o_perfix.rfind(prefix_dump_file_name, 0) == 0 &&
  521. file_name.rfind(file_format) == std::string::npos) {
  522. // if file matches prefix and is in device format add to candidate files to convert.
  523. dir_to_files_map[specific_dump_dir].push_back(file_name);
  524. } else if (file_name_w_o_perfix.rfind(prefix_dump_file_name, 0) == 0 &&
  525. file_name.rfind(file_format) != std::string::npos) {
  526. // otherwise, if file matches prefix and already has been converted to host format
  527. // add to result of converted files.
  528. std::string found_file = specific_dump_dir + "/" + file_name;
  529. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  530. result_list->push_back(found_file);
  531. }
  532. }
  533. }
  534. }
  535. }
  536. closedir(d);
  537. }
  538. ConvertToHostFormat(dir_to_files_map, result_list);
  539. }
  540. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  541. const std::string &specific_dump_dir,
  542. std::vector<std::string> *result_list) {
  543. std::string file_format = "npy";
  544. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  545. for (const auto &node : proto_dump) {
  546. std::string dump_name = std::get<1>(node);
  547. dump_name = dump_name.substr(0, dump_name.rfind("."));
  548. // search files in dir for the one that meets the filename prefix and read the file into memory
  549. DIR *d;
  550. d = opendir(specific_dump_dir.c_str());
  551. if (d != nullptr) {
  552. struct dirent *dir = nullptr;
  553. while ((dir = readdir(d)) != NULL) {
  554. if (dir->d_type == DT_REG) {
  555. std::string file_name = dir->d_name;
  556. std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
  557. if (file_name_w_o_perfix.rfind(dump_name, 0) == 0 && file_name.rfind(file_format) == std::string::npos) {
  558. // if file matches prefix and is in device format add to candidate files to convert.
  559. dir_to_files_map[specific_dump_dir].push_back(file_name);
  560. } else if (file_name_w_o_perfix.rfind(dump_name, 0) == 0 &&
  561. file_name.rfind(file_format) != std::string::npos) {
  562. // otherwise, if file matches prefix and already has been converted to host format
  563. // add to result of converted files.
  564. std::string found_file = specific_dump_dir + "/" + file_name;
  565. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  566. result_list->push_back(found_file);
  567. }
  568. }
  569. }
  570. }
  571. }
  572. closedir(d);
  573. }
  574. ConvertToHostFormat(dir_to_files_map, result_list);
  575. }
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *tensor_list) {
  // For every watched node, find its converted dump files in `async_file_pool`,
  // extract the slot numbers encoded in the filenames, and append one
  // placeholder TensorData per slot to `tensor_list`. The tensor payload is not
  // read here (data ptr NULL, byte size 0) — it is loaded later on demand.
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      std::size_t found_dot_start = file_name.find(".", found_out);
      // NOTE(review): find(".", found_dot_start) starts at the dot just found
      // and so returns found_dot_start itself, which makes the substr length
      // below wrap to npos. stoul still parses correctly because it stops at
      // the first non-digit, but confirm whether "found_dot_start + 1" was the
      // intended start position here.
      std::size_t found_dot_end = file_name.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        // The slot number sits immediately after the input/output marker's dot.
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(NULL);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_list->push_back(tensor_data);
    }
  }
}
  618. void DebugServices::AddToTensorData(const std::string &backend_name, const std::size_t slot,
  619. const unsigned int iteration, const unsigned int device_id,
  620. const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
  621. const std::string &type_name, const std::vector<int64_t> &shape,
  622. std::vector<char> *buffer, std::vector<std::shared_ptr<TensorData>> *result_list) {
  623. // call LoadNewTensor to store tensor in internal cache
  624. auto tensor_data = std::make_shared<TensorData>();
  625. tensor_data->SetName(backend_name);
  626. tensor_data->SetExecutionOrder(0);
  627. tensor_data->SetSlot(slot);
  628. tensor_data->SetIteration(iteration);
  629. tensor_data->SetDeviceId(device_id);
  630. tensor_data->SetRootGraphId(root_graph_id);
  631. tensor_data->SetIsOutput(is_output);
  632. if (data_size) {
  633. tensor_data->SetDataPtr(buffer->data());
  634. } else {
  635. tensor_data->SetDataPtr(NULL);
  636. }
  637. tensor_data->SetByteSize(data_size);
  638. tensor_data->SetType(type_name);
  639. tensor_data->SetShape(shape);
  640. if (data_size) {
  641. tensor_loader_->LoadNewTensor(tensor_data, false);
  642. }
  643. // add to result_list
  644. result_list->push_back(tensor_data);
  645. }
  646. void DebugServices::SetPrefixToCheck(std::string *prefix_dump_file_name, std::string *slot_string_to_check,
  647. std::string *dump_style_kernel_name, size_t slot, bool is_output) {
  648. std::string dump_style_name_part = *dump_style_kernel_name;
  649. GetNodeNameWithoutScope(&dump_style_name_part);
  650. std::string slot_str;
  651. if (is_output) {
  652. slot_str = ".output." + std::to_string(slot);
  653. } else {
  654. slot_str = ".input." + std::to_string(slot);
  655. }
  656. dump_style_name_part += slot_str;
  657. *prefix_dump_file_name = dump_style_name_part;
  658. *slot_string_to_check = slot_str;
  659. }
  660. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  661. // get file with the newest timestamp from the list.
  662. std::string newest_file;
  663. if (file_list.empty()) {
  664. return newest_file;
  665. }
  666. std::sort(file_list.begin(), file_list.end());
  667. return file_list.back();
  668. }
  669. void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
  670. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  671. std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
  672. const std::vector<std::string> &async_file_pool,
  673. std::vector<std::shared_ptr<TensorData>> *result_list) {
  674. for (unsigned int i = 0; i < backend_name.size(); i++) {
  675. // form prefix of the tensor file to read from graph pb node name
  676. std::string dump_style_kernel_name = backend_name[i];
  677. // remove slot from name
  678. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  679. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  680. std::string slot_string_to_check;
  681. std::string prefix_dump_file_name;
  682. SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
  683. std::string prefix_dump_to_check = dump_style_kernel_name;
  684. GetNodeNameWithoutScope(&prefix_dump_to_check);
  685. std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
  686. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  687. // search files in dir for the one that meets the filename prefix and read the file into memory
  688. std::vector<char> *buffer = NULL;
  689. std::string type_name = "";
  690. std::vector<int64_t> shape;
  691. uint64_t data_size = 0;
  692. if (is_sync_mode) {
  693. DIR *d;
  694. d = opendir(specific_dump_dir.c_str());
  695. bool found_file = false;
  696. std::vector<std::string> matched_paths;
  697. if (d != nullptr) {
  698. struct dirent *dir = nullptr;
  699. while ((dir = readdir(d)) != NULL) {
  700. if (dir->d_type == DT_REG) {
  701. std::string file_name = dir->d_name;
  702. std::string stripped_file_name = GetStrippedFilename(file_name);
  703. if (stripped_file_name.empty()) {
  704. continue;
  705. }
  706. std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
  707. if (found != 0) {
  708. continue;
  709. }
  710. std::string full_path = specific_dump_dir + "/" + file_name;
  711. matched_paths.push_back(full_path);
  712. found_file = true;
  713. }
  714. }
  715. } else {
  716. MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  717. }
  718. if (found_file) {
  719. shape.clear();
  720. std::string result_path = GetNewestFilePath(matched_paths);
  721. ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
  722. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], data_size,
  723. type_name, shape, buffer, result_list);
  724. } else {
  725. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
  726. type_name, shape, buffer, result_list);
  727. MS_LOG(INFO) << "Target tensor has not been found.";
  728. }
  729. closedir(d);
  730. } else {
  731. bool found = false;
  732. // if async mode
  733. for (const std::string &file_path : async_file_pool) {
  734. if (file_path.find(specific_dump_dir) != std::string::npos &&
  735. file_path.find(prefix_dump_to_check) != std::string::npos &&
  736. file_path.find(slot_string_to_check) != std::string::npos) {
  737. found = true;
  738. shape.clear();
  739. ReadTensorFromNpy(file_path, &type_name, &data_size, &shape, &buffer);
  740. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i],
  741. data_size, type_name, shape, buffer, result_list);
  742. }
  743. }
  744. // If no npy file is found, add empty tensor data.
  745. if (!found) {
  746. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
  747. type_name, shape, buffer, result_list);
  748. }
  749. }
  750. }
  751. }
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  // Dump filenames are dot-separated fields; this keeps the node-name and
  // input/output/slot fields and drops the volatile ones so that two dumps of
  // the same tensor compare equal. Returns an empty string when the name does
  // not contain enough dots to be a well-formed dump filename.
  size_t first_dot = file_name.find(".");
  // Dot positions are located from the back because the node name itself may
  // contain dots; "seventh"/"fifth" name the dot's ordinal in a well-formed
  // dump filename counted from the front.
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  // NOTE(review): if the name has very few dots, the rfind chain above already
  // operated on npos-derived indices before this guard runs — presumably the
  // callers only pass names with at least one dot; confirm inputs are
  // pre-filtered upstream.
  if (fifth_dot == std::string::npos) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  // Walk back three dots (5 -> 4 -> 3 -> 2) from fifth_dot to land on dot #2.
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  // Keep [after first dot, second dot) plus [fifth dot, seventh dot).
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
  771. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  772. unsigned int iteration, std::vector<std::string> *async_file_pool) {
  773. // get a list of nodes and the devices they are on to monitor
  774. std::vector<std::shared_ptr<TensorData>> tensor_list;
  775. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
  776. for (auto w_table_item : watchpoint_table) {
  777. auto wp = std::get<1>(w_table_item);
  778. for (auto check_node : wp.check_node_list) {
  779. unsigned int index = 0;
  780. std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
  781. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  782. for (auto device : devices) {
  783. for (auto graph : graphs) {
  784. std::tuple<uint32_t, uint32_t> key(device, graph);
  785. device_and_graph_to_nodes[key].push_back(check_node);
  786. }
  787. }
  788. index++;
  789. }
  790. }
  791. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  792. // as they are found
  793. for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
  794. std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
  795. uint32_t device_id = std::get<0>(device_and_graph);
  796. uint32_t root_graph_id = std::get<1>(device_and_graph);
  797. std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
  798. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  799. std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
  800. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  801. // convert node names to dump style
  802. for (auto node : wp_nodes) {
  803. std::string orig_name = std::get<0>(node);
  804. std::string dump_style_name = orig_name;
  805. if (is_sync_mode) {
  806. // In sync mode, remove the scope from the fully qualified name to compare.
  807. GetNodeNameWithoutScope(&dump_style_name);
  808. } else {
  809. // In async mode, keep the scope but replace delimiter with '_' in node name to compare.
  810. ReplaceSrcFileName(&dump_style_name);
  811. }
  812. bool node_is_out = std::get<1>(node);
  813. if (node_is_out) {
  814. dump_style_name += ".output";
  815. } else {
  816. dump_style_name += ".input";
  817. }
  818. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  819. }
  820. if (!is_sync_mode) {
  821. // convert all files in proto_to_dump to npy and add to pool of async file names
  822. ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
  823. }
  824. if (is_sync_mode) {
  825. // search files in dir for the one that meets the filename prefix and read the file into memory
  826. DIR *d;
  827. d = opendir(specific_dump_dir.c_str());
  828. if (d != nullptr) {
  829. struct dirent *dir = nullptr;
  830. while ((dir = readdir(d)) != NULL) {
  831. if (dir->d_type == DT_REG) {
  832. std::string file_name = dir->d_name;
  833. for (auto &node : proto_to_dump) {
  834. std::string dump_name = std::get<1>(node);
  835. std::string stripped_file_name = GetStrippedFilename(file_name);
  836. if (stripped_file_name.empty()) {
  837. continue;
  838. }
  839. std::size_t found = stripped_file_name.rfind(dump_name, 0);
  840. if (found == 0) {
  841. size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
  842. std::vector<int64_t> shape;
  843. std::string orig_name = std::get<0>(node);
  844. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  845. bool output_flag = (output_str == "output");
  846. AddToTensorData(orig_name, slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, NULL,
  847. &tensor_list);
  848. break;
  849. }
  850. }
  851. }
  852. }
  853. }
  854. } else {
  855. GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
  856. &tensor_list);
  857. }
  858. }
  859. return tensor_list;
  860. }
  861. std::string DebugServices::IterationString(unsigned int iteration) {
  862. std::string iteration_string;
  863. bool init_dbg_suspend = (iteration == UINT_MAX);
  864. if (init_dbg_suspend) {
  865. iteration_string = "init";
  866. } else {
  867. iteration_string = std::to_string(iteration);
  868. }
  869. return iteration_string;
  870. }
  871. #endif
  872. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  873. std::vector<char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  874. std::vector<unsigned int> *const dtype,
  875. std::vector<std::vector<int64_t>> *const shape) {
  876. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  877. tensor_loader_->SearchTensors(name, &result_list);
  878. for (auto result : result_list) {
  879. if (!std::get<1>(result)) {
  880. continue;
  881. }
  882. ret_name->push_back(std::get<0>(result));
  883. data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
  884. data_size->push_back(std::get<1>(result)->GetByteSize());
  885. dtype->push_back(std::get<1>(result)->GetType());
  886. shape->push_back(std::get<1>(result)->GetShape());
  887. }
  888. }
  889. #ifdef ONLINE_DBG_MODE
  890. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  891. bool ret = false;
  892. for (auto w_table_item : watchpoint_table) {
  893. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  894. for (auto check_node : check_node_list) {
  895. std::string w_name = std::get<0>(check_node);
  896. bool w_type = std::get<1>(check_node);
  897. if ((w_type == true &&
  898. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  899. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  900. ret = true;
  901. return ret;
  902. }
  903. }
  904. }
  905. return ret;
  906. }
  907. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  908. if (kernel) {
  909. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  910. for (size_t j = 0; j < input_size; ++j) {
  911. auto input_kernel = kernel->input(j + 1);
  912. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  913. auto found = w_name.find_last_of('/');
  914. if (found != std::string::npos && w_name.substr(found + 1) == input_kernel_name) return true;
  915. }
  916. return false;
  917. } else {
  918. return false;
  919. }
  920. }
  921. #endif
// Thin delegation wrappers around the underlying TensorLoader cache.
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Returns every cached tensor belonging to the given node.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensorMap(const std::string &node_name) const {
  return tensor_loader_->GetNodeTensorMap(node_name);
}
uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }
void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }
void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  931. #ifdef ONLINE_DBG_MODE
// Delegates tensor-to-file dumping to the tensor loader and returns its
// success flag; all parameters are forwarded unchanged.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  939. #endif
// Stores `tensor` in the loader cache; `keep_prev` preserves the previous
// iteration's copy of the same tensor.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
// NOTE(review): returns the table by value, so every call copies the whole
// map — callers get an independent snapshot; confirm that is intended before
// calling this on a hot path.
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table;
}
void DebugServices::ResetLoadedTensors() {
  // Clear the watchpoint-hit id cache and rotate the tensor cache between
  // iterations: parameters are carried over, everything else is dropped.
  wp_id_cache.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
}
  954. #ifdef ONLINE_DBG_MODE
  955. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  956. MS_EXCEPTION_IF_NULL(kernel);
  957. std::vector<std::shared_ptr<TensorData>> result;
  958. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  959. auto kernel_name = GetKernelNodeName(kernel);
  960. for (size_t j = 0; j < output_size; ++j) {
  961. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  962. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  963. if (tensor) result.push_back(tensor);
  964. }
  965. return result;
  966. }
  967. #endif
// Per-tensor cache queries/moves, delegated to the tensor loader.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Simple accessors/mutators for the debugger's configuration state.
void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }
std::string DebugServices::GetNetName() { return net_name; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode; }
  980. #ifdef ONLINE_DBG_MODE
  981. } // namespace mindspore
  982. #endif