You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 49 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include "pybind11/embed.h"
  28. #ifdef ONLINE_DBG_MODE
  29. #include "debug/anf_ir_utils.h"
  30. #include "backend/session/anf_runtime_algorithm.h"
  31. #endif
  32. #include "debug/debugger/tensor_summary.h"
  33. #ifdef ONLINE_DBG_MODE
  34. namespace mindspore {
  35. #endif
  36. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  37. DebugServices::DebugServices(const DebugServices &other) {
  38. tensor_loader_ = other.tensor_loader_;
  39. watchpoint_table = other.watchpoint_table;
  40. }
  41. DebugServices &DebugServices::operator=(const DebugServices &other) {
  42. if (this != &other) {
  43. tensor_loader_ = other.tensor_loader_;
  44. watchpoint_table = other.watchpoint_table;
  45. }
  46. return *this;
  47. }
  48. void DebugServices::AddWatchpoint(
  49. unsigned int id, unsigned int watch_condition, float parameter,
  50. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  51. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  52. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  53. std::lock_guard<std::mutex> lg(lock_);
  54. watchpoint_t watchpoint_item;
  55. watchpoint_item.id = id;
  56. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  57. watchpoint_item.condition.parameter = parameter;
  58. watchpoint_item.check_node_list = check_node_list;
  59. if (check_node_device_list != nullptr) {
  60. watchpoint_item.check_node_device_list = *check_node_device_list;
  61. }
  62. if (check_node_graph_list != nullptr) {
  63. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  64. }
  65. watchpoint_item.parameter_list = parameter_list;
  66. watchpoint_table[id] = watchpoint_item;
  67. }
  68. void DebugServices::RemoveWatchpoint(unsigned int id) {
  69. std::lock_guard<std::mutex> lg(lock_);
  70. watchpoint_table.erase(id);
  71. }
  72. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  73. void *const previous_tensor_ptr, uint32_t num_elements,
  74. int tensor_dtype) {
  75. switch (tensor_dtype) {
  76. case DbgDataType::DT_UINT8: {
  77. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  78. }
  79. case DbgDataType::DT_INT8: {
  80. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  81. }
  82. case DbgDataType::DT_UINT16: {
  83. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  84. }
  85. case DbgDataType::DT_INT16: {
  86. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  87. }
  88. case DbgDataType::DT_UINT32: {
  89. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  90. }
  91. case DbgDataType::DT_INT32:
  92. case DbgDataType::DT_BASE_INT: {
  93. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  94. }
  95. case DbgDataType::DT_UINT64: {
  96. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  97. }
  98. case DbgDataType::DT_INT64: {
  99. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  100. }
  101. case DbgDataType::DT_FLOAT16: {
  102. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  103. }
  104. case DbgDataType::DT_FLOAT32:
  105. case DbgDataType::DT_BASE_FLOAT: {
  106. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  107. }
  108. case DbgDataType::DT_FLOAT64: {
  109. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  110. }
  111. case DbgDataType::DT_BOOL: {
  112. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
  113. }
  114. default:
  115. MS_LOG(INFO) << "Unsupported tensor type";
  116. // return a null pointer
  117. return std::unique_ptr<TensorSummary<int32_t>>{};
  118. }
  119. }
  120. #ifdef OFFLINE_DBG_MODE
  121. void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
  122. void *previous_tensor_ptr = nullptr;
  123. std::shared_ptr<TensorData> tensor_prev;
  124. if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
  125. // read data in offline mode
  126. std::vector<std::string> file_paths;
  127. if (!is_sync_mode) {
  128. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  129. std::vector<unsigned int>{tensor->GetDeviceId()},
  130. std::vector<unsigned int>{tensor->GetIteration() - 1},
  131. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  132. }
  133. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  134. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  135. std::vector<unsigned int>{tensor->GetDeviceId()},
  136. std::vector<unsigned int>{tensor->GetIteration() - 1},
  137. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  138. file_paths, &result_list_prev);
  139. tensor_prev = result_list_prev[0];
  140. if (!tensor_prev->GetByteSize()) {
  141. tensor_prev.reset();
  142. } else {
  143. previous_tensor_ptr = tensor_prev->GetDataPtr();
  144. }
  145. }
  146. return previous_tensor_ptr;
  147. }
  148. #endif
  149. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  150. const std::string &tensor_name, const std::string &tensor_name_no_slot,
  151. bool *previous_iter_tensor_needed, std::string *const qualified_tensor_name,
  152. std::vector<watchpoint_t> *const watchpoints_to_check) {
  153. for (auto w_table_item : watchpoint_table) {
  154. auto wp = std::get<1>(w_table_item);
  155. // check ONLY init conditions on initial suspended state.
  156. // skip other conditions on initial suspended state
  157. if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
  158. // skip init condition if not init suspend
  159. if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
  160. // check change conditions only on step end.
  161. if (wp.change_condition() && !step_end) continue;
  162. // if recheck, ignore the cache results and reanalyze everything.
  163. // if not a recheck, check only unanalyzed tensors
  164. if (!recheck) {
  165. wp_lock_.lock();
  166. bool wp_cache_hit = wp_id_cache[tensor_name].count(wp.id);
  167. wp_lock_.unlock();
  168. if (wp_cache_hit) continue;
  169. }
  170. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
  171. if (!found.empty()) {
  172. *qualified_tensor_name = found;
  173. watchpoints_to_check->push_back(w_table_item.second);
  174. #ifdef OFFLINE_DBG_MODE
  175. if (wp.change_condition()) {
  176. *previous_iter_tensor_needed = true;
  177. }
  178. #endif
  179. }
  180. }
  181. }
  182. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  183. const std::string &tensor_name) {
  184. // add analyzed tensor to cache
  185. if (!recheck) {
  186. wp_lock_.lock();
  187. wp_id_cache[tensor_name].insert(id);
  188. wp_lock_.unlock();
  189. }
  190. }
  191. void DebugServices::CheckWatchpointsForTensor(
  192. partitioned_names *chunk_names, partitioned_names *chunk_slots, partitioned_numbers *chunk_conditions,
  193. partitioned_id *const chunk_watchpoint_id, partitioned_parameters *chunk_parameters,
  194. partitioned_error_code *chunk_error_codes, const std::vector<std::string> &op_overflows,
  195. const std::vector<std::string> &async_file_pool, partitioned_numbers *chunk_exec_orders,
  196. std::vector<std::shared_ptr<TensorData>> *tensor_list, int begin, int end, int chunk_id, const bool init_dbg_suspend,
  197. const bool step_end, const bool recheck, partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
  198. std::vector<uint64_t> *chunk_tensor_byte_size, std::vector<unsigned int> *device_id,
  199. std::vector<unsigned int> *root_graph_id) {
  200. for (int i = begin; i < end; i++) {
  201. auto &tensor = (*tensor_list)[i];
  202. #ifdef OFFLINE_DBG_MODE
  203. // read data in offline mode
  204. std::vector<std::shared_ptr<TensorData>> result_list;
  205. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  206. std::vector<unsigned int>{tensor->GetDeviceId()},
  207. std::vector<unsigned int>{tensor->GetIteration()},
  208. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  209. async_file_pool, &result_list);
  210. tensor = result_list[0];
  211. if (!tensor->GetByteSize()) {
  212. tensor.reset();
  213. continue;
  214. }
  215. #endif
  216. const auto tensor_name = tensor->GetName();
  217. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  218. const auto tensor_slot = std::to_string(tensor->GetSlot());
  219. // no elements to analyze
  220. if (tensor->GetByteSize() == 0) continue;
  221. (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
  222. int tensor_dtype = tensor->GetType();
  223. std::vector<watchpoint_t> watchpoints_to_check;
  224. std::string qualified_tensor_name;
  225. bool previous_iter_tensor_needed = false;
  226. // Add do nothing line in case offline debug is off, prevent unused var warning
  227. (void)previous_iter_tensor_needed;
  228. AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
  229. &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
  230. // no wp set on current tensor
  231. if (watchpoints_to_check.empty()) continue;
  232. uint32_t num_elements = tensor->GetNumElements();
  233. #ifdef OFFLINE_DBG_MODE
  234. void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
  235. #else
  236. void *previous_tensor_ptr =
  237. tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
  238. #endif
  239. std::unique_ptr<ITensorSummary> base_summary_ptr;
  240. if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
  241. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
  242. if (base_summary_ptr != nullptr) {
  243. base_summary_ptr->SummarizeTensor(watchpoints_to_check);
  244. }
  245. }
  246. for (auto &wp : watchpoints_to_check) {
  247. bool is_hit = false;
  248. int error_code = 0;
  249. std::vector<parameter_t> parameter_list = {};
  250. if (wp.condition.type == IS_OVERFLOW) {
  251. is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end());
  252. } else if (base_summary_ptr != nullptr) {
  253. auto item = base_summary_ptr->IsWatchpointHit(wp);
  254. is_hit = std::get<ITensorSummary::eHitPos>(item);
  255. error_code = std::get<ITensorSummary::eErrorCodePos>(item);
  256. parameter_list = std::get<ITensorSummary::eParamListPos>(item);
  257. }
  258. AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
  259. if (is_hit || error_code) {
  260. (*chunk_exec_orders)[chunk_id].push_back(tensor->GetExecutionOrder());
  261. (*chunk_names)[chunk_id].push_back(qualified_tensor_name);
  262. (*chunk_slots)[chunk_id].push_back(tensor_slot);
  263. (*chunk_conditions)[chunk_id].push_back(wp.condition.type);
  264. (*chunk_watchpoint_id)[chunk_id].push_back(wp.id);
  265. if (device_id != nullptr) {
  266. (*chunk_device_id)[chunk_id].push_back(tensor->GetDeviceId());
  267. }
  268. if (root_graph_id != nullptr) {
  269. (*chunk_root_graph_id)[chunk_id].push_back(tensor->GetRootGraphId());
  270. }
  271. (*chunk_parameters)[chunk_id].push_back(parameter_list);
  272. (*chunk_error_codes)[chunk_id].push_back(error_code);
  273. }
  274. }
  275. #ifdef OFFLINE_DBG_MODE
  276. // in offline mode remove the need for the data
  277. tensor.reset();
  278. #endif
  279. }
  280. }
  281. void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
  282. std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
  283. std::vector<std::vector<parameter_t>> *const parameters,
  284. std::vector<int32_t> *const error_codes,
  285. const std::vector<std::string> &op_overflows,
  286. const std::vector<std::string> &async_file_pool,
  287. std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
  288. const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
  289. std::vector<unsigned int> *root_graph_id) {
  290. std::lock_guard<std::mutex> lg(lock_);
  291. auto t1 = std::chrono::high_resolution_clock::now();
  292. if (watchpoint_table.empty()) return;
  293. // vector to store execution order of tensors hit
  294. std::vector<int> exec_order;
  295. int tensor_list_size = tensor_list->size();
  296. uint64_t tensor_list_byte_size = 0;
  297. MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  298. if (tensor_list_size == 0) return;
  299. // default value for number of threads
  300. int max_thread_num = 32;
  301. auto thread_num = getenv("MS_dbg_num_thread");
  302. if (thread_num != nullptr) {
  303. max_thread_num = std::stoi(thread_num);
  304. }
  305. if (max_thread_num > tensor_list_size) {
  306. max_thread_num = tensor_list_size;
  307. }
  308. MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  309. int chunk_size = tensor_list_size / max_thread_num;
  310. int remainder = tensor_list_size % max_thread_num;
  311. partitioned_numbers chunk_exec_orders(max_thread_num);
  312. partitioned_names chunk_names(max_thread_num);
  313. partitioned_names chunk_slots(max_thread_num);
  314. partitioned_numbers chunk_conditions(max_thread_num);
  315. partitioned_id chunk_watchpoint_id(max_thread_num);
  316. partitioned_parameters chunk_parameters(max_thread_num);
  317. partitioned_error_code chunk_error_codes(max_thread_num);
  318. partitioned_id chunk_device_id(max_thread_num);
  319. partitioned_id chunk_root_graph_id(max_thread_num);
  320. std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  321. std::vector<std::future<void>> tensor_future_vec;
  322. int begin = 0;
  323. int end = begin;
  324. for (int i = 0; i < max_thread_num; i++) {
  325. end += chunk_size;
  326. if (remainder > 0) {
  327. end++;
  328. remainder--;
  329. }
  330. tensor_future_vec.push_back(
  331. std::async(std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
  332. &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows,
  333. async_file_pool, &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck,
  334. &chunk_device_id, &chunk_root_graph_id, &chunk_tensor_byte_size, device_id, root_graph_id));
  335. begin = end;
  336. }
  337. for (unsigned int i = 0; i < tensor_future_vec.size(); i++) {
  338. tensor_future_vec[i].wait();
  339. tensor_future_vec[i].get();
  340. for (unsigned int j = 0; j < chunk_exec_orders[i].size(); j++) {
  341. std::vector<int>::iterator iter;
  342. iter = std::lower_bound(exec_order.begin(), exec_order.end(), chunk_exec_orders[i][j]);
  343. // if the execution order is repeated,inserts the new one before the others with same execution order.
  344. int position = iter - exec_order.begin();
  345. exec_order.insert(iter, chunk_exec_orders[i][j]);
  346. name->insert(name->begin() + position, chunk_names[i][j]);
  347. slot->insert(slot->begin() + position, chunk_slots[i][j]);
  348. condition->insert(condition->begin() + position, chunk_conditions[i][j]);
  349. watchpoint_id->insert(watchpoint_id->begin() + position, chunk_watchpoint_id[i][j]);
  350. if (device_id != nullptr) {
  351. device_id->insert(device_id->begin() + position, chunk_device_id[i][j]);
  352. }
  353. if (root_graph_id != nullptr) {
  354. root_graph_id->insert(root_graph_id->begin() + position, chunk_root_graph_id[i][j]);
  355. }
  356. parameters->insert(parameters->begin() + position, chunk_parameters[i][j]);
  357. error_codes->insert(error_codes->begin() + position, chunk_error_codes[i][j]);
  358. }
  359. // free the memory for used vectors
  360. std::vector<int>().swap(chunk_exec_orders[i]);
  361. std::vector<std::string>().swap(chunk_names[i]);
  362. std::vector<std::string>().swap(chunk_slots[i]);
  363. std::vector<int>().swap(chunk_conditions[i]);
  364. std::vector<unsigned int>().swap(chunk_watchpoint_id[i]);
  365. std::vector<std::vector<parameter_t>>().swap(chunk_parameters[i]);
  366. std::vector<int32_t>().swap(chunk_error_codes[i]);
  367. std::vector<unsigned int>().swap(chunk_device_id[i]);
  368. std::vector<unsigned int>().swap(chunk_root_graph_id[i]);
  369. tensor_list_byte_size += chunk_tensor_byte_size[i];
  370. }
  371. auto t2 = std::chrono::high_resolution_clock::now();
  372. std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  373. MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  374. MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
  375. }
  376. #ifdef OFFLINE_DBG_MODE
  377. void DebugServices::ReadTensorFromNpy(const std::string &file_name, std::string *tensor_type, std::size_t *size,
  378. std::vector<int64_t> *shape, std::vector<char> **data_buffer) {
  379. std::ifstream infile;
  380. std::string file_path = file_name;
  381. MS_LOG(INFO) << "Reading in file: " << file_path;
  382. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  383. if (!infile.is_open()) {
  384. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path;
  385. return;
  386. }
  387. uint64_t file_size = infile.tellg();
  388. infile.seekg(0, std::ios::beg);
  389. auto buffer = std::make_unique<std::vector<char>>(file_size);
  390. if (!infile.read(buffer->data(), file_size)) {
  391. MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path;
  392. return;
  393. }
  394. constexpr int header_len_offset = 8;
  395. uint16_t header_len = *reinterpret_cast<uint16_t *>(buffer->data() + header_len_offset);
  396. std::string header(buffer->data() + header_len_offset + 1, header_len);
  397. std::size_t type_i = header.find("descr") + 10;
  398. *tensor_type = header.substr(type_i, 2);
  399. std::size_t shape_i_open = header.find("(");
  400. std::size_t shape_i_close = header.find(")");
  401. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  402. std::string intermediate;
  403. std::stringstream check_shape(shape_str);
  404. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  405. while (getline(check_shape, intermediate, ',')) {
  406. shape->push_back(std::stoi(intermediate));
  407. }
  408. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  409. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  410. std::size_t data_size = data_len * word_size;
  411. infile.seekg(header_len + 10);
  412. *data_buffer = new std::vector<char>(data_size);
  413. if (data_buffer == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  414. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  415. }
  416. *size = data_size;
  417. }
  418. void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
  419. std::vector<std::string> *result_list) {
  420. std::string file_format = "npy";
  421. for (auto const &d : dir_to_files_map) {
  422. std::vector<std::string> files_to_convert_in_dir;
  423. std::string dump_key = d.first;
  424. for (auto const &file_name : d.second) {
  425. bool already_converted = false;
  426. // Remove scope from the file_name for matching files converted by mindinsight tool.
  427. std::size_t found_first_dot = file_name.find(".");
  428. std::size_t found_last_underscore = file_name.find_last_of("_");
  429. std::string file_name_without_scope = file_name;
  430. if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
  431. file_name_without_scope =
  432. file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
  433. }
  434. for (std::string &file_found : *result_list) {
  435. if (file_found.find(file_name_without_scope) != std::string::npos) {
  436. already_converted = true;
  437. }
  438. }
  439. if (!already_converted) {
  440. files_to_convert_in_dir.push_back(dump_key + "/" + file_name);
  441. }
  442. }
  443. std::ostringstream input_file_o;
  444. const char *const delim = " ";
  445. std::copy(files_to_convert_in_dir.begin(), files_to_convert_in_dir.end(),
  446. std::ostream_iterator<std::string>(input_file_o, delim));
  447. std::string input_files = input_file_o.str();
  448. MS_LOG(INFO) << "Ops to convert: " << input_files;
  449. if (input_files != "") {
  450. // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
  451. // later task.
  452. try {
  453. auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
  454. std::string convert_pkg_path = pkg.attr("__file__").cast<std::string>();
  455. MS_LOG(INFO) << "The file for converting async dump data is in " << convert_pkg_path;
  456. std::string convert_command = "python " + convert_pkg_path + " -out " + dump_key + " -t " + file_format +
  457. " -d " + dump_key + " -f NCHW -l " + input_files;
  458. (void)(system(convert_command.c_str()) + 1);
  459. } catch (pybind11::error_already_set &e) {
  460. MS_LOG(EXCEPTION) << "Can't find package mindspore.offline_debug.convert_async";
  461. }
  462. DIR *d_handle = opendir(dump_key.c_str());
  463. if (d_handle != nullptr) {
  464. struct dirent *dir = nullptr;
  465. while ((dir = readdir(d_handle)) != NULL) {
  466. if (dir->d_type == DT_REG) {
  467. std::string candidate = dir->d_name;
  468. for (const std::string &file_to_find : files_to_convert_in_dir) {
  469. std::string file_n = file_to_find.substr(file_to_find.find_last_of("\\/") + 1);
  470. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  471. // we found a converted file for this op
  472. std::string found_file = dump_key + "/" + candidate;
  473. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  474. result_list->push_back(found_file);
  475. }
  476. }
  477. }
  478. }
  479. }
  480. }
  481. }
  482. }
  483. }
  484. void GetNodeNameWithoutScope(std::string *dump_style_name) {
  485. if (dump_style_name == nullptr) {
  486. return;
  487. }
  488. std::string node_name_without_scope = *dump_style_name;
  489. std::size_t last_scope_marker;
  490. std::string delim = "/";
  491. last_scope_marker = node_name_without_scope.rfind(delim);
  492. if (last_scope_marker != std::string::npos) {
  493. node_name_without_scope = node_name_without_scope.substr(last_scope_marker + delim.size());
  494. }
  495. *dump_style_name = node_name_without_scope;
  496. }
  497. void ReplaceSrcFileName(std::string *dump_style_name) {
  498. if (dump_style_name == nullptr) {
  499. return;
  500. }
  501. const std::string strsrc = "/";
  502. std::string strdst = "_";
  503. std::string::size_type pos = 0;
  504. std::string::size_type srclen = strsrc.size();
  505. std::string::size_type dstlen = strdst.size();
  506. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  507. dump_style_name->replace(pos, srclen, strdst);
  508. pos += dstlen;
  509. }
  510. }
  511. void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
  512. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  513. std::vector<unsigned int> root_graph_id, std::vector<std::string> *result_list) {
  514. std::string file_format = "npy";
  515. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  516. for (unsigned int i = 0; i < backend_name.size(); i++) {
  517. // form prefix of the tensor file to read from graph pb node name
  518. std::string dump_style_kernel_name = backend_name[i];
  519. // remove slot from name
  520. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  521. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  522. std::string prefix_dump_file_name = dump_style_kernel_name;
  523. GetNodeNameWithoutScope(&prefix_dump_file_name);
  524. std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
  525. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  526. // search files in dir for the one that meets the filename prefix and read the file into memory
  527. DIR *d;
  528. d = opendir(specific_dump_dir.c_str());
  529. if (d != nullptr) {
  530. struct dirent *dir = nullptr;
  531. while ((dir = readdir(d)) != NULL) {
  532. if (dir->d_type == DT_REG) {
  533. std::string file_name = dir->d_name;
  534. std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
  535. if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos &&
  536. file_name.rfind(file_format) == std::string::npos) {
  537. // if file matches prefix and is in device format add to candidate files to convert.
  538. dir_to_files_map[specific_dump_dir].push_back(file_name);
  539. } else if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos &&
  540. file_name.rfind(file_format) != std::string::npos) {
  541. // otherwise, if file matches prefix and already has been converted to host format
  542. // add to result of converted files.
  543. std::string found_file = specific_dump_dir + "/" + file_name;
  544. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  545. result_list->push_back(found_file);
  546. }
  547. }
  548. }
  549. }
  550. }
  551. closedir(d);
  552. }
  553. ConvertToHostFormat(dir_to_files_map, result_list);
  554. }
  555. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  556. const std::string &specific_dump_dir,
  557. std::vector<std::string> *result_list) {
  558. std::string file_format = "npy";
  559. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  560. for (const auto &node : proto_dump) {
  561. std::string dump_name = std::get<1>(node);
  562. dump_name = dump_name.substr(0, dump_name.rfind("."));
  563. // search files in dir for the one that meets the filename prefix and read the file into memory
  564. DIR *d;
  565. d = opendir(specific_dump_dir.c_str());
  566. if (d != nullptr) {
  567. struct dirent *dir = nullptr;
  568. while ((dir = readdir(d)) != NULL) {
  569. if (dir->d_type == DT_REG) {
  570. std::string file_name = dir->d_name;
  571. std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
  572. if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos &&
  573. file_name.rfind(file_format) == std::string::npos) {
  574. // if file matches prefix and is in device format add to candidate files to convert.
  575. dir_to_files_map[specific_dump_dir].push_back(file_name);
  576. } else if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos &&
  577. file_name.rfind(file_format) != std::string::npos) {
  578. // otherwise, if file matches prefix and already has been converted to host format
  579. // add to result of converted files.
  580. std::string found_file = specific_dump_dir + "/" + file_name;
  581. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  582. result_list->push_back(found_file);
  583. }
  584. }
  585. }
  586. }
  587. }
  588. closedir(d);
  589. }
  590. ConvertToHostFormat(dir_to_files_map, result_list);
  591. }
  592. void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  593. const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
  594. uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
  595. std::vector<std::shared_ptr<TensorData>> *tensor_list) {
  596. for (auto &node : proto_dump) {
  597. std::vector<size_t> slot_list;
  598. std::string dump_style_name = std::get<1>(node);
  599. // Get dump_name and output_str from the second element of tuple
  600. std::size_t found_dot = dump_style_name.rfind(".");
  601. std::string dump_name = dump_style_name.substr(0, found_dot);
  602. std::string output_str = dump_style_name.substr(found_dot + 1);
  603. bool output_flag = (output_str == "output");
  604. for (const std::string &file_name : async_file_pool) {
  605. std::size_t found = file_name.find(dump_name);
  606. std::size_t found_out = file_name.find(output_str);
  607. std::size_t found_dot_start = file_name.find(".", found_out);
  608. std::size_t found_dot_end = file_name.find(".", found_dot_start);
  609. if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
  610. found_out != std::string::npos) {
  611. slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
  612. }
  613. }
  614. for (auto slot : slot_list) {
  615. // add a TensorData entry (data will be read when needed)
  616. std::vector<int64_t> shape;
  617. std::string orig_name = std::get<0>(node);
  618. auto tensor_data = std::make_shared<TensorData>();
  619. tensor_data->SetName(orig_name);
  620. tensor_data->SetExecutionOrder(0);
  621. tensor_data->SetSlot(slot);
  622. tensor_data->SetIteration(iteration);
  623. tensor_data->SetDeviceId(device_id);
  624. tensor_data->SetRootGraphId(root_graph_id);
  625. tensor_data->SetDataPtr(NULL);
  626. tensor_data->SetByteSize(0);
  627. tensor_data->SetType("");
  628. tensor_data->SetShape(shape);
  629. tensor_data->SetIsOutput(output_flag);
  630. tensor_list->push_back(tensor_data);
  631. }
  632. }
  633. }
  634. void DebugServices::AddToTensorData(const std::string &backend_name, const std::size_t slot,
  635. const unsigned int iteration, const unsigned int device_id,
  636. const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
  637. const std::string &type_name, const std::vector<int64_t> &shape,
  638. std::vector<char> *buffer, std::vector<std::shared_ptr<TensorData>> *result_list) {
  639. // call LoadNewTensor to store tensor in internal cache
  640. auto tensor_data = std::make_shared<TensorData>();
  641. tensor_data->SetName(backend_name);
  642. tensor_data->SetExecutionOrder(0);
  643. tensor_data->SetSlot(slot);
  644. tensor_data->SetIteration(iteration);
  645. tensor_data->SetDeviceId(device_id);
  646. tensor_data->SetRootGraphId(root_graph_id);
  647. tensor_data->SetIsOutput(is_output);
  648. if (data_size) {
  649. tensor_data->SetDataPtr(buffer->data());
  650. } else {
  651. tensor_data->SetDataPtr(NULL);
  652. }
  653. tensor_data->SetByteSize(data_size);
  654. tensor_data->SetType(type_name);
  655. tensor_data->SetShape(shape);
  656. if (data_size) {
  657. tensor_loader_->LoadNewTensor(tensor_data, false);
  658. }
  659. // add to result_list
  660. result_list->push_back(tensor_data);
  661. }
  662. void DebugServices::SetPrefixToCheck(std::string *prefix_dump_file_name, std::string *slot_string_to_check,
  663. std::string *dump_style_kernel_name, size_t slot, bool is_output) {
  664. std::string dump_style_name_part = *dump_style_kernel_name;
  665. GetNodeNameWithoutScope(&dump_style_name_part);
  666. std::string slot_str;
  667. if (is_output) {
  668. slot_str = ".output." + std::to_string(slot);
  669. } else {
  670. slot_str = ".input." + std::to_string(slot);
  671. }
  672. dump_style_name_part += slot_str;
  673. *prefix_dump_file_name = dump_style_name_part;
  674. *slot_string_to_check = slot_str;
  675. }
  676. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  677. // get file with the newest timestamp from the list.
  678. std::string newest_file;
  679. if (file_list.empty()) {
  680. return newest_file;
  681. }
  682. std::sort(file_list.begin(), file_list.end());
  683. return file_list.back();
  684. }
  685. void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
  686. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  687. std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
  688. const std::vector<std::string> &async_file_pool,
  689. std::vector<std::shared_ptr<TensorData>> *result_list) {
  690. for (unsigned int i = 0; i < backend_name.size(); i++) {
  691. // form prefix of the tensor file to read from graph pb node name
  692. std::string dump_style_kernel_name = backend_name[i];
  693. // remove slot from name
  694. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  695. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  696. std::string slot_string_to_check;
  697. std::string prefix_dump_file_name;
  698. SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
  699. std::string prefix_dump_to_check = dump_style_kernel_name;
  700. GetNodeNameWithoutScope(&prefix_dump_to_check);
  701. std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id[i]) + "/" + net_name + "/" +
  702. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  703. // search files in dir for the one that meets the filename prefix and read the file into memory
  704. std::vector<char> *buffer = NULL;
  705. std::string type_name = "";
  706. std::vector<int64_t> shape;
  707. uint64_t data_size = 0;
  708. if (is_sync_mode) {
  709. DIR *d;
  710. d = opendir(specific_dump_dir.c_str());
  711. bool found_file = false;
  712. std::vector<std::string> matched_paths;
  713. if (d != nullptr) {
  714. struct dirent *dir = nullptr;
  715. while ((dir = readdir(d)) != NULL) {
  716. if (dir->d_type == DT_REG) {
  717. std::string file_name = dir->d_name;
  718. std::string stripped_file_name = GetStrippedFilename(file_name);
  719. if (stripped_file_name.empty()) {
  720. continue;
  721. }
  722. std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
  723. if (found != 0) {
  724. continue;
  725. }
  726. std::string full_path = specific_dump_dir + "/" + file_name;
  727. matched_paths.push_back(full_path);
  728. found_file = true;
  729. }
  730. }
  731. } else {
  732. MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  733. }
  734. if (found_file) {
  735. shape.clear();
  736. std::string result_path = GetNewestFilePath(matched_paths);
  737. ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
  738. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], data_size,
  739. type_name, shape, buffer, result_list);
  740. } else {
  741. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
  742. type_name, shape, buffer, result_list);
  743. MS_LOG(INFO) << "Target tensor has not been found.";
  744. }
  745. closedir(d);
  746. } else {
  747. bool found = false;
  748. std::vector<std::string> matched_paths;
  749. // if async mode
  750. for (const std::string &file_path : async_file_pool) {
  751. if (file_path.find(specific_dump_dir) != std::string::npos &&
  752. file_path.find(prefix_dump_to_check) != std::string::npos &&
  753. file_path.find(slot_string_to_check) != std::string::npos) {
  754. matched_paths.push_back(file_path);
  755. found = true;
  756. }
  757. }
  758. if (found) {
  759. shape.clear();
  760. std::string result_path = GetNewestFilePath(matched_paths);
  761. ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
  762. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], data_size,
  763. type_name, shape, buffer, result_list);
  764. } else {
  765. // If no npy file is found, add empty tensor data.
  766. AddToTensorData(backend_name[i], slot[i], iteration[i], device_id[i], root_graph_id[i], is_output[i], 0,
  767. type_name, shape, buffer, result_list);
  768. MS_LOG(INFO) << "Target tensor has not been found.";
  769. }
  770. }
  771. }
  772. }
// Strips the op_type prefix and the task_id/stream_id/timestamp fields out of a
// sync-dump file name so the remainder can be prefix-compared against
// "<node>.<input|output>.<slot>". Returns an empty string when the name does not
// contain enough dot-separated fields.
// NOTE(review): despite the ordinal names, the dots are located by counting from
// the BACK of the file name, because the node-name portion may itself contain dots.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // second dot from the back of the name
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  // fourth dot from the back of the name
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  // presumably the node name (between the op_type prefix and the task_id field) —
  // TODO confirm against the dump file naming scheme
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  // the tail fields between the 4th- and 2nd-from-back dots, kept with leading '.'
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
  792. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  793. unsigned int iteration, std::vector<std::string> *async_file_pool) {
  794. // get a list of nodes and the devices they are on to monitor
  795. std::vector<std::shared_ptr<TensorData>> tensor_list;
  796. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
  797. for (auto w_table_item : watchpoint_table) {
  798. auto wp = std::get<1>(w_table_item);
  799. unsigned int index = 0;
  800. for (auto check_node : wp.check_node_list) {
  801. std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
  802. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  803. for (auto device : devices) {
  804. for (auto graph : graphs) {
  805. std::tuple<uint32_t, uint32_t> key(device, graph);
  806. device_and_graph_to_nodes[key].push_back(check_node);
  807. }
  808. }
  809. index++;
  810. }
  811. }
  812. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  813. // as they are found
  814. for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
  815. std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
  816. uint32_t device_id = std::get<0>(device_and_graph);
  817. uint32_t root_graph_id = std::get<1>(device_and_graph);
  818. std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
  819. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  820. std::string specific_dump_dir = dump_dir + "/rank_" + std::to_string(device_id) + "/" + net_name + "/" +
  821. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  822. // convert node names to dump style
  823. for (auto node : wp_nodes) {
  824. std::string orig_name = std::get<0>(node);
  825. std::string dump_style_name = orig_name;
  826. // Remove the scope from the fully qualified name to compare for both sync and async case.
  827. GetNodeNameWithoutScope(&dump_style_name);
  828. bool node_is_out = std::get<1>(node);
  829. if (node_is_out) {
  830. dump_style_name += ".output";
  831. } else {
  832. dump_style_name += ".input";
  833. }
  834. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  835. }
  836. if (!is_sync_mode) {
  837. // convert all files in proto_to_dump to npy and add to pool of async file names
  838. ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
  839. }
  840. if (is_sync_mode) {
  841. // search files in dir for the one that meets the filename prefix and read the file into memory
  842. DIR *d;
  843. d = opendir(specific_dump_dir.c_str());
  844. if (d != nullptr) {
  845. struct dirent *dir = nullptr;
  846. while ((dir = readdir(d)) != NULL) {
  847. if (dir->d_type == DT_REG) {
  848. std::string file_name = dir->d_name;
  849. for (auto &node : proto_to_dump) {
  850. std::string dump_name = std::get<1>(node);
  851. std::string stripped_file_name = GetStrippedFilename(file_name);
  852. if (stripped_file_name.empty()) {
  853. continue;
  854. }
  855. std::size_t found = stripped_file_name.rfind(dump_name, 0);
  856. if (found == 0) {
  857. size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
  858. std::vector<int64_t> shape;
  859. std::string orig_name = std::get<0>(node);
  860. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  861. bool output_flag = (output_str == "output");
  862. AddToTensorData(orig_name, slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, NULL,
  863. &tensor_list);
  864. break;
  865. }
  866. }
  867. }
  868. }
  869. }
  870. } else {
  871. GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
  872. &tensor_list);
  873. }
  874. }
  875. return tensor_list;
  876. }
  877. std::string DebugServices::IterationString(unsigned int iteration) {
  878. std::string iteration_string;
  879. bool init_dbg_suspend = (iteration == UINT_MAX);
  880. if (init_dbg_suspend) {
  881. iteration_string = "init";
  882. } else {
  883. iteration_string = std::to_string(iteration);
  884. }
  885. return iteration_string;
  886. }
  887. #endif
  888. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  889. std::vector<char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  890. std::vector<unsigned int> *const dtype,
  891. std::vector<std::vector<int64_t>> *const shape) {
  892. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  893. tensor_loader_->SearchTensors(name, &result_list);
  894. for (auto result : result_list) {
  895. if (!std::get<1>(result)) {
  896. continue;
  897. }
  898. ret_name->push_back(std::get<0>(result));
  899. data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
  900. data_size->push_back(std::get<1>(result)->GetByteSize());
  901. dtype->push_back(std::get<1>(result)->GetType());
  902. shape->push_back(std::get<1>(result)->GetShape());
  903. }
  904. }
  905. #ifdef ONLINE_DBG_MODE
  906. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  907. bool ret = false;
  908. for (auto w_table_item : watchpoint_table) {
  909. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  910. for (auto check_node : check_node_list) {
  911. std::string w_name = std::get<0>(check_node);
  912. bool w_type = std::get<1>(check_node);
  913. if ((w_type == true &&
  914. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  915. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  916. ret = true;
  917. return ret;
  918. }
  919. }
  920. }
  921. return ret;
  922. }
  923. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  924. if (kernel) {
  925. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  926. for (size_t j = 0; j < input_size; ++j) {
  927. auto input_kernel = kernel->input(j + 1);
  928. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  929. auto found = w_name.find_last_of('/');
  930. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  931. return true;
  932. }
  933. return false;
  934. } else {
  935. return false;
  936. }
  937. }
  938. #endif
// Removes every cached tensor from the tensor loader.
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
// Returns all tensors currently cached by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Returns the cached tensors recorded under the given node name.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensorMap(const std::string &node_name) const {
  return tensor_loader_->GetNodeTensorMap(node_name);
}
// Get/set the tensor loader's iteration counter.
uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }
void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }
// Clear the previous-iteration / current-iteration tensor maps respectively.
void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  948. #ifdef ONLINE_DBG_MODE
// Thin forwarding wrapper: delegates tensor-to-file dumping entirely to the tensor
// loader, passing all formatting/type parameters through unchanged. Returns the
// loader's success flag.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  956. #endif
// Stores a new tensor in the loader cache; keep_prev controls whether the previous
// iteration's copy of the same tensor is retained. Returns the loader's result.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
// Returns a copy of the watchpoint table (watchpoint id -> watchpoint record).
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table;
}
// Resets cached tensor state between runs: clears the fired-watchpoint id cache,
// then cycles the loader's current/previous maps so that only parameters survive.
// NOTE(review): the three loader calls are order-dependent — parameters must be
// moved to "prev" before the current map is emptied and the maps are swapped.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
}
  971. #ifdef ONLINE_DBG_MODE
  972. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  973. MS_EXCEPTION_IF_NULL(kernel);
  974. std::vector<std::shared_ptr<TensorData>> result;
  975. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  976. auto kernel_name = GetKernelNodeName(kernel);
  977. for (size_t j = 0; j < output_size; ++j) {
  978. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  979. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  980. if (tensor) result.push_back(tensor);
  981. }
  982. return result;
  983. }
  984. #endif
// Returns true if a tensor with this name exists in the current-iteration cache.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Moves one tensor from the current-iteration map to the previous-iteration map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Plain accessors for the dump configuration: network name, dump root directory,
// and sync-vs-async dump mode.
void DebugServices::SetNetName(std::string net_name) { this->net_name = net_name; }
std::string DebugServices::GetNetName() { return net_name; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode; }
  997. #ifdef ONLINE_DBG_MODE
  998. } // namespace mindspore
  999. #endif