You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 62 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include "pybind11/embed.h"
  29. #ifdef ONLINE_DBG_MODE
  30. #include "debug/common.h"
  31. #include "debug/debugger/debugger.h"
  32. #include "debug/anf_ir_utils.h"
  33. #include "backend/session/anf_runtime_algorithm.h"
  34. #endif
  35. #include "debug/debugger/tensor_summary.h"
  36. #ifdef ONLINE_DBG_MODE
  37. namespace mindspore {
  38. #endif
  39. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
// Copy constructor: replicates the watchpoint configuration and dump settings.
// The tensor loader is shared (shared_ptr copy), not deep-copied.
// NOTE(review): this copies more members (wp_id_cache_, net_name_, dump_dir_,
// is_sync_mode_) than operator= below does — confirm the asymmetry is intentional.
DebugServices::DebugServices(const DebugServices &other) {
  wp_id_cache_ = other.wp_id_cache_;
  net_name_ = other.net_name_;
  dump_dir_ = other.dump_dir_;
  is_sync_mode_ = other.is_sync_mode_;
  tensor_loader_ = other.tensor_loader_;  // both instances now share one loader
  watchpoint_table_ = other.watchpoint_table_;
}
// Copy assignment: shares the other instance's tensor loader and copies its
// watchpoint table. Other members (net_name_, dump_dir_, ...) are deliberately
// left untouched — NOTE(review): confirm this narrower copy vs the copy ctor.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {  // self-assignment guard
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
  55. void DebugServices::AddWatchpoint(
  56. unsigned int id, unsigned int watch_condition, float parameter,
  57. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  58. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  59. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  60. std::lock_guard<std::mutex> lg(lock_);
  61. watchpoint_t watchpoint_item;
  62. watchpoint_item.id = id;
  63. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  64. watchpoint_item.condition.parameter = parameter;
  65. watchpoint_item.check_node_list = check_node_list;
  66. if (check_node_device_list != nullptr) {
  67. watchpoint_item.check_node_device_list = *check_node_device_list;
  68. }
  69. if (check_node_graph_list != nullptr) {
  70. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  71. }
  72. watchpoint_item.parameter_list = parameter_list;
  73. watchpoint_table_[id] = watchpoint_item;
  74. }
  75. void DebugServices::RemoveWatchpoint(unsigned int id) {
  76. std::lock_guard<std::mutex> lg(lock_);
  77. watchpoint_table_.erase(id);
  78. }
// Builds a TensorSummary specialized on the tensor's element type so that
// watchpoint conditions can be evaluated over `num_elements` elements of
// `tensor`, optionally comparing against `previous_tensor_ptr` (the same
// tensor's data from the previous iteration; may be nullptr).
// Returns nullptr (an empty unique_ptr) for unsupported dtypes.
std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
                                              void *const previous_tensor_ptr, uint32_t num_elements,
                                              int tensor_dtype) {
  switch (tensor_dtype) {
    case DbgDataType::DT_UINT8: {
      return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT8: {
      return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT16: {
      return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT16: {
      return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT32: {
      return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    // DT_BASE_INT is treated as 32-bit signed, same as DT_INT32.
    case DbgDataType::DT_INT32:
    case DbgDataType::DT_BASE_INT: {
      return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_UINT64: {
      return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_INT64: {
      return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT16: {
      return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    // DT_BASE_FLOAT is treated as 32-bit float, same as DT_FLOAT32.
    case DbgDataType::DT_FLOAT32:
    case DbgDataType::DT_BASE_FLOAT: {
      return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_FLOAT64: {
      return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    case DbgDataType::DT_BOOL: {
      return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements);
    }
    default:
      MS_LOG(INFO) << "Unsupported tensor type";
      // return a null pointer
      return std::unique_ptr<TensorSummary<int32_t>>{};
  }
}
  127. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  128. if (tensor == nullptr) {
  129. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  130. TensorStat empty_tensor_stat_data;
  131. return empty_tensor_stat_data;
  132. }
  133. std::unique_ptr<ITensorSummary> base_summary_ptr;
  134. void *previous_tensor_ptr = nullptr;
  135. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), tensor->GetType());
  136. if (base_summary_ptr == nullptr) {
  137. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  138. TensorStat empty_tensor_stat_data;
  139. return empty_tensor_stat_data;
  140. }
  141. base_summary_ptr->TensorStatistics(tensor->GetType());
  142. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  143. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  144. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  145. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  146. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  147. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  148. return tensor_stat_data;
  149. }
  150. #ifdef OFFLINE_DBG_MODE
  151. void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed) {
  152. void *previous_tensor_ptr = nullptr;
  153. std::shared_ptr<TensorData> tensor_prev;
  154. if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
  155. // read data in offline mode
  156. std::vector<std::string> file_paths;
  157. if (!is_sync_mode_) {
  158. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  159. std::vector<unsigned int>{tensor->GetDeviceId()},
  160. std::vector<unsigned int>{tensor->GetIteration() - 1},
  161. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  162. }
  163. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  164. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  165. std::vector<unsigned int>{tensor->GetDeviceId()},
  166. std::vector<unsigned int>{tensor->GetIteration() - 1},
  167. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  168. file_paths, &result_list_prev);
  169. tensor_prev = result_list_prev[0];
  170. if (!tensor_prev->GetByteSize()) {
  171. tensor_prev.reset();
  172. } else {
  173. previous_tensor_ptr = tensor_prev->GetDataPtr();
  174. }
  175. }
  176. return previous_tensor_ptr;
  177. }
  178. #endif
  179. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  180. const std::string &tensor_name, const std::string &tensor_name_no_slot,
  181. bool *previous_iter_tensor_needed, std::string *const qualified_tensor_name,
  182. std::vector<watchpoint_t> *const watchpoints_to_check) {
  183. for (auto w_table_item : watchpoint_table_) {
  184. auto wp = std::get<1>(w_table_item);
  185. // check ONLY init conditions on initial suspended state.
  186. // skip other conditions on initial suspended state
  187. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  188. continue;
  189. }
  190. // skip init condition if not init suspend
  191. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  192. continue;
  193. }
  194. // check change conditions only on step end.
  195. if (wp.change_condition() && !step_end) {
  196. continue;
  197. }
  198. // if recheck, ignore the cache results and reanalyze everything.
  199. // if not a recheck, check only unanalyzed tensors
  200. if (!recheck) {
  201. wp_lock_.lock();
  202. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  203. wp_lock_.unlock();
  204. if (wp_cache_hit) {
  205. continue;
  206. }
  207. }
  208. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
  209. if (!found.empty()) {
  210. *qualified_tensor_name = found;
  211. watchpoints_to_check->push_back(w_table_item.second);
  212. #ifdef OFFLINE_DBG_MODE
  213. if (wp.change_condition()) {
  214. *previous_iter_tensor_needed = true;
  215. }
  216. #endif
  217. }
  218. }
  219. }
  220. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  221. const std::string &tensor_name) {
  222. // add analyzed tensor to cache
  223. if (!recheck) {
  224. wp_lock_.lock();
  225. wp_id_cache_[tensor_name].insert(id);
  226. wp_lock_.unlock();
  227. }
  228. }
// Worker body for CheckWatchpoints: scans tensors [begin, end) of *tensor_list
// against the registered watchpoints and appends every hit into the chunk_*
// output vectors at index chunk_id, so that parallel workers never write to
// the same slot. In offline mode the tensor data is (re)loaded from dump files
// first and released again after analysis.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *chunk_names, partitioned_names *chunk_slots, partitioned_numbers *chunk_conditions,
  partitioned_id *const chunk_watchpoint_id, partitioned_parameters *chunk_parameters,
  partitioned_error_code *chunk_error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, partitioned_numbers *chunk_exec_orders,
  std::vector<std::shared_ptr<TensorData>> *tensor_list, int begin, int end, int chunk_id, const bool init_dbg_suspend,
  const bool step_end, const bool recheck, partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
  std::vector<uint64_t> *chunk_tensor_byte_size, std::vector<unsigned int> *device_id,
  std::vector<unsigned int> *root_graph_id) {
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list);
    // NOTE(review): assumes ReadDumpedTensor always yields at least one entry
    // for a one-element request — confirm, otherwise this indexing is unsafe.
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // empty read: drop this tensor and move on
      tensor.reset();
      continue;
    }
#endif
    const auto tensor_name = tensor->GetName();
    // strip the ":slot" suffix to get the node name used for watchpoint matching
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    // Add do nothing line in case offline debug is off, prevent unused var warning
    (void)previous_iter_tensor_needed;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
                          &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
    uint32_t num_elements = tensor->GetNumElements();
#ifdef OFFLINE_DBG_MODE
    void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
#else
    void *previous_tensor_ptr =
      tensor_loader_->GetPrevTensor(tensor_name) ? tensor_loader_->GetPrevTensor(tensor_name)->GetDataPtr() : nullptr;
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // A lone overflow watchpoint does not need element statistics, so the
    // summary pass is skipped in that case.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      // record both real hits and analysis errors for this (tensor, wp) pair
      if (is_hit || error_code) {
        (*chunk_exec_orders)[chunk_id].push_back(tensor->GetExecutionOrder());
        (*chunk_names)[chunk_id].push_back(qualified_tensor_name);
        (*chunk_slots)[chunk_id].push_back(tensor_slot);
        (*chunk_conditions)[chunk_id].push_back(wp.condition.type);
        (*chunk_watchpoint_id)[chunk_id].push_back(wp.id);
        if (device_id != nullptr) {
          (*chunk_device_id)[chunk_id].push_back(tensor->GetDeviceId());
        }
        if (root_graph_id != nullptr) {
          (*chunk_root_graph_id)[chunk_id].push_back(tensor->GetRootGraphId());
        }
        (*chunk_parameters)[chunk_id].push_back(parameter_list);
        (*chunk_error_codes)[chunk_id].push_back(error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Evaluates every registered watchpoint against *tensor_list, fanning the work
// out over up to 32 async tasks (CheckWatchpointsForTensor) and then merging
// the per-chunk hit lists back into the output vectors, ordered by tensor
// execution order. All output vectors stay index-aligned with each other.
void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
                                     std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
                                     std::vector<std::vector<parameter_t>> *const parameters,
                                     std::vector<int32_t> *const error_codes,
                                     const std::vector<std::string> &op_overflows,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
                                     const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
                                     std::vector<unsigned int> *root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  if (watchpoint_table_.empty()) return;
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size == 0) return;
  // default value for number of threads
  const int default_thread_num = 32;
  int max_thread_num = default_thread_num;
  // never spawn more workers than there are tensors
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Split tensor_list into max_thread_num contiguous chunks; the first
  // `remainder` chunks get one extra element.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // One result slot per worker so the workers never contend on these.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    tensor_future_vec.push_back(
      std::async(std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
                 &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows,
                 async_file_pool, &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck,
                 &chunk_device_id, &chunk_root_graph_id, &chunk_tensor_byte_size, device_id, root_graph_id));
    begin = end;
  }
  // Merge: wait for each worker in order, then insertion-sort its hits into the
  // shared outputs keyed by execution order.
  for (unsigned int i = 0; i < tensor_future_vec.size(); i++) {
    tensor_future_vec[i].wait();
    tensor_future_vec[i].get();
    for (unsigned int j = 0; j < chunk_exec_orders[i].size(); j++) {
      std::vector<int>::iterator iter = std::lower_bound(exec_order.begin(), exec_order.end(), chunk_exec_orders[i][j]);
      // if the execution order is repeated,inserts the new one before the others with same execution order.
      int position = iter - exec_order.begin();
      exec_order.insert(iter, chunk_exec_orders[i][j]);
      // Keep every output vector index-aligned with exec_order.
      name->insert(name->begin() + position, chunk_names[i][j]);
      slot->insert(slot->begin() + position, chunk_slots[i][j]);
      condition->insert(condition->begin() + position, chunk_conditions[i][j]);
      watchpoint_id->insert(watchpoint_id->begin() + position, chunk_watchpoint_id[i][j]);
      if (device_id != nullptr) {
        device_id->insert(device_id->begin() + position, chunk_device_id[i][j]);
      }
      if (root_graph_id != nullptr) {
        root_graph_id->insert(root_graph_id->begin() + position, chunk_root_graph_id[i][j]);
      }
      parameters->insert(parameters->begin() + position, chunk_parameters[i][j]);
      error_codes->insert(error_codes->begin() + position, chunk_error_codes[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap(chunk_exec_orders[i]);
    std::vector<std::string>().swap(chunk_names[i]);
    std::vector<std::string>().swap(chunk_slots[i]);
    std::vector<int>().swap(chunk_conditions[i]);
    std::vector<unsigned int>().swap(chunk_watchpoint_id[i]);
    std::vector<std::vector<parameter_t>>().swap(chunk_parameters[i]);
    std::vector<int32_t>().swap(chunk_error_codes[i]);
    std::vector<unsigned int>().swap(chunk_device_id[i]);
    std::vector<unsigned int>().swap(chunk_root_graph_id[i]);
    tensor_list_byte_size += chunk_tensor_byte_size[i];
  }
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
  415. #ifdef OFFLINE_DBG_MODE
  416. void DebugServices::ReadTensorFromNpy(const std::string &file_name, std::string *tensor_type, std::size_t *size,
  417. std::vector<int64_t> *shape, std::vector<char> **data_buffer) {
  418. std::ifstream infile;
  419. std::string file_path = file_name;
  420. MS_LOG(INFO) << "Reading in file: " << file_path;
  421. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  422. if (!infile.is_open()) {
  423. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno
  424. << " ErrInfo:" << strerror(errno);
  425. return;
  426. }
  427. uint64_t file_size = infile.tellg();
  428. infile.seekg(0, std::ios::beg);
  429. auto buffer = std::make_unique<std::vector<char>>(file_size);
  430. if (!infile.read(buffer->data(), file_size)) {
  431. MS_LOG(ERROR) << "Failed to read file (In ReadTensorFromNpy) " << file_path;
  432. return;
  433. }
  434. const int substr_len = 2;
  435. const int header_len_offset = 8;
  436. const int header_offset = 9;
  437. const int type_offset = 10;
  438. uint16_t header_len = *reinterpret_cast<uint16_t *>(buffer->data() + header_len_offset);
  439. std::string header(buffer->data() + header_offset, header_len);
  440. std::size_t type_i = header.find("descr") + type_offset;
  441. if (header.length() < type_i + substr_len) {
  442. MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
  443. return;
  444. }
  445. *tensor_type = header.substr(type_i, substr_len);
  446. std::size_t shape_i_open = header.find("(");
  447. std::size_t shape_i_close = header.find(")");
  448. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  449. std::string intermediate;
  450. std::stringstream check_shape(shape_str);
  451. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  452. while (getline(check_shape, intermediate, ',')) {
  453. shape->push_back(std::stoi(intermediate));
  454. }
  455. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  456. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  457. std::size_t data_size = data_len * word_size;
  458. infile.seekg(header_len + type_offset);
  459. *data_buffer = new std::vector<char>(data_size);
  460. if (data_buffer == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  461. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  462. }
  463. *size = data_size;
  464. }
// Converts raw async-dump files to host-readable npy format by shelling out to
// the mindspore.offline_debug.convert_async tool, one invocation per dump
// directory, then collects the resulting npy file paths into *result_list.
// Files whose converted counterpart is already present in *result_list are
// skipped.
// NOTE(review): the conversion command is built from directory/file names and
// run via system(); if any of these paths can be attacker-controlled this is a
// shell-injection risk — confirm inputs are trusted.
void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                                        std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &file_name : d.second) {
      bool already_converted = false;
      // Remove scope from the file_name for matching files converted by mindinsight tool.
      std::size_t found_first_dot = file_name.find(".");
      std::size_t found_last_underscore = file_name.find_last_of("_");
      std::string file_name_without_scope = file_name;
      if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
        file_name_without_scope =
          file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
      }
      // Skip files that already have a converted counterpart in the result list.
      for (std::string &file_found : *result_list) {
        if (file_found.find(file_name_without_scope) != std::string::npos) {
          already_converted = true;
        }
      }
      if (!already_converted) {
        files_to_convert_in_dir.push_back(dump_key + "/" + file_name);
      }
    }
    // Build a single space-separated argument list for the converter.
    std::ostringstream input_file_o;
    const char *const delim = " ";
    std::copy(files_to_convert_in_dir.begin(), files_to_convert_in_dir.end(),
              std::ostream_iterator<std::string>(input_file_o, delim));
    std::string input_files = input_file_o.str();
    MS_LOG(INFO) << "Ops to convert: " << input_files;
    if (input_files != "") {
      // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
      // later task.
      try {
        auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
        std::string convert_pkg_path = pkg.attr("__file__").cast<std::string>();
        MS_LOG(INFO) << "The file for converting async dump data is in " << convert_pkg_path;
        std::string convert_command = "python " + convert_pkg_path + " -out " + dump_key + " -t " + file_format +
                                      " -d " + dump_key + " -f NCHW -l " + input_files;
        // NOTE(review): system()'s exit status is deliberately discarded here;
        // failures are only detected later by the missing-output scan below.
        (void)(system(convert_command.c_str()) + 1);
      } catch (pybind11::error_already_set &e) {
        MS_LOG(EXCEPTION) << "Can't find package mindspore.offline_debug.convert_async";
      }
      // Scan the dump directory for the freshly converted npy files and record
      // any that are not yet in the result list.
      std::string abspath = RealPath(dump_key);
      DIR *d_handle = opendir(abspath.c_str());
      if (d_handle == nullptr) {
        MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
        return;
      }
      struct dirent *dir = nullptr;
      while ((dir = readdir(d_handle)) != NULL) {
        if (dir->d_type == DT_REG) {
          std::string candidate = dir->d_name;
          for (const std::string &file_to_find : files_to_convert_in_dir) {
            std::string file_n = file_to_find.substr(file_to_find.find_last_of("\\/") + 1);
            if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
              // we found a converted file for this op
              std::string found_file = dump_key + "/" + candidate;
              if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
                result_list->push_back(found_file);
              }
            }
          }
        }
      }
      (void)closedir(d_handle);
    }
  }
}
  535. void GetNodeNameWithoutScope(std::string *dump_style_name) {
  536. if (dump_style_name == nullptr) {
  537. return;
  538. }
  539. std::string node_name_without_scope = *dump_style_name;
  540. std::size_t last_scope_marker;
  541. std::string delim = "/";
  542. last_scope_marker = node_name_without_scope.rfind(delim);
  543. if (last_scope_marker != std::string::npos) {
  544. node_name_without_scope = node_name_without_scope.substr(last_scope_marker + delim.size());
  545. }
  546. *dump_style_name = node_name_without_scope;
  547. }
  548. void ReplaceSrcFileName(std::string *dump_style_name) {
  549. if (dump_style_name == nullptr) {
  550. return;
  551. }
  552. const std::string strsrc = "/";
  553. std::string strdst = "_";
  554. std::string::size_type pos = 0;
  555. std::string::size_type srclen = strsrc.size();
  556. std::string::size_type dstlen = strdst.size();
  557. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  558. dump_style_name->replace(pos, srclen, strdst);
  559. pos += dstlen;
  560. }
  561. }
// Makes sure every requested tensor has a dump file available in host (npy)
// format, appending the npy paths to *result_list.
// For request i the dump directory for rank device_id[i], graph
// root_graph_id[i] and iteration iteration[i] is scanned:
//   - files already in npy format are appended to *result_list directly;
//   - files still in device format are queued in dir_to_files_map and
//     converted in one batch by ConvertToHostFormat at the end.
// backend_name[i] has the form "scope/op_name:slot"; the ":slot" suffix and
// the scope are removed before matching file names.
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id, std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name (everything after the last ':')
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string prefix_dump_file_name = dump_style_kernel_name;
    GetNodeNameWithoutScope(&prefix_dump_file_name);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors.";
      return;
    } else {
      struct dirent *dir = nullptr;
      while ((dir = readdir(d)) != NULL) {
        if (dir->d_type == DT_REG) {
          std::string file_name = dir->d_name;
          // drop the node-type prefix (up to the first '.') before matching
          std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
          if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos &&
              file_name.rfind(file_format) == std::string::npos) {
            // if file matches prefix and is in device format add to candidate files to convert.
            dir_to_files_map[specific_dump_dir].push_back(file_name);
          } else if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos &&
                     file_name.rfind(file_format) != std::string::npos) {
            // otherwise, if file matches prefix and already has been converted to host format
            // add to result of converted files.
            std::string found_file = specific_dump_dir + "/" + file_name;
            // avoid duplicate entries in the result list
            if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
              result_list->push_back(found_file);
            }
          }
        }
      }
      (void)closedir(d);
    }
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
// Ensures every watched node in proto_dump has a host-format (npy) dump file
// under specific_dump_dir, appending the npy paths to *result_list.
// proto_dump entries are (original node name, dump-style name); the trailing
// ".output"/".input" part of the dump-style name is cut off before matching.
// Files already in npy format go straight into *result_list; device-format
// files are collected per directory and converted in one ConvertToHostFormat
// batch at the end.
void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir,
                                           std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (const auto &node : proto_dump) {
    std::string dump_name = std::get<1>(node);
    // drop the ".output"/".input" suffix from the dump-style name
    dump_name = dump_name.substr(0, dump_name.rfind("."));
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
      return;
    } else {
      struct dirent *dir = nullptr;
      while ((dir = readdir(d)) != NULL) {
        if (dir->d_type == DT_REG) {
          std::string file_name = dir->d_name;
          // drop the node-type prefix (up to the first '.') before matching
          std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
          if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos &&
              file_name.rfind(file_format) == std::string::npos) {
            // if file matches prefix and is in device format add to candidate files to convert.
            dir_to_files_map[specific_dump_dir].push_back(file_name);
          } else if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos &&
                     file_name.rfind(file_format) != std::string::npos) {
            // otherwise, if file matches prefix and already has been converted to host format
            // add to result of converted files.
            std::string found_file = specific_dump_dir + "/" + file_name;
            // avoid duplicate entries in the result list
            if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
              result_list->push_back(found_file);
            }
          }
        }
      }
      (void)closedir(d);
    }
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
// Builds placeholder TensorData entries (no data read yet) for every watched
// node that has a converted file in async_file_pool.
// proto_dump holds (original node name, dump-style name ending in ".output" or
// ".input") tuples. For each node, every slot number found in a matching file
// name of the pool yields one TensorData with empty data/type/shape; actual
// tensor contents are read later on demand.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      std::size_t found_dot_start = file_name.find(".", found_out);
      // NOTE(review): find(".", found_dot_start) starts its search AT a '.' and
      // therefore returns found_dot_start itself, so the substr length below
      // wraps to npos and the substring runs to the end of the file name.
      // std::stoul still parses only the leading slot digits, so the extracted
      // slot is correct — but the intent was probably the next dot. Confirm.
      std::size_t found_dot_end = file_name.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(NULL);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_list->push_back(tensor_data);
    }
  }
}
  691. void DebugServices::AddToTensorData(const std::string &backend_name, const std::size_t slot,
  692. const unsigned int iteration, const unsigned int device_id,
  693. const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
  694. const std::string &type_name, const std::vector<int64_t> &shape,
  695. std::vector<char> *buffer, std::vector<std::shared_ptr<TensorData>> *result_list) {
  696. // call LoadNewTensor to store tensor in internal cache
  697. auto tensor_data = std::make_shared<TensorData>();
  698. tensor_data->SetName(backend_name);
  699. tensor_data->SetExecutionOrder(0);
  700. tensor_data->SetSlot(slot);
  701. tensor_data->SetIteration(iteration);
  702. tensor_data->SetDeviceId(device_id);
  703. tensor_data->SetRootGraphId(root_graph_id);
  704. tensor_data->SetIsOutput(is_output);
  705. if (data_size) {
  706. tensor_data->SetDataPtr(buffer->data());
  707. } else {
  708. tensor_data->SetDataPtr(NULL);
  709. }
  710. tensor_data->SetByteSize(data_size);
  711. tensor_data->SetType(type_name);
  712. tensor_data->SetShape(shape);
  713. if (data_size) {
  714. tensor_loader_->LoadNewTensor(tensor_data, false);
  715. }
  716. // add to result_list
  717. result_list->push_back(tensor_data);
  718. }
  719. void DebugServices::SetPrefixToCheck(std::string *prefix_dump_file_name, std::string *slot_string_to_check,
  720. std::string *dump_style_kernel_name, size_t slot, bool is_output) {
  721. std::string dump_style_name_part = *dump_style_kernel_name;
  722. GetNodeNameWithoutScope(&dump_style_name_part);
  723. std::string slot_str;
  724. if (is_output) {
  725. slot_str = ".output." + std::to_string(slot);
  726. } else {
  727. slot_str = ".input." + std::to_string(slot);
  728. }
  729. dump_style_name_part += slot_str;
  730. *prefix_dump_file_name = dump_style_name_part;
  731. *slot_string_to_check = slot_str;
  732. }
  733. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  734. // get file with the newest timestamp from the list.
  735. std::string newest_file;
  736. if (file_list.empty()) {
  737. return newest_file;
  738. }
  739. std::sort(file_list.begin(), file_list.end());
  740. return file_list.back();
  741. }
// Reads the dumped data for each requested tensor and appends one TensorData
// per request to *result_list (a placeholder entry is added when nothing is
// found, so output and input vectors stay aligned).
// backend_name[i] has the form "scope/op_name:slot"; the ":slot" suffix is
// removed and the remaining name drives the file-name prefix to search for.
// Sync mode scans the dump directory directly; async mode looks the tensor up
// in the previously converted npy files of async_file_pool.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *result_list) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    // async matching uses the unscoped node name and the slot string separately
    std::string prefix_dump_to_check = dump_style_kernel_name;
    GetNodeNameWithoutScope(&prefix_dump_to_check);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    if (is_sync_mode_) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list);
    }
  }
}
// Sync-mode lookup of one dumped tensor.
// Scans specific_dump_dir for regular files whose stripped name (see
// GetStrippedFilename) starts with prefix_dump_file_name, reads the newest
// match as npy, and appends the data to *result_list. When nothing matches,
// an empty placeholder TensorData is appended instead so callers still get
// exactly one entry per request.
void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                                         const std::string &backend_name, size_t slot, unsigned int device_id,
                                         unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                                         std::vector<std::shared_ptr<TensorData>> *result_list) {
  std::vector<char> *buffer = NULL;
  std::string type_name = "";
  std::vector<int64_t> shape;
  uint64_t data_size = 0;
  std::string abspath = RealPath(specific_dump_dir);
  DIR *d = opendir(abspath.c_str());
  bool found_file = false;
  std::vector<std::string> matched_paths;
  if (d == nullptr) {
    MS_LOG(ERROR) << "Directory " << specific_dump_dir << " does not exist!";
    return;
  }
  struct dirent *dir = nullptr;
  while ((dir = readdir(d)) != NULL) {
    if (dir->d_type == DT_REG) {
      std::string file_name = dir->d_name;
      std::string stripped_file_name = GetStrippedFilename(file_name);
      if (stripped_file_name.empty()) {
        continue;
      }
      // rfind(prefix, 0) == 0  <=>  stripped name starts with the prefix
      std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
      if (found != 0) {
        continue;
      }
      std::string full_path = specific_dump_dir + "/" + file_name;
      matched_paths.push_back(full_path);
      found_file = true;
    }
  }
  if (found_file) {
    shape.clear();
    // several dumps of the same tensor may exist; read only the newest one
    std::string result_path = GetNewestFilePath(matched_paths);
    ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape,
                    buffer, result_list);
  } else {
    // keep the result aligned with the request even when nothing was found
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer,
                    result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
  }
  (void)closedir(d);
}
// Async-mode lookup of one dumped tensor.
// Searches the pre-converted npy files in async_file_pool for paths that
// belong to specific_dump_dir and contain both the unscoped node-name prefix
// and the ".output.N"/".input.N" slot string. The newest match is read and
// appended to *result_list; when nothing matches, an empty placeholder
// TensorData is appended instead.
void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
                                          const std::string &slot_string_to_check, const std::string &backend_name,
                                          size_t slot, unsigned int device_id, unsigned int iteration,
                                          unsigned int root_graph_id, const bool &is_output,
                                          const std::vector<std::string> &async_file_pool,
                                          std::vector<std::shared_ptr<TensorData>> *result_list) {
  std::vector<char> *buffer = NULL;
  std::string type_name = "";
  std::vector<int64_t> shape;
  uint64_t data_size = 0;
  bool found = false;
  std::vector<std::string> matched_paths;
  // if async mode
  for (const std::string &file_path : async_file_pool) {
    if (file_path.find(specific_dump_dir) != std::string::npos &&
        file_path.find(prefix_dump_to_check) != std::string::npos &&
        file_path.find(slot_string_to_check) != std::string::npos) {
      matched_paths.push_back(file_path);
      found = true;
    }
  }
  if (found) {
    shape.clear();
    // several dumps of the same tensor may exist; read only the newest one
    std::string result_path = GetNewestFilePath(matched_paths);
    ReadTensorFromNpy(result_path, &type_name, &data_size, &shape, &buffer);
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, data_size, type_name, shape,
                    buffer, result_list);
  } else {
    // If no npy file is found, add empty tensor data.
    AddToTensorData(backend_name, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape, buffer,
                    result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
  }
}
// Strips the volatile fields from a sync npy dump file name so it can be
// compared against a "node_name.output|input.slot" prefix.
// Assumed layout (8 dots — TODO confirm against the dump writer):
//   node_type.node_name.task_id.stream_id.timestamp.input|output.slot.format.npy
// The tail separators are located from the back of the string because
// node_name itself may contain dots. Returns node_name + ".input|output.slot",
// or an empty string when the name has too few fields.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // seventh/fifth dot of the canonical layout, found by walking back from the
  // end: seventh precedes the slot field, fifth precedes input|output
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  // node_name lies between the first and second dot
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  // ".input|output.slot" lies between the fifth and seventh dot
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
// Scans the dump output for every node watched by at least one watchpoint and
// returns placeholder TensorData entries for what was found (tensor contents
// are read later on demand).
// The watchpoint table is first flattened into a map keyed by
// (device_id, root_graph_id) so each dump directory is visited only once.
// Sync mode matches files by their stripped names; async mode first converts
// the relevant device-format files to npy (filling *async_file_pool) and then
// derives the slot info from the converted file names.
std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  unsigned int iteration, std::vector<std::string> *async_file_pool) {
  // get a list of nodes and the devices they are on to monitor
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
  for (auto w_table_item : watchpoint_table_) {
    auto wp = std::get<1>(w_table_item);
    unsigned int index = 0;
    for (auto check_node : wp.check_node_list) {
      // device/graph lists are parallel to check_node_list, indexed by `index`
      std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
      for (auto device : devices) {
        for (auto graph : graphs) {
          std::tuple<uint32_t, uint32_t> key(device, graph);
          device_and_graph_to_nodes[key].push_back(check_node);
        }
      }
      index++;
    }
  }
  // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  // as they are found
  for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
    std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
    uint32_t device_id = std::get<0>(device_and_graph);
    uint32_t root_graph_id = std::get<1>(device_and_graph);
    std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
    std::vector<std::tuple<std::string, std::string>> proto_to_dump;
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id) + "/" + IterationString(iteration);
    // convert node names to dump style
    for (auto node : wp_nodes) {
      std::string orig_name = std::get<0>(node);
      std::string dump_style_name = orig_name;
      // Remove the scope from the fully qualified name to compare for both sync and async case.
      GetNodeNameWithoutScope(&dump_style_name);
      bool node_is_out = std::get<1>(node);
      if (node_is_out) {
        dump_style_name += ".output";
      } else {
        dump_style_name += ".input";
      }
      proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
    }
    if (!is_sync_mode_) {
      // convert all files in proto_to_dump to npy and add to pool of async file names
      ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
    }
    if (is_sync_mode_) {
      // search files in dir for the one that meets the filename prefix and read the file into memory
      std::string abspath = RealPath(specific_dump_dir);
      DIR *d = opendir(abspath.c_str());
      if (d == nullptr) {
        MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors.";
      } else {
        struct dirent *dir = nullptr;
        while ((dir = readdir(d)) != NULL) {
          if (dir->d_type == DT_REG) {
            std::string file_name = dir->d_name;
            for (auto &node : proto_to_dump) {
              std::string dump_name = std::get<1>(node);
              std::string stripped_file_name = GetStrippedFilename(file_name);
              if (stripped_file_name.empty()) {
                continue;
              }
              std::size_t found = stripped_file_name.rfind(dump_name, 0);
              if (found == 0) {
                // the slot number follows the "name.output"/"name.input" prefix
                size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
                std::vector<int64_t> shape;
                std::string orig_name = std::get<0>(node);
                std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
                bool output_flag = (output_str == "output");
                // register an empty entry; the data is loaded when requested
                AddToTensorData(orig_name, slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, NULL,
                                &tensor_list);
                break;
              }
            }
          }
        }
        (void)closedir(d);
      }
    } else {
      GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
                             &tensor_list);
    }
  }
  return tensor_list;
}
  957. std::string DebugServices::IterationString(unsigned int iteration) {
  958. std::string iteration_string;
  959. bool init_dbg_suspend = (iteration == UINT_MAX);
  960. if (init_dbg_suspend) {
  961. iteration_string = "init";
  962. } else {
  963. iteration_string = std::to_string(iteration);
  964. }
  965. return iteration_string;
  966. }
  967. #endif
  968. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  969. std::vector<char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  970. std::vector<unsigned int> *const dtype,
  971. std::vector<std::vector<int64_t>> *const shape) {
  972. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  973. tensor_loader_->SearchTensors(name, &result_list);
  974. for (auto result : result_list) {
  975. if (!std::get<1>(result)) {
  976. continue;
  977. }
  978. ret_name->push_back(std::get<0>(result));
  979. data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
  980. data_size->push_back(std::get<1>(result)->GetByteSize());
  981. dtype->push_back(std::get<1>(result)->GetType());
  982. shape->push_back(std::get<1>(result)->GetShape());
  983. }
  984. }
  985. void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
  986. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  987. if (!result_list) {
  988. MS_LOG(DEBUG) << "result_list is nullptr.";
  989. return;
  990. }
  991. tensor_loader_->SearchTensors(name, result_list);
  992. }
  993. #ifdef ONLINE_DBG_MODE
  994. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  995. bool ret = false;
  996. for (auto w_table_item : watchpoint_table_) {
  997. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  998. for (auto check_node : check_node_list) {
  999. std::string w_name = std::get<0>(check_node);
  1000. bool w_type = std::get<1>(check_node);
  1001. if ((w_type == true &&
  1002. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1003. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1004. ret = true;
  1005. return ret;
  1006. }
  1007. }
  1008. }
  1009. return ret;
  1010. }
  1011. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1012. if (kernel && w_name.length() > 0) {
  1013. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  1014. for (size_t j = 0; j < input_size; ++j) {
  1015. auto input_kernel = kernel->input(j + 1);
  1016. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1017. auto found = w_name.find_last_of('/');
  1018. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1019. return true;
  1020. }
  1021. return false;
  1022. } else {
  1023. return false;
  1024. }
  1025. }
  1026. #endif
// Drops every tensor currently stored in the tensor loader.
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
// Returns (a copy of) the list of all tensors held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Returns the tensors the loader has recorded for the given node name.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensorMap(const std::string &node_name) const {
  return tensor_loader_->GetNodeTensorMap(node_name);
}
// Returns the iteration counter kept by the tensor loader.
uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }
// Sets the iteration counter kept by the tensor loader.
void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }
// Clears the tensor loader's previous-iteration tensor map.
void DebugServices::EmptyPrevTensor() { tensor_loader_->EmptyPrevTensor(); }
// Clears the tensor loader's current-iteration tensor map.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1036. #ifdef ONLINE_DBG_MODE
// Forwards all arguments unchanged to TensorLoader::DumpTensorToFile, which
// writes the named tensor to filepath, and returns its success flag.
// (trans_flag presumably selects device-to-host layout translation — see
// TensorLoader for the exact contract.)
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  1044. #endif
// Stores a tensor in the tensor loader, returning its success flag; keep_prev
// is forwarded unchanged (presumably controls whether the previous iteration's
// copy of the same tensor is retained — see TensorLoader).
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
// Returns a copy of the watchpoint table (watchpoint id -> watchpoint_t).
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table_;
}
// Resets tensor state between runs: clears the watchpoint-id cache and the
// cached overflow results, and cycles the loader maps (move parameters
// current->prev, empty current, then swap) so that parameter tensors survive
// the reset while everything else is dropped.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1060. #ifdef ONLINE_DBG_MODE
  1061. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1062. MS_EXCEPTION_IF_NULL(kernel);
  1063. std::vector<std::shared_ptr<TensorData>> result;
  1064. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1065. auto kernel_name = GetKernelNodeName(kernel);
  1066. for (size_t j = 0; j < output_size; ++j) {
  1067. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1068. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1069. if (tensor) result.push_back(tensor);
  1070. }
  1071. return result;
  1072. }
  1073. #endif
// Returns true when an operator overflow was recorded for node_name_to_find in
// the overflow dump directory of the given device/graph/iteration.
// The overflow directory holds two kinds of bin files:
//   - "Opdebug.Node_OpDebug.*" files written by the overflow detector, whose
//     bytes at fixed offsets identify the overflowing (task_id, stream_id);
//   - regular async dump files, whose names encode the node name plus its
//     task_id and stream_id (parsed by GetAttrsFromAsyncFilename).
// Joining the two sets yields the op names that overflowed; the list is cached
// per directory in overflow_ops_ under overflow_wp_lock_.
bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
                                    unsigned int iteration) {
  // dump file names use '_' where the graph node name has '/'
  std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  std::vector<std::string> op_names;
  std::string overflow_bin_path;
#ifdef ONLINE_DBG_MODE
  auto debugger = Debugger::GetInstance();
  overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->graph_id());
  auto realpath = Common::GetRealPath(overflow_bin_path);
  if (!realpath.has_value()) {
    MS_LOG(ERROR) << "Get real path failed for overflow_bin_path.";
    return false;
  }
  overflow_bin_path = realpath.value();
#else
  overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
                      std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  overflow_bin_path = RealPath(overflow_bin_path);
#endif
  overflow_wp_lock_.lock();
  MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  auto found_overflows = overflow_ops_.find(overflow_bin_path);
  if (found_overflows != overflow_ops_.end()) {
    // this directory was processed before; reuse the cached op list
    MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
    op_names = overflow_ops_[overflow_bin_path];
  } else {
    std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
    std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
    const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
    MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
    std::string abspath = RealPath(overflow_bin_path);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(ERROR) << "OverFlow bin directory does not exist!";
    } else {
      struct dirent *dir = nullptr;
      while ((dir = readdir(d)) != nullptr) {
        if (dir->d_type == DT_REG) {
          // form fully qualified filename
          std::string file_path = overflow_bin_path;
          std::string file_name = dir->d_name;
          file_path.append(file_name);
          // attempt to read the file
          std::ifstream infile;
          infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
          if (!infile.is_open()) {
            MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno
                          << " ErrInfo:" << strerror(errno);
            continue;
          }
          std::string node_name;
          uint64_t task_id = 0;
          uint64_t stream_id = 0;
          // detect overflow bin file (written by the overflow detector itself)
          if (file_name.rfind(overflow_file_prefix, 0) == 0) {
            // start of op overflow data in bin file
            const uint32_t offset = 321;
            (void)infile.seekg(offset, std::ios::beg);
            std::vector<char> buffer;
            // size of op overflow info section
            const size_t buf_size = 256;
            buffer.resize(buf_size);
            (void)infile.read(buffer.data(), buf_size);
            if (infile.gcount() != buf_size) {
              MS_LOG(ERROR) << "The file: " << file_path << "may be damaged!";
              continue;
            }
            const uint8_t stream_id_offset = 16;
            const uint8_t task_id_offset = 24;
            // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
            // byte values currently.
            stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
            task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
            MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
                         << ".";
            task_stream_hit.push_back(std::make_pair(task_id, stream_id));
          } else {
            // regular bin file: map its (task_id, stream_id) to the op name
            bool success_parse = GetAttrsFromAsyncFilename(file_name, &node_name, &task_id, &stream_id);
            if (success_parse) {
              task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
            }
          }
          infile.close();
        }
      }
      (void)closedir(d);
    }
    // find the op_names with an overflow hit
    for (auto &task_stream : task_stream_hit) {
      auto op_name = task_stream_to_opname[task_stream];
      if (!op_name.empty()) {
        MS_LOG(INFO) << "Operation overflow detected in " << op_name;
        op_names.push_back(op_name);
      }
    }
    overflow_ops_[overflow_bin_path] = op_names;
  }
  overflow_wp_lock_.unlock();
  // determine if overflow wp has been triggered for node_name_to_find
  if (find(op_names.begin(), op_names.end(), node_name_to_find) != op_names.end()) {
    MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
    return true;
  }
  return false;
}
// Parses an async dump file name of the form
//   node_type.node_name.task_id.stream_id.timestamp
// extracting node_name, task_id and stream_id.
// Because node_name may itself contain dots, the three trailing separators
// are located from the BACK of the string ("fourth_dot" is the last dot
// overall). Returns false for .npy files, names with too few dots, or
// non-numeric id fields.
bool DebugServices::GetAttrsFromAsyncFilename(const std::string &file_name, std::string *node_name, uint64_t *task_id,
                                              uint64_t *stream_id) {
  // get the node_name, task_id, and stream_id from async dump filename
  // node_type.node_name.task_id.stream_id.timestamp
  // WARNING: node_name may have dots in it
  size_t fourth_dot = file_name.rfind(".");
  size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  size_t second_dot = file_name.rfind(".", third_dot - 1);
  size_t first_dot = file_name.find(".");
  // check if dots were found
  if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
      fourth_dot == std::string::npos) {
    return false;
  }
  // check if its not an async bin file
  if (file_name.substr(fourth_dot) == ".npy") {
    return false;
  }
  // get node_name (between the first dot and the third-from-last dot)
  if (first_dot < second_dot) {
    *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  } else {
    MS_LOG(ERROR) << "Async filename parse error to get node_name.";
    return false;
  }
  // get task id
  if (second_dot < third_dot) {
    std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
    try {
      *task_id = std::stoull(extracted_task_id);
    } catch (...) {
      MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Async filename parse error to get task_id.";
    return false;
  }
  // get stream id
  if (third_dot < fourth_dot) {
    std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
    try {
      *stream_id = std::stoull(extracted_stream_id);
    } catch (...) {
      MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Async filename parse error to get stream_id.";
    return false;
  }
  return true;
}
  1233. std::string DebugServices::RealPath(const std::string &input_path) {
  1234. if (input_path.length() >= PATH_MAX) {
  1235. MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  1236. }
  1237. size_t path_split_pos = input_path.find_last_of('/');
  1238. // get real path
  1239. char real_path[PATH_MAX] = {0};
  1240. // input_path is dir + file_name
  1241. if (path_split_pos != std::string::npos) {
  1242. std::string prefix_path = input_path.substr(0, path_split_pos);
  1243. std::string file_name = input_path.substr(path_split_pos);
  1244. if (file_name.length() > NAME_MAX) {
  1245. MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
  1246. }
  1247. if (realpath(prefix_path.c_str(), real_path) == nullptr) {
  1248. MS_LOG(ERROR) << "The dir " << prefix_path << " does not exist.";
  1249. return "";
  1250. }
  1251. return std::string(real_path) + file_name;
  1252. }
  1253. // input_path is only file_name
  1254. if (input_path.length() > NAME_MAX) {
  1255. MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  1256. }
  1257. if (realpath(input_path.c_str(), real_path) == nullptr) {
  1258. MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  1259. }
  1260. return std::string(real_path);
  1261. }
  1262. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  1263. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  1264. }
// Thin delegation: asks the tensor loader whether tensor_name exists in its
// "current" set. Semantics of "current" are defined by TensorLoader.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Thin delegation: tells the tensor loader to move tensor_name from its
// "current" set to its "previous" set (exact bookkeeping lives in TensorLoader).
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
  1271. void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
// Accessor for the network name recorded by SetNetName().
std::string DebugServices::GetNetName() { return net_name_; }
  1273. void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
// Accessor for the dump directory recorded by SetDumpDir().
std::string DebugServices::GetDumpDir() { return dump_dir_; }
  1275. void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
// Accessor for the sync-mode flag recorded by SetSyncMode().
bool DebugServices::GetSyncMode() { return is_sync_mode_; }
  1277. #ifdef ONLINE_DBG_MODE
  1278. } // namespace mindspore
  1279. #endif