You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

debug_services.cc 87 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949
  1. /**
  2. * Copyright 2019-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include <regex>
  29. #include "pybind11/embed.h"
  30. #include "pybind11/stl.h"
  31. #ifdef ONLINE_DBG_MODE
  32. #include "debug/common.h"
  33. #include "debug/debugger/debugger.h"
  34. #include "debug/anf_ir_utils.h"
  35. #include "backend/session/anf_runtime_algorithm.h"
  36. #endif
  37. #include "nlohmann/json.hpp"
  38. #include "debug/debugger/tensor_summary.h"
  39. #include "utils/file_utils.h"
  40. #include "climits"
  41. #ifdef ONLINE_DBG_MODE
  42. namespace mindspore {
  43. #endif
// Name prefix used to identify constant ("Default--data-") dump tensors.
static constexpr const char *constant_prefix = "Default--data-";
namespace {
#ifdef __APPLE__
// Platform-dependent "no error" sentinel; presumably compared against
// strerror_r-style results (int on macOS, char * on glibc) — TODO confirm
// against its usage later in this file.
constexpr int kStrErrorNone = 0;
#else
constexpr char *kStrErrorNone = nullptr;
#endif
}  // namespace
  52. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  53. DebugServices::DebugServices(const DebugServices &other) {
  54. wp_id_cache_ = other.wp_id_cache_;
  55. net_name_ = other.net_name_;
  56. dump_dir_ = other.dump_dir_;
  57. is_sync_mode_ = other.is_sync_mode_;
  58. tensor_loader_ = other.tensor_loader_;
  59. watchpoint_table_ = other.watchpoint_table_;
  60. }
// Copy assignment.
// NOTE(review): unlike the copy constructor, this copies only tensor_loader_
// and watchpoint_table_ (not wp_id_cache_, net_name_, dump_dir_ or
// is_sync_mode_) — presumably intentional, but worth confirming.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
  68. void DebugServices::AddWatchpoint(
  69. unsigned int id, unsigned int watch_condition, float parameter,
  70. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  71. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  72. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  73. std::lock_guard<std::mutex> lg(lock_);
  74. watchpoint_t watchpoint_item;
  75. watchpoint_item.id = id;
  76. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  77. watchpoint_item.condition.parameter = parameter;
  78. watchpoint_item.check_node_list = check_node_list;
  79. if (check_node_device_list != nullptr) {
  80. watchpoint_item.check_node_device_list = *check_node_device_list;
  81. }
  82. if (check_node_graph_list != nullptr) {
  83. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  84. }
  85. watchpoint_item.parameter_list = parameter_list;
  86. watchpoint_table_[id] = watchpoint_item;
  87. }
  88. void DebugServices::RemoveWatchpoint(unsigned int id) {
  89. std::lock_guard<std::mutex> lg(lock_);
  90. (void)watchpoint_table_.erase(id);
  91. }
  92. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  93. const void *const previous_tensor_ptr, uint32_t num_elements,
  94. uint32_t prev_num_elements, int tensor_dtype) {
  95. MS_EXCEPTION_IF_NULL(tensor);
  96. switch (tensor_dtype) {
  97. case DbgDataType::DT_UINT8: {
  98. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  99. prev_num_elements);
  100. }
  101. case DbgDataType::DT_INT8: {
  102. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  103. prev_num_elements);
  104. }
  105. case DbgDataType::DT_UINT16: {
  106. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  107. prev_num_elements);
  108. }
  109. case DbgDataType::DT_INT16: {
  110. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  111. prev_num_elements);
  112. }
  113. case DbgDataType::DT_UINT32: {
  114. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  115. prev_num_elements);
  116. }
  117. case DbgDataType::DT_INT32:
  118. case DbgDataType::DT_BASE_INT: {
  119. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  120. prev_num_elements);
  121. }
  122. case DbgDataType::DT_UINT64: {
  123. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  124. prev_num_elements);
  125. }
  126. case DbgDataType::DT_INT64: {
  127. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  128. prev_num_elements);
  129. }
  130. case DbgDataType::DT_FLOAT16: {
  131. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  132. prev_num_elements);
  133. }
  134. case DbgDataType::DT_FLOAT32:
  135. case DbgDataType::DT_BASE_FLOAT: {
  136. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  137. prev_num_elements);
  138. }
  139. case DbgDataType::DT_FLOAT64: {
  140. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  141. prev_num_elements);
  142. }
  143. case DbgDataType::DT_BOOL: {
  144. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  145. prev_num_elements);
  146. }
  147. default:
  148. MS_LOG(INFO) << "Unsupported tensor type";
  149. // return a null pointer
  150. return std::unique_ptr<TensorSummary<int32_t>>{};
  151. }
  152. }
  153. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  154. if (tensor == nullptr) {
  155. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  156. TensorStat empty_tensor_stat_data;
  157. return empty_tensor_stat_data;
  158. }
  159. std::unique_ptr<ITensorSummary> base_summary_ptr;
  160. void *previous_tensor_ptr = nullptr;
  161. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  162. if (base_summary_ptr == nullptr) {
  163. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  164. TensorStat empty_tensor_stat_data;
  165. return empty_tensor_stat_data;
  166. }
  167. base_summary_ptr->TensorStatistics(tensor->GetType());
  168. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  169. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  170. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  171. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  172. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  173. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  174. return tensor_stat_data;
  175. }
  176. #ifdef OFFLINE_DBG_MODE
  177. const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
  178. uint32_t *prev_num_elements, bool *history_not_found) {
  179. MS_EXCEPTION_IF_NULL(tensor);
  180. const void *previous_tensor_ptr = nullptr;
  181. std::shared_ptr<TensorData> tensor_prev;
  182. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
  183. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  184. *history_not_found = 1;
  185. MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
  186. } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
  187. // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
  188. // read data in offline mode
  189. AsyncFilePool file_paths;
  190. if (!is_sync_mode_) {
  191. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  192. std::vector<unsigned int>{tensor->GetDeviceId()},
  193. std::vector<unsigned int>{tensor->GetPrevIteration()},
  194. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  195. }
  196. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  197. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  198. std::vector<unsigned int>{tensor->GetDeviceId()},
  199. std::vector<unsigned int>{tensor->GetPrevIteration()},
  200. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  201. file_paths, &result_list_prev);
  202. tensor_prev = result_list_prev[0];
  203. if (!tensor_prev->GetByteSize()) {
  204. tensor_prev.reset();
  205. } else {
  206. previous_tensor_ptr = tensor_prev->GetDataPtr();
  207. *prev_num_elements = tensor_prev->GetNumElements();
  208. }
  209. }
  210. return previous_tensor_ptr;
  211. }
  212. #endif
  213. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  214. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  215. std::string *const qualified_tensor_name,
  216. std::vector<watchpoint_t> *const watchpoints_to_check) {
  217. if (tensor == nullptr) {
  218. MS_LOG(DEBUG) << "tensor is nullptr.";
  219. return;
  220. }
  221. const auto tensor_name = tensor->GetName();
  222. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  223. const auto tensor_device_id = tensor->GetDeviceId();
  224. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  225. for (auto w_table_item : watchpoint_table_) {
  226. auto wp = std::get<1>(w_table_item);
  227. // check ONLY init conditions on initial suspended state.
  228. // skip other conditions on initial suspended state
  229. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  230. continue;
  231. }
  232. // skip init condition if not init suspend
  233. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  234. continue;
  235. }
  236. // check change conditions only on step end.
  237. if (wp.change_condition() && !step_end) {
  238. continue;
  239. }
  240. // if recheck, ignore the cache results and reanalyze everything.
  241. // if not a recheck, check only unanalyzed tensors
  242. if (!recheck) {
  243. wp_lock_.lock();
  244. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  245. wp_lock_.unlock();
  246. if (wp_cache_hit) {
  247. continue;
  248. }
  249. }
  250. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  251. if (!found.empty()) {
  252. *qualified_tensor_name = found;
  253. watchpoints_to_check->push_back(w_table_item.second);
  254. #ifdef OFFLINE_DBG_MODE
  255. if (wp.change_condition()) {
  256. *previous_iter_tensor_needed = true;
  257. }
  258. #endif
  259. }
  260. }
  261. }
  262. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  263. const std::string &tensor_name) {
  264. // add analyzed tensor to cache
  265. if (!recheck) {
  266. wp_lock_.lock();
  267. (void)wp_id_cache_[tensor_name].insert(id);
  268. wp_lock_.unlock();
  269. }
  270. }
  271. void DebugServices::SetCheckWatchpointsResult(
  272. const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  273. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  274. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  275. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  276. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  277. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  278. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  279. const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  280. const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  281. (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  282. (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  283. (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  284. (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  285. (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  286. if (device_id != nullptr) {
  287. (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  288. }
  289. if (root_graph_id != nullptr) {
  290. (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  291. }
  292. (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  293. (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  294. (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
  295. }
  296. #ifdef OFFLINE_DBG_MODE
  297. void DebugServices::CheckOutofMemoryandNoValue(
  298. const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
  299. int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  300. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  301. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  302. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  303. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  304. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  305. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  306. const unsigned int device_id_val, const unsigned int root_graph_id_val,
  307. const std::vector<parameter_t> &parameter_list) {
  308. bool set_is_needed = no_mem_to_read || error_on_no_value;
  309. int32_t error_code_to_set = 0;
  310. if (no_mem_to_read) {
  311. // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
  312. error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
  313. } else if (error_on_no_value) {
  314. error_code_to_set = ITensorSummary::NO_VALUE;
  315. }
  316. if (set_is_needed) {
  317. for (auto &wp : watchpoints_to_check) {
  318. SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
  319. chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
  320. chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
  321. qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
  322. parameter_list, error_code_to_set);
  323. }
  324. }
  325. }
  326. void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
  327. // set the tensor into not-in-use status in tensor_loader.
  328. auto tensor_name = tensor->GetName();
  329. std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
  330. std::to_string(tensor->GetRootGraphId()) + ":" +
  331. std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
  332. AppendToCacheEvictQueue(key_name_in_cache);
  333. if (previous_tensor_ptr != nullptr) {
  334. AppendToCacheEvictQueue(key_name_in_cache + ":prev");
  335. }
  336. }
  337. #endif
  338. #ifdef ONLINE_DBG_MODE
  339. bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
  340. auto debugger = Debugger::GetInstance();
  341. auto ms_context = MsContext::GetInstance();
  342. MS_EXCEPTION_IF_NULL(ms_context);
  343. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  344. auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
  345. if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  346. device_target == kAscendDevice) {
  347. if (cur_root_graph_id != id) {
  348. return false;
  349. }
  350. }
  351. return true;
  352. }
  353. const void *DebugServices::PreparePrevTensor(uint32_t *prev_num_elements, const std::string &tensor_name) {
  354. std::shared_ptr<TensorData> prev_tensor_data;
  355. if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
  356. // not supporting watchpoints that need prev tensor for multi root graph networks.
  357. MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
  358. prev_tensor_data = nullptr;
  359. } else {
  360. prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
  361. }
  362. if (prev_tensor_data) {
  363. *prev_num_elements = prev_tensor_data->GetNumElements();
  364. return prev_tensor_data->GetDataPtr();
  365. }
  366. return nullptr;
  367. }
  368. #endif
  369. void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
  370. // check history error_code only for offline debugger
  371. if (history_not_found) {
  372. *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
  373. }
  374. }
/*
 * Worker body for one chunk [begin, end) of tensor_list: for each tensor,
 * gathers the applicable watchpoints, (offline mode) reads the dumped data,
 * summarizes the tensor, evaluates every watchpoint, and appends hits into
 * the chunk-local result vectors indexed by chunk_id. Results from all
 * chunks are merged by the caller.
 */
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  const std::vector<std::string> &op_overflows, const AsyncFilePool &async_file_pool,
  partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // Clamp the chunk's end to the actual list size (the last chunk may be short).
  int list_size = tensor_list->size();
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // Strip the ":<slot>" suffix to get the node name.
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read);
    // Replace the placeholder entry with the freshly read tensor.
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // Read failed (OOM or no value on disk): emit error records for every
      // pending watchpoint and move on.
      CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_names,
                                 chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
                                 chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id,
                                 chunk_root_graph_id, device_id, root_graph_id, tensor->GetExecutionOrder(),
                                 tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, tensor->GetDeviceId(),
                                 tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // NOTE(review): int 0 assigned to a bool — works, but `false` would be clearer.
    bool history_not_found = 0;
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
#else
    // Online mode: previous data is only valid if the root graph is unchanged.
    if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
      MS_LOG(DEBUG)
        << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
        << tensor->GetName();
      continue;
    }
    previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // Summarizing is skipped when the only watchpoint is an overflow check,
    // which is evaluated from dump metadata rather than tensor contents.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
#ifdef OFFLINE_DBG_MODE
        CheckHistoryErrorCode(&error_code, history_not_found);
#endif
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      // Remember that this watchpoint analyzed this tensor (skipped on recheck).
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    SetTensorToNotInUse(tensor, previous_tensor_ptr);
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Checks every registered watchpoint against every tensor in tensor_list and
// appends one entry per hit to the parallel output vectors (name, slot,
// condition, watchpoint_id, parameters, error_codes and, when non-null,
// device_id / root_graph_id).
// The tensor list is split into at most 16 chunks, each checked concurrently
// by CheckWatchpointsForTensor via std::async; SortWatchpointsInfo then waits
// on the futures and merges the per-chunk results in order.
// Returns immediately when there are no watchpoints or no tensors.
// Holds lock_ for the whole duration.
void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
                                     std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
                                     std::vector<std::vector<parameter_t>> *const parameters,
                                     std::vector<int32_t> *const error_codes,
                                     const std::vector<std::string> &op_overflows, const AsyncFilePool &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *const tensor_list,
                                     const bool init_dbg_suspend, const bool step_end, const bool recheck,
                                     std::vector<unsigned int> *const device_id,
                                     std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size <= 0) {
    return;
  }
  // default value for number of threads
  const int default_thread_num = 16;
  int max_thread_num = default_thread_num;
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Distribute the tensors over the workers: each chunk gets chunk_size
  // tensors, and the first `remainder` chunks get one extra.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // Per-chunk result holders, one slot per worker, so the workers never write
  // to shared containers concurrently.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    // Launch one async task per chunk over tensor_list[begin, end).
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id, error_on_no_value));
    begin = end;
  }
  // Wait for all chunks and merge their results into the output vectors.
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
// Waits for every per-chunk CheckWatchpointsForTensor task to finish, then
// merges the per-chunk hit data into the caller's parallel output vectors,
// keeping them sorted by execution order (online mode) or by dump-file
// timestamp (offline mode). Consumed chunk vectors are swapped with empty
// ones to release their memory, and the total checked byte size is
// accumulated into *tensor_list_byte_size.
void DebugServices::SortWatchpointsInfo(
  std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    // Block until chunk i is done; get() also rethrows any exception raised
    // inside the async task.
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
#ifdef ONLINE_DBG_MODE
      // If the execution order is repeated, insert the new one before the
      // others with the same execution order.
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // In offline mode the results are ordered by dump timestamp instead.
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
#endif
      // Insert every parallel output at the same position so the vectors stay
      // aligned entry-for-entry.
      (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
      (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
      (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
      (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
      (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  }
}
  609. #ifdef OFFLINE_DBG_MODE
// Reads tensor metadata (type descriptor, shape, byte size) and, memory
// permitting, the raw data from a .npy dump file.
// On success *data_buffer points at a heap-allocated vector holding the
// tensor bytes (ownership passes to the caller) and *size holds the byte
// count. When the tensor-loader memory control reports insufficient host
// memory, *no_mem_to_read is set and no data is read.
// NOTE(review): assumes the npy v1 layout (8-byte magic/version followed by a
// 2-byte header length) and a little-endian host for the reinterpret_cast of
// the header length -- confirm for v2/v3 files and other platforms.
void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
                                      std::string *const tensor_type, std::size_t *const size,
                                      std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
                                      bool *no_mem_to_read) {
  std::ifstream infile;
  std::string file_path = file_name;
  MS_LOG(INFO) << "Reading in file: " << file_path;
  infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  if (!infile.is_open()) {
    MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
    const int kMaxFilenameLength = 128;
    char err_info[kMaxFilenameLength];
    auto ret = strerror_r(errno, err_info, sizeof(err_info));
    if (ret != kStrErrorNone) {
      MS_LOG(ERROR) << " ErrInfo:" << ret;
    }
    return;
  }
  // Offsets into the npy file/header (v1 format).
  const int substr_len = 2;
  const int header_len_offset = 8;
  const int header_offset = 9;
  const int header_len_buffer_size = 2;
  const int type_offset = 10;
  // get header length
  (void)infile.seekg(0, std::ios::beg);
  auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
    MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
    return;
  }
  // The 2-byte header length sits right after the 8-byte magic/version.
  uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  header_len_buffer.reset();
  // read in header
  (void)infile.seekg(0, std::ios::beg);
  auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
    MS_LOG(ERROR) << "Failed to read header from " << file_path;
    return;
  }
  std::string header(header_buffer->data() + header_offset, header_len);
  header_buffer.reset();
  // The 2-char type code (e.g. "f4") follows the "descr" key in the header
  // dictionary.
  std::size_t type_i = header.find("descr") + type_offset;
  if (header.length() < type_i + substr_len) {
    MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
    return;
  }
  *tensor_type = header.substr(type_i, substr_len);
  // The shape is the comma-separated tuple between the parentheses.
  std::size_t shape_i_open = header.find("(");
  std::size_t shape_i_close = header.find(")");
  std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  std::string intermediate;
  std::stringstream check_shape(shape_str);
  MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  while (getline(check_shape, intermediate, ',')) {
    shape->push_back(std::stoi(intermediate));
  }
  // Element width in bytes is the digit in the type code (e.g. '4' in "f4").
  std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  std::size_t data_size = data_len * word_size;
  if (!data_size) {
    return;
  }
  // Check memory available before loading tensor into host.
  bool has_enough_memory = true;
  if (tensor_loader_->EnableMemoryControl()) {
    has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  }
  if (!has_enough_memory) {
    MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
    *no_mem_to_read = true;
  } else {
    // Tensor data starts right after the header (magic + length word + header).
    (void)infile.seekg(header_len + type_offset);
    *data_buffer = new std::vector<char>(data_size);
    // NOTE(review): the nullptr check is dead code (operator new throws on
    // failure), and *size is set below even when the read fails, so callers
    // may see data_size with a partially filled buffer -- confirm intended.
    if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
      MS_LOG(ERROR) << "Unable to get tensor data from npy";
    }
    *size = data_size;
  }
}
  689. void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list) {
  690. std::string file_format = "npy";
  691. for (auto const &d : dir_to_files_map) {
  692. std::vector<std::string> files_to_convert_in_dir;
  693. std::vector<std::string> files_after_convert_in_dir;
  694. std::string dump_key = d.first;
  695. for (auto const &pair : d.second) {
  696. std::string file_name = pair.first;
  697. std::string file_name_without_scope = pair.second;
  698. // skip the file that was converted to npy already.
  699. if (std::all_of(result_list->begin(), result_list->end(), [&file_name_without_scope](std::string file_found) {
  700. return file_found.find(file_name_without_scope) == std::string::npos;
  701. })) {
  702. (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
  703. (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
  704. }
  705. }
  706. MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
  707. if (!files_to_convert_in_dir.empty()) {
  708. // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
  709. // later task.
  710. {
  711. pybind11::gil_scoped_acquire acquire;
  712. try {
  713. auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
  714. auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
  715. (void)convert_obj.attr("convert_files")();
  716. } catch (pybind11::error_already_set &e) {
  717. MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
  718. }
  719. }
  720. ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
  721. }
  722. }
  723. }
  724. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  725. const std::string &dump_key, AsyncFilePool *const result_list,
  726. const std::string &file_format) {
  727. std::string real_dump_iter_dir = RealPath(dump_key);
  728. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  729. if (d_handle == nullptr) {
  730. MS_LOG(INFO) << "Directory does not exist in ConvertToHostFormat.";
  731. return;
  732. }
  733. struct dirent *dir = nullptr;
  734. while ((dir = readdir(d_handle)) != nullptr) {
  735. struct stat st;
  736. std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
  737. int ret = stat(name.c_str(), &st);
  738. if (ret != 0) {
  739. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  740. (void)closedir(d_handle);
  741. return;
  742. }
  743. if (S_ISREG(st.st_mode)) {
  744. std::string candidate = dir->d_name;
  745. for (const std::string &file_to_find : files_after_convert_in_dir) {
  746. std::string file_n = file_to_find;
  747. auto last_slash_pos = file_to_find.find_last_of("\\/");
  748. if (last_slash_pos != std::string::npos) {
  749. file_n = file_to_find.substr(last_slash_pos + 1);
  750. }
  751. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  752. // we found a converted file for this op
  753. std::string found_file = dump_key + "/" + candidate;
  754. result_list->insert(found_file);
  755. }
  756. }
  757. }
  758. }
  759. (void)closedir(d_handle);
  760. }
  761. std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  762. if (dump_style_name.empty()) {
  763. return "";
  764. }
  765. std::size_t last_scope_marker;
  766. std::string delim = "/";
  767. last_scope_marker = dump_style_name.rfind(delim);
  768. if (last_scope_marker == std::string::npos) {
  769. return dump_style_name;
  770. }
  771. return dump_style_name.substr(last_scope_marker + delim.size());
  772. }
  773. void ReplaceSrcFileName(std::string *dump_style_name) {
  774. if (dump_style_name == nullptr) {
  775. return;
  776. }
  777. const std::string strsrc = "/";
  778. std::string strdst = "_";
  779. std::string::size_type pos = 0;
  780. std::string::size_type srclen = strsrc.size();
  781. std::string::size_type dstlen = strdst.size();
  782. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  783. (void)dump_style_name->replace(pos, srclen, strdst);
  784. pos += dstlen;
  785. }
  786. }
// For each requested tensor, locates its dump directory
// (<dump_dir>/rank_<id>/<net>/<root_graph_id>/<iteration>) and queues the
// matching device-format files for conversion to host-format npy via
// ProcessConvertList / ConvertToHostFormat. Constant nodes (names with the
// constant prefix) are skipped.
// NOTE(review): when one tensor's dump directory is missing this returns
// immediately, abandoning the remaining tensors and skipping
// ConvertToHostFormat for files already queued -- confirm `continue` was not
// intended instead.
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id, AsyncFilePool *const result_list) {
  std::string file_format = "npy";
  DirMap dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // if node name is constant, skip
    if (prefix_dump_file_name.length() > (unsigned)strlen(constant_prefix) &&
        prefix_dump_file_name.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
      continue;
    }
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
      return;
    }
    ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
    (void)closedir(d);
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
  818. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  819. const std::string &specific_dump_dir, AsyncFilePool *const result_list) {
  820. std::string file_format = "npy";
  821. DirMap dir_to_files_map;
  822. for (const auto &node : proto_dump) {
  823. std::string dump_name = std::get<1>(node);
  824. dump_name = dump_name.substr(0, dump_name.rfind("."));
  825. // search files in dir for the one that meets the filename prefix and read the file into memory
  826. std::string abspath = RealPath(specific_dump_dir);
  827. DIR *d = opendir(abspath.c_str());
  828. if (d == nullptr) {
  829. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
  830. return;
  831. }
  832. ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
  833. (void)closedir(d);
  834. }
  835. ConvertToHostFormat(dir_to_files_map, result_list);
  836. }
  837. void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
  838. const std::string &specific_dump_dir, DirMap *dir_to_files_map,
  839. AsyncFilePool *const result_list) {
  840. MS_EXCEPTION_IF_NULL(dir_to_files_map);
  841. DIR *d = opendir(specific_dump_dir.c_str());
  842. struct dirent *dir = nullptr;
  843. while ((dir = readdir(d)) != nullptr) {
  844. struct stat st;
  845. std::string name = specific_dump_dir + std::string("/") + std::string(dir->d_name);
  846. int ret = stat(name.c_str(), &st);
  847. if (ret != 0) {
  848. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  849. (void)closedir(d);
  850. return;
  851. }
  852. if (!(S_ISREG(st.st_mode))) {
  853. continue;
  854. }
  855. std::string file_name = dir->d_name;
  856. std::string file_name_w_o_perfix = file_name;
  857. auto type_pos = file_name.find('.');
  858. // adding dot to avoid problematic matching in the scope.
  859. if (type_pos == std::string::npos ||
  860. file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
  861. continue;
  862. }
  863. if (file_name.rfind(file_format) == std::string::npos) {
  864. std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
  865. file_name_w_o_perfix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
  866. // if file matches prefix and is in device format add to candidate files to convert.
  867. (*dir_to_files_map)[specific_dump_dir].push_back(std::make_pair(file_name, file_name_w_o_perfix));
  868. } else {
  869. // otherwise, if file matches prefix and already has been converted to host format
  870. // add to result of converted files.
  871. std::string found_file = specific_dump_dir + "/" + file_name;
  872. result_list->insert(found_file);
  873. }
  874. }
  875. (void)closedir(d);
  876. }
// Creates placeholder TensorData entries (no data attached yet; the bytes are
// read later on demand) for every slot of every proto_dump node that has a
// matching file in async_file_pool under specific_dump_dir.
// NOTE(review): the slot is parsed from the dump file name after the
// "output"/"input" marker; since find(".", found_dot_start) lands on the same
// dot, the substr runs to the end of the name and the parse relies on
// std::stoul stopping at the first non-digit -- confirm against the dump
// naming scheme.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const AsyncFilePool &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    for (const std::string &file_name : async_file_pool) {
      // Match on the bare file name, not the full path.
      std::string file_name_to_check = file_name;
      auto delim = file_name.rfind("/");
      if (delim != std::string::npos) {
        file_name_to_check = file_name.substr(delim + 1);
      }
      std::size_t found = file_name_to_check.find(dump_name);
      std::size_t found_out = file_name_to_check.find(output_str);
      std::size_t found_dot_start = file_name_to_check.find(".", found_out);
      std::size_t found_dot_end = file_name_to_check.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(
          std::stoul(file_name_to_check.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(nullptr);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      // PrevIteration depends on the fields set above.
      tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
      tensor_list->push_back(tensor_data);
    }
  }
}
  926. uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
  927. std::regex re;
  928. if (mode == "rank") {
  929. re = "^rank_([0-9]+)$";
  930. } else if (mode == "graph") {
  931. re = "^([0-9]+)$";
  932. }
  933. std::smatch tokens;
  934. if (regex_match(name, tokens, re)) {
  935. return std::stoi(tokens[1]);
  936. } else {
  937. return UINT32_MAX;
  938. }
  939. }
  940. std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
  941. std::vector<uint32_t> rank_id_list;
  942. std::string dump_dir = GetDumpDir();
  943. DIR *d_handle = opendir(dump_dir.c_str());
  944. if (d_handle == nullptr) {
  945. MS_LOG(ERROR) << "Dump directory does not exist.";
  946. return rank_id_list;
  947. }
  948. struct dirent *dir = nullptr;
  949. while ((dir = readdir(d_handle)) != nullptr) {
  950. struct stat st;
  951. std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
  952. int ret = stat(name.c_str(), &st);
  953. if (ret != 0) {
  954. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  955. (void)closedir(d_handle);
  956. return rank_id_list;
  957. }
  958. if (S_ISDIR(st.st_mode)) {
  959. std::string rank_dir_name = dir->d_name;
  960. if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
  961. rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
  962. }
  963. }
  964. }
  965. (void)closedir(d_handle);
  966. return rank_id_list;
  967. }
  968. void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
  969. std::string net_name = GetNetName();
  970. std::string dump_dir = GetDumpDir();
  971. for (uint32_t rank_id : rank_id_list) {
  972. std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
  973. std::string abspath = RealPath(path);
  974. DIR *d_handle_rank = opendir(abspath.c_str());
  975. if (d_handle_rank == nullptr) {
  976. MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
  977. continue;
  978. }
  979. struct dirent *direc = nullptr;
  980. while ((direc = readdir(d_handle_rank)) != nullptr) {
  981. struct stat st;
  982. std::string name = abspath + std::string("/") + std::string(direc->d_name);
  983. int ret = stat(name.c_str(), &st);
  984. if (ret != 0) {
  985. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  986. (void)closedir(d_handle_rank);
  987. return;
  988. }
  989. if (S_ISDIR(st.st_mode)) {
  990. std::string graph_dir = direc->d_name;
  991. if (graph_dir == "." || graph_dir == "..") {
  992. continue;
  993. }
  994. if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
  995. uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
  996. ReadGraphsHistory(rank_id, graph_id);
  997. }
  998. }
  999. }
  1000. (void)closedir(d_handle_rank);
  1001. }
  1002. }
  1003. void DebugServices::SetGraphsHistory() {
  1004. // extract rank_id_list
  1005. std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
  1006. // for each rank_id extract the graph_id list and set the dump version
  1007. // and for each graph read the graph history file
  1008. CheckDumpGraphIdList(rank_id_list);
  1009. }
  1010. void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
  1011. std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
  1012. if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
  1013. // graph history was already stored for this rank_id and graph_id
  1014. return;
  1015. }
  1016. std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
  1017. std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
  1018. DIR *d_handle = opendir(exec_order_path.c_str());
  1019. if (d_handle == nullptr) {
  1020. MS_LOG(ERROR) << "Execution order directory does not exist.";
  1021. return;
  1022. }
  1023. // read file and store the info
  1024. std::string full_path = exec_order_path + "/" + file_to_check;
  1025. std::string checked_path = RealPath(full_path);
  1026. if (!checked_path.empty()) {
  1027. ReadGraphRunIter(checked_path, rank_and_graph);
  1028. }
  1029. (void)closedir(d_handle);
  1030. }
  1031. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
  1032. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
  1033. for (auto w_table_item : watchpoint_table_) {
  1034. auto wp = std::get<1>(w_table_item);
  1035. unsigned int index = 0;
  1036. for (auto check_node : wp.check_node_list) {
  1037. std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
  1038. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1039. // graph represents root_graph for Ascend and kernel_graph for GPU
  1040. for (auto rank : ranks) {
  1041. for (auto graph : graphs) {
  1042. std::tuple<uint32_t, uint32_t> key(rank, graph);
  1043. (rank_and_graph_to_nodes)[key].push_back(check_node);
  1044. }
  1045. }
  1046. index++;
  1047. }
  1048. }
  1049. return rank_and_graph_to_nodes;
  1050. }
  1051. void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
  1052. std::ifstream infile;
  1053. std::string line;
  1054. infile.open(file_path.c_str());
  1055. if (!infile.is_open()) {
  1056. MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
  1057. const int kMaxFilenameLength = NAME_MAX;
  1058. char err_info[kMaxFilenameLength];
  1059. if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
  1060. MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
  1061. }
  1062. return;
  1063. }
  1064. std::vector<uint32_t> run_iters_vec;
  1065. while (std::getline(infile, line)) {
  1066. uint32_t iter;
  1067. std::stringstream ss(line);
  1068. ss >> iter;
  1069. run_iters_vec.push_back(iter);
  1070. }
  1071. (void)graphs_run_history_.emplace(
  1072. std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
  1073. }
// Builds a TensorData record from the given dump metadata/buffer, loads it
// into the tensor cache when it actually carries data, and appends it to
// result_list.
// NOTE(review): SetDataPtr stores a raw pointer into the caller-owned
// `buffer` vector; the record is only valid while that buffer stays alive --
// confirm whether LoadNewTensor copies or takes ownership.
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                    const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer,
                                    std::vector<std::shared_ptr<TensorData>> *const result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  tensor_data->SetIsOutput(is_output);
  // A null buffer denotes a metadata-only entry (no tensor bytes available).
  if (buffer != nullptr) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(nullptr);
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  tensor_data->SetTimeStamp(time_stamp);
  // PrevIteration depends on the fields set above.
  tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  if (data_size) {
    (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  }
  // add to result_list
  result_list->push_back(tensor_data);
}
  1105. void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
  1106. std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
  1107. std::string dump_style_name_part = *dump_style_kernel_name;
  1108. dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
  1109. std::string slot_str;
  1110. if (is_output) {
  1111. slot_str = ".output." + std::to_string(slot);
  1112. } else {
  1113. slot_str = ".input." + std::to_string(slot);
  1114. }
  1115. dump_style_name_part += slot_str;
  1116. *prefix_dump_file_name = dump_style_name_part;
  1117. *slot_string_to_check = slot_str;
  1118. }
  1119. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  1120. // get file with the newest timestamp from the list.
  1121. if (file_list.empty()) {
  1122. return "";
  1123. }
  1124. std::sort(file_list.begin(), file_list.end());
  1125. return file_list.back();
  1126. }
  1127. std::string GetTimeStampStr(std::string file_path) {
  1128. // get the file_name from file_path.
  1129. size_t pos = file_path.rfind("/");
  1130. std::string file_name = file_path.substr(pos + 1);
  1131. size_t first_dot = file_name.rfind(".");
  1132. size_t second_dot = file_name.rfind(".", first_dot - 1);
  1133. size_t third_dot = file_name.rfind(".", second_dot - 1);
  1134. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  1135. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  1136. if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
  1137. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  1138. return time_stamp;
  1139. }
  1140. return "";
  1141. }
// Reads one dumped tensor per index of the parallel input vectors
// (backend_name/slot/device_id/iteration/root_graph_id/is_output) and appends
// a TensorData for each to result_list. Nodes whose scope-stripped name
// starts with constant_prefix are treated as constants and read from the
// "constants" directory in sync style; all other tensors go through the sync
// or async reader depending on is_sync_mode_. *no_mem_to_read is set by the
// readers when the tensor cache cannot hold the data.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const AsyncFilePool &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *const result_list,
                                     bool *no_mem_to_read) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name (backend names carry a trailing ":<slot>")
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    std::string specific_dump_dir;
    bool is_cst = false;
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    // prefix_dump_to_check is node name used to find corresponding dump file
    std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
    // if node name has prefix of "Default--data-", consider as constant, search in cst folder
    if (prefix_dump_to_check.length() > (unsigned)strlen(constant_prefix) &&
        prefix_dump_to_check.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/constants";
      is_cst = true;
      // constant dump files are written without the "Default--" scope prefix
      const std::string prefix = "Default--";
      prefix_dump_file_name = prefix_dump_file_name.substr(prefix.length());
      prefix_dump_to_check = prefix_dump_to_check.substr(prefix.length());
    } else {
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    }
    MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
    // search files in dir for the one that meets the filename prefix and read the file into memory
    if (is_sync_mode_ || is_cst) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
                            no_mem_to_read);
    }
  }
}
  1186. void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
  1187. const std::string &backend_name, const unsigned int device_id,
  1188. const unsigned int root_graph_id, const bool &is_output, size_t slot,
  1189. bool *no_mem_to_read, unsigned int iteration,
  1190. std::vector<std::shared_ptr<TensorData>> *result_list) {
  1191. std::string time_stamp = "";
  1192. std::string type_name = "";
  1193. size_t data_size = 0;
  1194. std::vector<int64_t> shape;
  1195. std::vector<char> *buffer = nullptr;
  1196. if (found) {
  1197. std::string result_path = GetNewestFilePath(matched_paths);
  1198. time_stamp = GetTimeStampStr(result_path);
  1199. std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
  1200. std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
  1201. std::to_string(slot);
  1202. ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
  1203. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
  1204. type_name, shape, buffer, result_list);
  1205. } else {
  1206. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
  1207. buffer, result_list);
  1208. MS_LOG(INFO) << "Target tensor has not been found.";
  1209. }
  1210. }
  1211. void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
  1212. const std::string &backend_name, size_t slot, const unsigned int device_id,
  1213. unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
  1214. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1215. std::string abspath = RealPath(specific_dump_dir);
  1216. DIR *d = opendir(abspath.c_str());
  1217. bool found_file = false;
  1218. std::vector<std::string> matched_paths;
  1219. if (d == nullptr) {
  1220. MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  1221. } else {
  1222. struct dirent *dir = nullptr;
  1223. while ((dir = readdir(d)) != nullptr) {
  1224. struct stat st;
  1225. std::string name = abspath + std::string("/") + std::string(dir->d_name);
  1226. int ret = stat(name.c_str(), &st);
  1227. if (ret != 0) {
  1228. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  1229. (void)closedir(d);
  1230. return;
  1231. }
  1232. if (S_ISREG(st.st_mode)) {
  1233. std::string file_name = dir->d_name;
  1234. std::string stripped_file_name = GetStrippedFilename(file_name);
  1235. if (stripped_file_name.empty()) {
  1236. continue;
  1237. }
  1238. std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
  1239. if (found != 0) {
  1240. continue;
  1241. }
  1242. std::string full_path = specific_dump_dir + "/" + file_name;
  1243. matched_paths.push_back(full_path);
  1244. found_file = true;
  1245. }
  1246. }
  1247. (void)closedir(d);
  1248. }
  1249. ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
  1250. no_mem_to_read, iteration, result_list);
  1251. }
  1252. void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  1253. const std::string &slot_string_to_check, const std::string &backend_name,
  1254. size_t slot, unsigned int device_id, unsigned int iteration,
  1255. unsigned int root_graph_id, const bool &is_output,
  1256. const AsyncFilePool &async_file_pool,
  1257. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1258. bool found = false;
  1259. std::vector<std::string> matched_paths;
  1260. // if async mode
  1261. for (const std::string &file_path : async_file_pool) {
  1262. std::string file_name_to_check = file_path;
  1263. auto delim = file_path.rfind("/");
  1264. if (delim != std::string::npos) {
  1265. file_name_to_check = file_path.substr(delim + 1);
  1266. }
  1267. if (file_path.find(specific_dump_dir) != std::string::npos &&
  1268. file_name_to_check.find(prefix_dump_to_check) != std::string::npos &&
  1269. file_name_to_check.find(slot_string_to_check) != std::string::npos) {
  1270. matched_paths.push_back(file_path);
  1271. found = true;
  1272. }
  1273. }
  1274. ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
  1275. iteration, result_list);
  1276. }
// Strips the task_id, stream_id and timestamp fields out of a dump filename
// so it can be compared against a "<node>.<output|input>.<slot>" prefix.
// Returns "" when the name does not contain enough '.' separators.
// NOTE(review): despite the names, dots are located from the BACK of the
// string — "seventh_dot" is the 2nd-from-last dot and "fifth_dot" the
// 4th-from-last; the naming presumably counts forward in a fully-formed
// filename. Confirm against the dump filename format before refactoring.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  // walk three more dots backwards from fifth_dot (5 -> 4 -> 3)
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  if (second_dot == std::string::npos || second_dot <= first_dot) {
    return std::string();
  }
  // node name (between first and second dot) + ".<output|input>.<slot>" tail
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
  1299. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(unsigned int iteration,
  1300. AsyncFilePool *const async_file_pool,
  1301. bool error_on_no_value) {
  1302. // get a list of nodes and the devices they are on to monitor
  1303. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1304. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
  1305. GetAllWpNodes();
  1306. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  1307. // as they are found
  1308. for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
  1309. std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
  1310. uint32_t rank_id = std::get<0>(rank_and_graph);
  1311. uint32_t root_graph_id = std::get<1>(rank_and_graph);
  1312. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
  1313. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  1314. std::string real_dump_dir = RealPath(specific_dump_dir);
  1315. if (real_dump_dir.empty()) {
  1316. MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skit it.";
  1317. continue;
  1318. }
  1319. std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
  1320. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  1321. // convert node names to dump style
  1322. for (auto node : wp_nodes) {
  1323. std::string orig_name = std::get<0>(node);
  1324. // Remove the scope from the fully qualified name to compare for both sync and async case.
  1325. std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
  1326. bool node_is_out = std::get<1>(node);
  1327. if (node_is_out) {
  1328. dump_style_name += ".output";
  1329. } else {
  1330. dump_style_name += ".input";
  1331. }
  1332. if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
  1333. std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
  1334. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  1335. }
  1336. }
  1337. if (is_sync_mode_) {
  1338. // search files in dir for the one that meets the filename prefix and read the file into memory
  1339. ProcessTensorDataSync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
  1340. error_on_no_value);
  1341. } else {
  1342. // convert all files in proto_to_dump to npy and add to pool of async file names
  1343. ConvertWatchPointNodes(proto_to_dump, real_dump_dir, async_file_pool);
  1344. GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
  1345. &tensor_list);
  1346. }
  1347. }
  1348. return tensor_list;
  1349. }
  1350. void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
  1351. const std::string &specific_dump_dir, unsigned int iteration,
  1352. unsigned int device_id, unsigned int root_graph_id,
  1353. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  1354. bool error_on_no_value) {
  1355. DIR *d = opendir(specific_dump_dir.c_str());
  1356. if (d == nullptr) {
  1357. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ProcessTensorDataSync.";
  1358. } else {
  1359. struct dirent *dir = nullptr;
  1360. while ((dir = readdir(d)) != nullptr) {
  1361. struct stat st;
  1362. std::string name = specific_dump_dir + std::string("/") + std::string(dir->d_name);
  1363. int ret = stat(name.c_str(), &st);
  1364. if (ret != 0) {
  1365. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  1366. (void)closedir(d);
  1367. return;
  1368. }
  1369. if (S_ISREG(st.st_mode)) {
  1370. std::string file_name = dir->d_name;
  1371. for (auto &node : proto_to_dump) {
  1372. std::string dump_name = std::get<1>(node);
  1373. std::string stripped_file_name = GetStrippedFilename(file_name);
  1374. if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
  1375. continue;
  1376. }
  1377. std::size_t found = stripped_file_name.rfind(dump_name, 0);
  1378. if (found == 0) {
  1379. size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
  1380. std::vector<int64_t> shape;
  1381. std::string orig_name = std::get<0>(node);
  1382. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  1383. bool output_flag = (output_str == "output");
  1384. AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
  1385. nullptr, tensor_list);
  1386. break;
  1387. }
  1388. }
  1389. }
  1390. }
  1391. (void)closedir(d);
  1392. }
  1393. }
  1394. std::string DebugServices::IterationString(unsigned int iteration) {
  1395. std::string iteration_string;
  1396. bool init_dbg_suspend = (iteration == UINT_MAX);
  1397. if (init_dbg_suspend) {
  1398. iteration_string = "init";
  1399. } else {
  1400. iteration_string = std::to_string(iteration);
  1401. }
  1402. return iteration_string;
  1403. }
  1404. #endif
// Looks up each tensor in `name` via the tensor loader and appends its name,
// raw data pointer, byte size, dtype and shape to the parallel output
// vectors. Tensors the loader does not return are skipped.
void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
                                     std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
                                     std::vector<unsigned int> *const dtype,
                                     std::vector<std::vector<int64_t>> *const shape) {
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  tensor_loader_->SearchTensors(name, &result_list);
  for (auto result : result_list) {
    if (std::get<1>(result) == nullptr) {
      continue;
    }
#ifdef ONLINE_DBG_MODE
    if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
      MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
                   << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId()
                   << ".";
      MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
      // NOTE(review): the log claims the tensor is not read, but control
      // falls through and the tensor IS appended below — a `continue` looks
      // missing here. Confirm intended behavior before changing.
    }
#endif
    (void)ret_name->emplace_back(std::get<0>(result));
    (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
    (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
    (void)dtype->emplace_back(std::get<1>(result)->GetType());
    (void)shape->emplace_back(std::get<1>(result)->GetShape());
  }
}
  1430. void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
  1431. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  1432. if (result_list == nullptr) {
  1433. MS_LOG(DEBUG) << "result_list is nullptr.";
  1434. return;
  1435. }
  1436. tensor_loader_->SearchTensors(name, result_list);
  1437. }
  1438. #ifdef ONLINE_DBG_MODE
  1439. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1440. bool ret = false;
  1441. for (auto w_table_item : watchpoint_table_) {
  1442. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1443. for (auto check_node : check_node_list) {
  1444. std::string w_name = std::get<0>(check_node);
  1445. bool w_type = std::get<1>(check_node);
  1446. if ((w_type == true &&
  1447. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1448. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1449. ret = true;
  1450. return ret;
  1451. }
  1452. }
  1453. }
  1454. return ret;
  1455. }
  1456. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1457. if (kernel != nullptr && w_name.length() > 0) {
  1458. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  1459. for (size_t j = 0; j < input_size; ++j) {
  1460. auto input_kernel = kernel->input(j + 1);
  1461. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1462. auto found = w_name.find_last_of('/');
  1463. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1464. return true;
  1465. }
  1466. return false;
  1467. } else {
  1468. return false;
  1469. }
  1470. }
  1471. #endif
// Returns all tensors currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Returns the named tensor from the loader (nullptr semantics are the
// loader's; presumably null when absent — confirm in TensorLoader).
std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
  return tensor_loader_->GetTensor(tensor_name);
}
// Drops all tensors from the loader's current-iteration map.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1477. #ifdef ONLINE_DBG_MODE
// Forwards a dump-to-file request for one tensor slot to the tensor loader
// and returns the loader's success flag.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  1485. #endif
// Inserts `tensor` into the loader; keep_prev controls whether the previous
// iteration's copy is retained. Returns the loader's success flag.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
  1489. uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  1490. uint32_t prev_iter;
  1491. uint32_t rank_id = tensor->GetDeviceId();
  1492. uint32_t root_graph_id = tensor->GetRootGraphId();
  1493. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  1494. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  1495. return UINT32_MAX;
  1496. }
  1497. auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
  1498. tensor->GetIteration());
  1499. if (it == graphs_run_history_[rank_and_graph].end()) {
  1500. // The graph is not executed in that iteration
  1501. return UINT32_MAX;
  1502. } else if (it == graphs_run_history_[rank_and_graph].begin()) {
  1503. // current iteration is the first iteration that the graph was run
  1504. // no prev iter is available
  1505. MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
  1506. << " is the first run iteration for tensor: " << tensor->GetName();
  1507. return UINT32_MAX;
  1508. }
  1509. it--;
  1510. prev_iter = *it;
  1511. tensor->SetPrevIteration(prev_iter);
  1512. return prev_iter;
  1513. }
// Clears per-run watchpoint/overflow caches and rotates the tensor loader's
// maps: parameters move current -> prev, the current map is emptied, then the
// maps are swapped so parameters survive into the next run. Call order of the
// three loader operations is significant.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1523. #ifdef ONLINE_DBG_MODE
  1524. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1525. MS_EXCEPTION_IF_NULL(kernel);
  1526. std::vector<std::shared_ptr<TensorData>> result;
  1527. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1528. auto kernel_name = GetKernelNodeName(kernel);
  1529. for (size_t j = 0; j < output_size; ++j) {
  1530. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1531. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1532. if (tensor != nullptr) {
  1533. result.push_back(tensor);
  1534. }
  1535. }
  1536. return result;
  1537. }
  1538. #endif
// Returns the operator-overflow dump directory for the current iteration.
// Only meaningful in online debugger builds; returns "" when dump config,
// the current graph, or the real path is unavailable (and always "" in
// offline builds, where the body is compiled out).
std::string GetOnlineOpOverflowDir() {
  // only called for online debugger mode
  // get operator overflow directory for current iteration
  std::string overflow_bin_path = "";
#ifdef ONLINE_DBG_MODE
  if (DumpJsonParser::GetInstance().path().empty()) {
    MS_LOG(INFO) << "Dump config is not set.";
    return "";
  }
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  auto cur_graph = debugger->GetGraphPtr();
  if (cur_graph == nullptr) {
    return "";
  }
  overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(cur_graph->root_graph_id());
  auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
  if (!realpath.has_value()) {
    MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
    return "";
  }
  // trailing '/' so callers can append filenames directly
  overflow_bin_path = realpath.value() + '/';
#endif
  return overflow_bin_path;
}
// Scans overflow_bin_path and fills op_names with operators that overflowed.
// Files named "Opdebug.Node_OpDebug.<task>.<stream>..." record overflow hits
// as (task_id, stream_id); other dump files map (task_id, stream_id) to a
// node name. The two sets are joined at the end to resolve hit op names.
void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, std::vector<std::string> *op_names) {
  MS_EXCEPTION_IF_NULL(op_names);
  std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  DIR *d = opendir(overflow_bin_path.c_str());
  if (d == nullptr) {
    MS_LOG(INFO) << "OverFlow bin directory does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      struct stat st;
      std::string name = overflow_bin_path + std::string("/") + std::string(dir->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d);
        return;
      }
      if (S_ISREG(st.st_mode)) {
        // form fully qualified filename
        std::string file_path = name;
        std::string file_name = dir->d_name;
        // attempt to read the file
        // NOTE(review): the file's contents are never read below — both
        // parsers work on the filename only, so the open appears to serve as
        // a readability check. Confirm intent before removing.
        std::ifstream infile;
        infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
        if (!infile.is_open()) {
          MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
          continue;
        }
        std::string node_name;
        uint64_t task_id = 0;
        uint64_t stream_id = 0;
        // detect overflow bin file
        if (file_name.rfind(overflow_file_prefix, 0) == 0) {
          if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
            continue;
          }
          MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
                       << ".";
          task_stream_hit.push_back(std::make_pair(task_id, stream_id));
        } else {
          // regular bin file: remember which op ran on this (task, stream)
          bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
          if (success_parse) {
            task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
          }
        }
        infile.close();
      }
    }
    (void)closedir(d);
  }
  // find the op_names with an overflow hit
  for (auto &task_stream : task_stream_hit) {
    auto op_name = task_stream_to_opname[task_stream];
    if (!op_name.empty()) {
      MS_LOG(INFO) << "Operation overflow detected in " << op_name;
      op_names->push_back(op_name);
    }
  }
}
  1627. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1628. unsigned int iteration) {
  1629. std::string overflow_bin_path = "";
  1630. #ifdef ONLINE_DBG_MODE
  1631. overflow_bin_path = GetOnlineOpOverflowDir();
  1632. #else
  1633. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1634. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1635. overflow_bin_path = RealPath(overflow_bin_path);
  1636. #endif
  1637. if (overflow_bin_path.empty()) {
  1638. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1639. return false;
  1640. }
  1641. // remove kernel_graph_#
  1642. std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
  1643. std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
  1644. // remove path
  1645. size_t last_slash = node_name_to_find.rfind("/");
  1646. std::string op_name_find = "";
  1647. if (last_slash != std::string::npos) {
  1648. op_name_find = node_name_to_find.substr(last_slash + 1);
  1649. }
  1650. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1651. std::vector<std::string> op_names;
  1652. overflow_wp_lock_.lock();
  1653. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1654. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1655. if (found_overflows != overflow_ops_.end()) {
  1656. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1657. op_names = overflow_ops_[overflow_bin_path];
  1658. } else {
  1659. AddOpOverflowOpNames(overflow_bin_path, &op_names);
  1660. overflow_ops_[overflow_bin_path] = op_names;
  1661. }
  1662. overflow_wp_lock_.unlock();
  1663. // determine if overflow wp has been triggered for the op name with path (from bin file)
  1664. if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
  1665. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1666. return true;
  1667. }
  1668. // determine if overflow wp has been triggered for the op name (from npy file)
  1669. if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
  1670. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1671. return true;
  1672. }
  1673. return false;
  1674. }
  1675. std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) {
  1676. std::string op_name_to_find = node_name_to_find;
  1677. const std::string kernel_prefix = "kernel_graph_";
  1678. if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
  1679. auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
  1680. if (start_of_op_name != std::string::npos) {
  1681. op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
  1682. }
  1683. }
  1684. return op_name_to_find;
  1685. }
  1686. bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *task_id,
  1687. uint64_t *stream_id) {
  1688. size_t task_pos_start = overflow_file_prefix.length();
  1689. size_t task_pos_end = file_name.find(".", task_pos_start);
  1690. if (task_pos_end == std::string::npos) {
  1691. MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
  1692. return false;
  1693. }
  1694. size_t stream_pos_start = task_pos_end + 1;
  1695. size_t stream_pos_end = file_name.find(".", stream_pos_start);
  1696. if (stream_pos_end == std::string::npos) {
  1697. MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
  1698. return false;
  1699. }
  1700. std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
  1701. std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
  1702. *task_id = std::stoull(task_id_str);
  1703. *stream_id = std::stoull(stream_id_str);
  1704. return true;
  1705. }
  1706. bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
  1707. uint64_t *stream_id) {
  1708. // get the node_name, task_id, and stream_id from dump filename
  1709. // node_type.node_name.task_id.stream_id.{etcetera}
  1710. size_t first_dot = file_name.find(".");
  1711. size_t second_dot = file_name.find(".", first_dot + 1);
  1712. size_t third_dot = file_name.find(".", second_dot + 1);
  1713. size_t fourth_dot = file_name.find(".", third_dot + 1);
  1714. // check if dots were found
  1715. if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
  1716. fourth_dot == std::string::npos) {
  1717. return false;
  1718. }
  1719. // get node_name
  1720. if (first_dot < second_dot) {
  1721. *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  1722. } else {
  1723. MS_LOG(ERROR) << "filename parse error to get node_name.";
  1724. return false;
  1725. }
  1726. // get task id
  1727. if (second_dot < third_dot) {
  1728. std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
  1729. try {
  1730. *task_id = std::stoull(extracted_task_id);
  1731. } catch (std::invalid_argument &e) {
  1732. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
  1733. return false;
  1734. } catch (std::out_of_range &e) {
  1735. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
  1736. return false;
  1737. }
  1738. } else {
  1739. MS_LOG(ERROR) << "filename parse error to get task_id.";
  1740. return false;
  1741. }
  1742. // get stream id
  1743. if (third_dot < fourth_dot) {
  1744. std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
  1745. try {
  1746. *stream_id = std::stoull(extracted_stream_id);
  1747. } catch (std::invalid_argument &e) {
  1748. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
  1749. return false;
  1750. } catch (std::out_of_range &e) {
  1751. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
  1752. return false;
  1753. }
  1754. } else {
  1755. MS_LOG(ERROR) << "filename parse error to get stream_id.";
  1756. return false;
  1757. }
  1758. return true;
  1759. }
// Resolves input_path to an absolute path. When input_path contains a '/',
// only the directory part must already exist (the file itself may not exist
// yet): the resolved directory plus the original file name is returned, or ""
// when the directory is missing. A bare file name is resolved directly (and
// may legitimately not exist yet). Throws when the path or file name length
// exceeds PATH_MAX / NAME_MAX.
std::string DebugServices::RealPath(const std::string &input_path) {
  if (input_path.length() >= PATH_MAX) {
    MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  }
  size_t path_split_pos = input_path.find_last_of('/');
  // get real path
  char real_path[PATH_MAX] = {0};
  // input_path is dir + file_name
  if (path_split_pos != std::string::npos) {
    std::string prefix_path = input_path.substr(0, path_split_pos);
    // file_name keeps its leading '/' so it can be appended to the dir below
    std::string file_name = input_path.substr(path_split_pos);
    if (file_name.length() > NAME_MAX) {
      MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
    }
    if (realpath(prefix_path.c_str(), real_path) == nullptr) {
      MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
      return "";
    }
    return std::string(real_path) + file_name;
  }
  // input_path is only file_name
  if (input_path.length() > NAME_MAX) {
    MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  }
  if (realpath(input_path.c_str(), real_path) == nullptr) {
    MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  }
  return std::string(real_path);
}
// Interprets the first 8 bytes of `buffer` as a little-endian uint64 and
// converts it to host byte order (on macOS, where le64toh is unavailable, the
// raw load is returned — presumably relying on the platform being
// little-endian).
// NOTE(review): the reinterpret_cast load assumes buffer.data() is suitably
// aligned for uint64_t and at least 8 bytes long; memcpy would avoid the
// alignment assumption — confirm callers before relying on this.
uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
#if defined(__APPLE__)
  return *reinterpret_cast<const uint64_t *>(buffer.data());
#else
  return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
#endif
}
// True when `tensor_name` is present in the loader's current-iteration map.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Moves a tensor from the loader's current-iteration map to its
// previous-iteration map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Queues a tensor for cache eviction; no-op unless the loader's memory
// control is enabled.
void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  if (tensor_loader_->EnableMemoryControl()) {
    tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  }
}
// Trivial accessors for debugger configuration state.
void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
std::string DebugServices::GetNetName() { return net_name_; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir_; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode_; }
// Forwards the tensor-cache memory cap to the tensor loader.
void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  1814. #ifdef ONLINE_DBG_MODE
  1815. } // namespace mindspore
  1816. #endif