You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 102 kB

5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243
  1. /**
  2. * Copyright 2019-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
#include "debug/debug_services.h"
#include <dirent.h>
#include <sys/stat.h>
#include <cerrno>
#include <algorithm>
#include <functional>
#include <fstream>
#include <future>
#include <thread>
#include <iterator>
#include <map>
#include <numeric>
#include <limits>
#include <unordered_set>
#include <utility>
#include <regex>
#include <iomanip>
#include "pybind11/embed.h"
#include "pybind11/stl.h"
#ifdef ONLINE_DBG_MODE
#include "include/common/debug/common.h"
#include "debug/debugger/debugger.h"
#include "include/common/debug/anf_dump_utils.h"
#include "include/common/utils/anfalgo.h"
#endif
#include "debug/utils.h"
#include "nlohmann/json.hpp"
#include "debug/debugger/tensor_summary.h"
#include "utils/file_utils.h"
  43. namespace mindspore {
namespace {
// Filename prefix carried by dump files of constant ("Default--data-...") nodes.
static constexpr const char constant_prefix[] = "Default--data-";
// File extension of numpy tensor dump files.
static constexpr const char kNpyExt[] = ".npy";
// Presumably the factor for converting milliseconds to seconds in timing logs
// (value 1000) — NOTE(review): confirm at the use site, not visible in this chunk.
constexpr float ms_to_s = 1000.0;
// Number of decimal digits used when printing elapsed times.
constexpr int precision = 2;
// Interval between watchpoint-progress reports.
// NOTE(review): unit (seconds vs. iterations) is not visible here — confirm at the use site.
static constexpr int32_t wp_progress_period = 300;
// Success value of the platform's strerror_r-style API: int 0 on Apple, null
// char* elsewhere. NOTE(review): presumably compared against strerror_r results — verify at the use site.
#ifdef __APPLE__
constexpr int kStrErrorNone = 0;
#else
constexpr char *kStrErrorNone = nullptr;
#endif
}  // namespace
  56. bool IsRegFile(const std::string &file_path) {
  57. struct stat st;
  58. int ret = stat(file_path.c_str(), &st);
  59. if (ret != 0) {
  60. MS_LOG(ERROR) << "stat error for " << file_path << ", ret is: " << ret;
  61. return false;
  62. }
  63. return S_ISREG(st.st_mode);
  64. }
// Default constructor: creates a fresh TensorLoader owned by this instance.
DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
// Copy constructor: shares other's TensorLoader (shallow shared_ptr copy) and
// copies the watchpoint bookkeeping and dump configuration.
// NOTE(review): copies more members (wp_id_cache_, net_name_, dump_dir_,
// is_sync_mode_) than operator= below — confirm the asymmetry is intentional.
DebugServices::DebugServices(const DebugServices &other) {
  wp_id_cache_ = other.wp_id_cache_;
  net_name_ = other.net_name_;
  dump_dir_ = other.dump_dir_;
  is_sync_mode_ = other.is_sync_mode_;
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table_ = other.watchpoint_table_;
}
// Copy assignment: only the TensorLoader handle and the watchpoint table are
// taken from other; all other members keep their current values.
// NOTE(review): intentionally narrower than the copy constructor? confirm.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {  // guard against self-assignment
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
  81. /*
  82. * Feature group: Online debugger, Offline debugger.
  83. * Target device group: Ascend, GPU.
  84. * Runtime category: Old runtime, MindRT.
  85. * Description: Create a watchpoint_t object and set the watchpoint's variables and add the watchpoint to the
  86. * watchpoint_table.
  87. */
void DebugServices::AddWatchpoint(
  int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list,
  const std::vector<parameter_t> &parameter_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  // Serialize table mutation against concurrent Add/Remove calls.
  std::lock_guard<std::mutex> lg(lock_);
  watchpoint_t watchpoint_item;
  watchpoint_item.id = id;
  // The caller passes the condition as a plain int; map it onto the enum.
  watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  watchpoint_item.condition.parameter = parameter;
  watchpoint_item.check_node_list = check_node_list;
  // For offline debugger check_node_device_list is not nullptr.
  if (check_node_device_list != nullptr) {
    watchpoint_item.check_node_device_list = *check_node_device_list;
  }
  // For offline debugger check_node_graph_list is not nullptr.
  if (check_node_graph_list != nullptr) {
    watchpoint_item.check_node_graph_list = *check_node_graph_list;
  }
  watchpoint_item.parameter_list = parameter_list;
  // Overwrites any existing watchpoint registered under the same id.
  watchpoint_table_[id] = watchpoint_item;
}
// Removes the watchpoint registered under `id`; a no-op if the id is absent.
void DebugServices::RemoveWatchpoint(unsigned int id) {
  std::lock_guard<std::mutex> lg(lock_);  // same lock as AddWatchpoint
  (void)watchpoint_table_.erase(id);
}
  114. /*
  115. * Feature group: Online debugger, Offline debugger.
  116. * Target device group: Ascend, GPU.
  117. * Runtime category: Old runtime, MindRT.
  118. * Description: Returns a tensor summary unique pointer based on the given tensor_dtype, returns nullptr if the type is
  119. * not supported.
  120. */
  121. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  122. const void *const previous_tensor_ptr, uint64_t num_elements,
  123. uint64_t prev_num_elements, int tensor_dtype) {
  124. MS_EXCEPTION_IF_NULL(tensor);
  125. switch (tensor_dtype) {
  126. case DbgDataType::DT_UINT8: {
  127. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  128. prev_num_elements);
  129. }
  130. case DbgDataType::DT_INT8: {
  131. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  132. prev_num_elements);
  133. }
  134. case DbgDataType::DT_UINT16: {
  135. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  136. prev_num_elements);
  137. }
  138. case DbgDataType::DT_INT16: {
  139. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  140. prev_num_elements);
  141. }
  142. case DbgDataType::DT_UINT32: {
  143. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  144. prev_num_elements);
  145. }
  146. case DbgDataType::DT_INT32:
  147. case DbgDataType::DT_BASE_INT: {
  148. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  149. prev_num_elements);
  150. }
  151. case DbgDataType::DT_UINT64: {
  152. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  153. prev_num_elements);
  154. }
  155. case DbgDataType::DT_INT64: {
  156. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  157. prev_num_elements);
  158. }
  159. case DbgDataType::DT_FLOAT16: {
  160. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  161. prev_num_elements);
  162. }
  163. case DbgDataType::DT_FLOAT32:
  164. case DbgDataType::DT_BASE_FLOAT: {
  165. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  166. prev_num_elements);
  167. }
  168. case DbgDataType::DT_FLOAT64: {
  169. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  170. prev_num_elements);
  171. }
  172. case DbgDataType::DT_BOOL: {
  173. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  174. prev_num_elements);
  175. }
  176. default:
  177. MS_LOG(INFO) << "Unsupported tensor type";
  178. // return a null pointer
  179. return std::unique_ptr<TensorSummary<int32_t>>{};
  180. }
  181. }
  182. /*
  183. * Feature group: Online debugger, Offline debugger.
  184. * Target device group: Ascend, GPU.
  185. * Runtime category: Old runtime, MindRT.
  186. * Description: Returns TensorStat for the given tensor based on the base_summary_ptr.
  187. */
  188. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  189. if (tensor == nullptr) {
  190. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  191. TensorStat empty_tensor_stat_data;
  192. return empty_tensor_stat_data;
  193. }
  194. std::unique_ptr<ITensorSummary> base_summary_ptr;
  195. void *previous_tensor_ptr = nullptr;
  196. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  197. if (base_summary_ptr == nullptr) {
  198. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  199. TensorStat empty_tensor_stat_data;
  200. return empty_tensor_stat_data;
  201. }
  202. base_summary_ptr->TensorStatistics(tensor->GetType());
  203. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  204. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  205. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  206. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  207. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  208. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  209. return tensor_stat_data;
  210. }
  211. #ifdef OFFLINE_DBG_MODE
  212. /*
  213. * Feature group: Offline debugger.
  214. * Target device group: Ascend, GPU.
  215. * Runtime category: Old runtime, MindRT.
  216. * Description: Returns previous_tensor_ptr if graph hisotry file is found and the current iteration is not the first
  217. * run iteration for tensor's graph.
  218. */
  219. const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
  220. uint64_t *prev_num_elements, bool *history_not_found) {
  221. MS_EXCEPTION_IF_NULL(tensor);
  222. const void *previous_tensor_ptr = nullptr;
  223. std::shared_ptr<TensorData> tensor_prev;
  224. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
  225. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  226. *history_not_found = 1;
  227. MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
  228. } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
  229. // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
  230. // read data in offline mode
  231. NPYFilePool file_paths;
  232. ProcessedNPYFiles processed_npy_files;
  233. if (!is_sync_mode_) {
  234. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  235. std::vector<unsigned int>{tensor->GetDeviceId()},
  236. std::vector<unsigned int>{tensor->GetPrevIteration()},
  237. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  238. processed_npy_files = ProcessNPYFilePool(file_paths);
  239. }
  240. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  241. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  242. std::vector<unsigned int>{tensor->GetDeviceId()},
  243. std::vector<unsigned int>{tensor->GetPrevIteration()},
  244. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  245. &processed_npy_files, &result_list_prev);
  246. tensor_prev = result_list_prev[0];
  247. if (!tensor_prev->GetByteSize()) {
  248. tensor_prev.reset();
  249. } else {
  250. previous_tensor_ptr = tensor_prev->GetDataPtr();
  251. *prev_num_elements = tensor_prev->GetNumElements();
  252. }
  253. }
  254. return previous_tensor_ptr;
  255. }
  256. #endif
  257. /*
  258. * Feature group: Offline debugger, Online debugger.
  259. * Target device group: Ascend, GPU.
  260. * Runtime category: Old runtime, MindRT.
  261. * Description: Goes through all the watchpoints in the watchpoint table. If the current tensor is in the list of
  262. * check_nodes, that watchpoint is added to the vector of watchpoint_to_check (vector of watchpoints that should be
  263. * checked for the current tensor) .
  264. */
  265. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  266. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  267. std::string *const qualified_tensor_name,
  268. std::vector<watchpoint_t> *const watchpoints_to_check) {
  269. if (tensor == nullptr) {
  270. MS_LOG(DEBUG) << "tensor is nullptr.";
  271. return;
  272. }
  273. const auto tensor_name = tensor->GetName();
  274. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  275. const auto tensor_device_id = tensor->GetDeviceId();
  276. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  277. for (auto w_table_item : watchpoint_table_) {
  278. auto wp = std::get<1>(w_table_item);
  279. // check ONLY init conditions on initial suspended state.
  280. // skip other conditions on initial suspended state
  281. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  282. continue;
  283. }
  284. // skip init condition if not init suspend
  285. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  286. continue;
  287. }
  288. // check change conditions only on step end.
  289. if (wp.change_condition() && !step_end) {
  290. continue;
  291. }
  292. // if recheck, ignore the cache results and reanalyze everything.
  293. // if not a recheck, check only unanalyzed tensors
  294. if (!recheck) {
  295. wp_lock_.lock();
  296. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  297. wp_lock_.unlock();
  298. if (wp_cache_hit) {
  299. continue;
  300. }
  301. }
  302. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  303. if (!found.empty()) {
  304. *qualified_tensor_name = found;
  305. watchpoints_to_check->push_back(w_table_item.second);
  306. #ifdef OFFLINE_DBG_MODE
  307. if (wp.change_condition()) {
  308. *previous_iter_tensor_needed = true;
  309. }
  310. #endif
  311. }
  312. }
  313. }
  314. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  315. const std::string &tensor_name) {
  316. // add analyzed tensor to cache
  317. if (!recheck) {
  318. wp_lock_.lock();
  319. (void)wp_id_cache_[tensor_name].insert(id);
  320. wp_lock_.unlock();
  321. }
  322. }
/*
 * Appends one watchpoint-hit record (tensor name/slot, condition, watchpoint id,
 * parameters, error code, exec order, timestamp, and optionally the device and
 * root-graph ids) to the per-chunk result vectors at index chunk_id.
 * NOTE(review): no locking here — presumably each worker thread owns exactly one
 * chunk_id, making the writes race-free; confirm against the caller.
 */
void DebugServices::SetCheckWatchpointsResult(
  const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  // device_id / root_graph_id act only as "caller wants these fields" flags;
  // the recorded values come from device_id_val / root_graph_id_val.
  if (device_id != nullptr) {
    (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  }
  if (root_graph_id != nullptr) {
    (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  }
  (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
}
  348. #ifdef OFFLINE_DBG_MODE
  349. /*
  350. * Feature group: Offline debugger.
  351. * Target device group: Ascend, GPU.
  352. * Runtime category: Old runtime, MindRT.
  353. * Description: Sets and checks the OUT_OF_MEMORY error_code (for memory limit feature) and NO_VALUE error_code (for
  354. * new python API feature). Sets checkwatchpoint results.
  355. */
/*
 * Feature group: Offline debugger.
 * When a tensor could not be read — either because it exceeded the memory limit
 * (no_mem_to_read) or produced no value (error_on_no_value) — records one
 * result per pending watchpoint carrying the corresponding error code, so the
 * failure is surfaced instead of silently dropping the watchpoints.
 */
void DebugServices::CheckOutofMemoryandNoValue(
  const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
  int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list) {
  bool set_is_needed = no_mem_to_read || error_on_no_value;
  int32_t error_code_to_set = 0;
  // no_mem_to_read takes precedence over error_on_no_value when both are set.
  if (no_mem_to_read) {
    // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
    error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
  } else if (error_on_no_value) {
    error_code_to_set = ITensorSummary::NO_VALUE;
  }
  if (set_is_needed) {
    // One error record per watchpoint that was waiting on this tensor.
    for (auto &wp : watchpoints_to_check) {
      SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
                                chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
                                chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
                                qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
                                parameter_list, error_code_to_set);
    }
  }
}
  385. /*
  386. * Feature group: Offline debugger.
  387. * Target device group: Ascend, GPU.
  388. * Runtime category: Old runtime, MindRT.
  389. * Description: After finishing checking watchpoint, set the tensor to not-in-use status (for memory control
  390. * feature) by pushing it to eviction candidate queue. So it can be evicted from memory anytime if the memory is
  391. * required by other nodes' checking. If previous_tensor exists, change their status in a pair.
  392. */
  393. void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
  394. // set the tensor into not-in-use status in tensor_loader.
  395. auto tensor_name = tensor->GetName();
  396. std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
  397. std::to_string(tensor->GetRootGraphId()) + ":" +
  398. std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
  399. AppendToCacheEvictQueue(key_name_in_cache);
  400. if (previous_tensor_ptr != nullptr) {
  401. AppendToCacheEvictQueue(key_name_in_cache + ":prev");
  402. }
  403. }
  404. #endif
  405. #ifdef ONLINE_DBG_MODE
  406. /*
  407. * Feature group: Online debugger.
  408. * Target device group: Ascend, GPU.
  409. * Runtime category: Old runtime, MindRT.
  410. * Description: Compares the current root graph id with the given graph id and returns false if they are not equal
  411. * for GPU mindRT and Ascend. Otherwise, it returns true. The objectives of this function are: 1) Check if tensor's
  412. * root_graph_id is different from current_root_graph_id and skip checkwatchpoint for the tensor if these values are
  413. * different. 2) Set prev_tensor_ptr to nullptr if current_root_graph_id is different from prev_root_graph_id. 3) Skip
  414. * reading tensor if tensor's root_graph_id is different from current_root_graph_id.
  415. */
  416. bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
  417. auto debugger = Debugger::GetInstance();
  418. auto ms_context = MsContext::GetInstance();
  419. MS_EXCEPTION_IF_NULL(ms_context);
  420. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  421. auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
  422. if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  423. device_target == kAscendDevice) {
  424. if (cur_root_graph_id != id) {
  425. return false;
  426. }
  427. }
  428. return true;
  429. }
  430. /*
  431. * Feature group: Online debugger.
  432. * Target device group: Ascend, GPU.
  433. * Runtime category: Old runtime, MindRT.
  434. * Description: Returns the previous tensor pointer if the current root graph id is equal to previous root graph id and
  435. * prev_tensor_data is not nullptr.
  436. */
  437. const void *DebugServices::PreparePrevTensor(uint64_t *prev_num_elements, const std::string &tensor_name) {
  438. std::shared_ptr<TensorData> prev_tensor_data;
  439. if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
  440. // not supporting watchpoints that need prev tensor for multi root graph networks.
  441. MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
  442. prev_tensor_data = nullptr;
  443. } else {
  444. prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
  445. }
  446. if (prev_tensor_data) {
  447. *prev_num_elements = prev_tensor_data->GetNumElements();
  448. return prev_tensor_data->GetDataPtr();
  449. }
  450. return nullptr;
  451. }
  452. #endif
/*
 * Overwrites *error_code with HISTORY_NOT_FOUND when the graph-run history for
 * the tensor's graph was missing (flag produced by GetPrevTensor).
 */
void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
  // check history error_code only for offline debugger
  if (history_not_found) {
    *error_code = ITensorSummary::HISTORY_NOT_FOUND;  // error code for history not found
  }
}
  459. /*
  460. * Feature group: Offline debugger, Online debugger.
  461. * Target device group: Ascend, GPU.
  462. * Runtime category: Old runtime, MindRT.
  463. * Description: For all the tensors in the given chunk, reads the tensors, checks all the watchpoints and sets the
  464. * watchpoint hit result. Checkwatchpoint process might be affected by memory limit, whether the read tensor was
  465. * successfully and whether we have a multi root graph scenario. All of aforementioned checks are done in this function.
  466. */
void DebugServices::CheckWatchpointsForTensor(
    partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
    partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
    partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
    const std::vector<std::string> &op_overflows, ProcessedNPYFiles *const processed_npy_files,
    partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
    int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
    partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
    std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
    std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // Clamp the half-open chunk range [begin, end) to the actual tensor list size.
  int list_size = tensor_list->size();
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // Strip the trailing ":slot" suffix to get the bare node name (used for overflow checks).
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    // Collect the watchpoints applicable to this tensor; also learns whether any of them
    // needs the previous-iteration tensor data.
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode: the tensor content is loaded from the dump files on demand.
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     processed_npy_files, &result_list, &no_mem_to_read);
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // The read produced no data (out of memory budget or missing value); record the
      // corresponding error result for every watchpoint on this tensor and move on.
      CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_names,
                                 chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
                                 chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id,
                                 chunk_root_graph_id, device_id, root_graph_id, tensor->GetExecutionOrder(),
                                 tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, tensor->GetDeviceId(),
                                 tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    // Account this tensor's bytes to the chunk so the caller can report the total processed size.
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint64_t num_elements = tensor->GetNumElements();
    uint64_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // In offline mode the previous-iteration tensor is read from dumps; history_not_found is
    // propagated into the per-watchpoint error code below.
    bool history_not_found = 0;
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
#else
    // In online mode, tensors belonging to a different root graph than the current one are skipped.
    if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
      MS_LOG(DEBUG)
        << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
        << tensor->GetName();
      continue;
    }
    previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // Summaries are only needed for non-overflow conditions; skip the (potentially expensive)
    // summarization when the sole watchpoint is an overflow check.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    // Evaluate every applicable watchpoint against the tensor (or its summary).
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
#ifdef OFFLINE_DBG_MODE
        CheckHistoryErrorCode(&error_code, history_not_found);
#endif
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      // Remember that this (watchpoint, tensor) pair has been analyzed so rechecks can skip it.
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    SetTensorToNotInUse(tensor, previous_tensor_ptr);
    // in offline mode remove the need for the data
    tensor.reset();
#endif
    // Progress counter read by CheckWatchpointProgress on the reporting thread.
    (void)tensor_processed_count_.fetch_add(1, std::memory_order_relaxed);
  }
}
  577. /*
  578. * Feature group: Offline debugger, Online debugger.
  579. * Target device group: Ascend, GPU.
  580. * Runtime category: Old runtime, MindRT.
  581. * Description: This function checks the watchpoints for the given tensor list by dividing the tensor list into chunks.
  582. * Each chunk is handled by a separate thread and then the result of check watchpoint for each thread is gathered and
  583. * sorted. In the end, the time for checking the watchpoint in the current step is reported.
  584. */
void DebugServices::CheckWatchpoints(
    std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
    std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
    std::vector<int32_t> *const error_codes, const std::vector<std::string> &op_overflows,
    ProcessedNPYFiles *const processed_npy_files, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
    const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector<unsigned int> *const device_id,
    std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // Serialize whole-step watchpoint checks; worker threads below only touch disjoint chunk slots.
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  size_t tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size == 0) {
    return;
  }
  // Spawn a progress-reporting thread only when INFO logging is on (it is the only consumer).
  if (IS_OUTPUT_ON(INFO)) {
    wp_progress_enabled_ = true;
    wp_progress_thread_ =
      std::make_unique<std::thread>([this, tensor_list_size]() { CheckWatchpointProgress(tensor_list_size); });
  }
  // Use fewer threads when memory control is enabled, since each thread may hold tensor data in memory.
  const size_t thread_num_with_mem = 16;
  const size_t thread_num_without_mem = 32;
  // default value for number of threads
  const size_t default_thread_num =
    tensor_loader_->EnableMemoryControl() ? thread_num_with_mem : thread_num_without_mem;
  size_t max_thread_num = default_thread_num;
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Split the tensor list into near-equal chunks; the first `remainder` chunks get one extra tensor.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // Per-chunk result buffers, merged and sorted by SortWatchpointsInfo after all futures complete.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (size_t i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    // Each async task checks tensors [begin, end) and writes into chunk slot i only.
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, processed_npy_files,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id, error_on_no_value));
    begin = end;
  }
  // Waits on all futures, merges the per-chunk results into the output vectors in sorted order,
  // and accumulates the total byte size.
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << std::fixed << std::setprecision(precision)
               << (ms_double.count()) / ms_to_s << "s";
  // Stop and join the progress thread started above, if any.
  if (IS_OUTPUT_ON(INFO) && wp_progress_thread_ && wp_progress_thread_->joinable()) {
    wp_progress_enabled_ = false;
    wp_progress_thread_->join();
    MS_LOG(INFO) << "Join wp_progress_thread_.";
  }
}
  666. void DebugServices::CheckWatchpointProgress(size_t tensor_list_size) {
  667. while (wp_progress_enabled_ && (tensor_processed_count_ != tensor_list_size)) {
  668. MS_LOG(INFO) << "CheckWatchpoint progress: " << tensor_processed_count_ << " tensor processed out of "
  669. << tensor_list_size;
  670. std::this_thread::sleep_for(std::chrono::milliseconds(wp_progress_period));
  671. }
  672. }
  673. /*
  674. * Feature group: Offline debugger, Online debugger.
  675. * Target device group: Ascend, GPU.
  676. * Runtime category: Old runtime, MindRT.
  677. * Description: Sorts the result of watchpoint hit for the online and offline debugger. This sorting for the online
  678. * debugger is based on the execution order and for the offline debugger is based on the time stamp.
  679. */
void DebugServices::SortWatchpointsInfo(
    std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
    std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
    std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
    std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
    std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
    partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
    partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
    partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
    std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
    partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
    std::vector<unsigned int> *const root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    // Block until chunk i's worker has finished; get() rethrows any exception from the task.
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
      // Exactly one of ONLINE_DBG_MODE / OFFLINE_DBG_MODE is defined; each branch computes the
      // sorted insertion `position` used by all the emplace calls below.
#ifdef ONLINE_DBG_MODE
      // if the execution order is repeated,inserts the new one before the others with same execution order.
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // Offline results are ordered by the dump-file time stamp instead of the execution order.
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
#endif
      // Insert every parallel output vector at the same position to keep them aligned.
      (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
      (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
      (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
      (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
      (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors (swap with an empty vector releases capacity immediately)
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    // Saturating add: clamp the running byte total at UINT64_MAX instead of wrapping around.
    if ((*tensor_list_byte_size) > UINT64_MAX - (*chunk_tensor_byte_size)[i]) {
      MS_LOG(WARNING) << (*tensor_list_byte_size) << " + " << (*chunk_tensor_byte_size)[i]
                      << " would lead to integer overflow!";
      (*tensor_list_byte_size) = UINT64_MAX;
    } else {
      (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
    }
  }
}
  742. #ifdef OFFLINE_DBG_MODE
  743. /*
  744. * Feature group: Offline debugger.
  745. * Target device group: Ascend, GPU.
  746. * Runtime category: Old runtime, MindRT.
  747. * Description: Read tensor info from the given file. If memory control feature is configured to be enabled, it checks
  748. * if the tensor can fit in memory before reading. There are two situations to return false: 1)tensor size is greater
  749. * than the total preset memory limit. 2) Evicting all NOT-In-USE tensors from tensor_list_map_ cannot make enough room
  750. * for the tensor.
  751. */
  752. void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
  753. std::string *const tensor_type, std::size_t *const size,
  754. std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
  755. bool *no_mem_to_read) {
  756. std::ifstream infile;
  757. std::string file_path = file_name;
  758. MS_LOG(INFO) << "Reading in file: " << file_path;
  759. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  760. if (!infile.is_open()) {
  761. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
  762. const int kMaxFilenameLength = 128;
  763. char err_info[kMaxFilenameLength];
  764. auto ret = strerror_r(errno, err_info, sizeof(err_info));
  765. if (ret != kStrErrorNone) {
  766. MS_LOG(ERROR) << " ErrInfo:" << ret;
  767. }
  768. return;
  769. }
  770. const int substr_len = 2;
  771. const int header_len_offset = 8;
  772. const int header_offset = 9;
  773. const int header_len_buffer_size = 2;
  774. const int type_offset = 10;
  775. // get header length
  776. (void)infile.seekg(0, std::ios::beg);
  777. auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  778. if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
  779. MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
  780. return;
  781. }
  782. uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  783. header_len_buffer.reset();
  784. // read in header
  785. (void)infile.seekg(0, std::ios::beg);
  786. auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  787. if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
  788. MS_LOG(ERROR) << "Failed to read header from " << file_path;
  789. return;
  790. }
  791. std::string header(header_buffer->data() + header_offset, header_len);
  792. header_buffer.reset();
  793. std::size_t type_i = header.find("descr") + type_offset;
  794. if (header.length() < type_i + substr_len) {
  795. MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
  796. return;
  797. }
  798. *tensor_type = header.substr(type_i, substr_len);
  799. std::size_t shape_i_open = header.find("(");
  800. std::size_t shape_i_close = header.find(")");
  801. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  802. std::string intermediate;
  803. std::stringstream check_shape(shape_str);
  804. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  805. while (getline(check_shape, intermediate, ',')) {
  806. int64_t shape_d = 0;
  807. if (!CheckStoi(&shape_d, intermediate)) {
  808. MS_LOG(INFO) << "Failed to get the shape from file: " << file_name << ", error in convert the string "
  809. << intermediate << " into an integer.";
  810. return;
  811. }
  812. shape->push_back(shape_d);
  813. }
  814. std::size_t word_size = 0;
  815. if (!CheckStoul(&word_size, std::string(1, (*tensor_type)[1]))) {
  816. MS_LOG(INFO) << "Failed to get the word_size from file: " << file_name << ", error in convert the string "
  817. << (*tensor_type)[1] << " into an integer.";
  818. return;
  819. }
  820. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  821. std::size_t data_size = data_len * word_size;
  822. if (!data_size) {
  823. return;
  824. }
  825. // Check memory available before loading tensor into host.
  826. bool has_enough_memory = true;
  827. if (tensor_loader_->EnableMemoryControl()) {
  828. has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  829. }
  830. if (!has_enough_memory) {
  831. MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
  832. *no_mem_to_read = true;
  833. } else {
  834. (void)infile.seekg(header_len + type_offset);
  835. *data_buffer = new std::vector<char>(data_size);
  836. if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  837. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  838. }
  839. *size = data_size;
  840. }
  841. }
  842. /*
  843. * Feature group: Offline debugger.
  844. * Target device group: Ascend.
  845. * Runtime category: Old runtime, MindRT.
  846. * Description: This function is to convert files in each directory from device format to host format and append the
  847. * converted npy file name into NPYFilePool. It's for Ascend async dump only.
  848. */
void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, NPYFilePool *const result_list) {
  // For each dump directory, convert the listed device-format files to npy via the python
  // convert_async tool, then collect the resulting npy paths into result_list.
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::vector<std::string> files_after_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &item : d.second) {
      std::string file_name = std::get<0>(item);
      std::string file_name_without_scope = std::get<1>(item);
      // skip the file that was converted to npy already.
      if (std::all_of(result_list->begin(), result_list->end(), [&file_name_without_scope](std::string file_found) {
            return file_found.find(file_name_without_scope) == std::string::npos;
          })) {
        // Full path for conversion.
        (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
        (void)files_after_convert_in_dir.emplace_back(file_name_without_scope);
      }
    }
    MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
    if (!files_to_convert_in_dir.empty()) {
      // Look for the installation path to the convert_async package. If not found, throw exception and terminate the
      // later task.
      auto t1 = std::chrono::high_resolution_clock::now();
      {
        // The conversion tool is python; hold the GIL only for the duration of the call.
        pybind11::gil_scoped_acquire acquire;
        try {
          auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
          auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
          (void)convert_obj.attr("convert_files")();
        } catch (pybind11::error_already_set &e) {
          MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
        }
      }
      auto t2 = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double, std::milli> ms_double = t2 - t1;
      MS_LOG(INFO) << "convert files Took: " << std::fixed << std::setprecision(precision)
                   << (ms_double.count()) / ms_to_s << "s";
      // Scan the dump dir for the npy files the converter produced and append them to result_list.
      ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list);
    }
  }
}
  889. /*
  890. * Feature group: Offline debugger.
  891. * Target device group: Ascend.
  892. * Runtime category: Old runtime, MindRT.
  893. * Description: This function is to iterate through dump directory (dump_key) and search all the converted npy files and
  894. * append into NPYFilePool. It's for Ascend async dump only.
  895. */
  896. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  897. const std::string &dump_key, NPYFilePool *const result_list) {
  898. std::string real_dump_iter_dir = RealPath(dump_key);
  899. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  900. if (d_handle == nullptr) {
  901. MS_LOG(INFO) << "Directory " << real_dump_iter_dir << " does not exist in ConvertToHostFormat.";
  902. return;
  903. }
  904. struct dirent *dir = nullptr;
  905. while ((dir = readdir(d_handle)) != nullptr) {
  906. std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
  907. if (!IsRegFile(name)) {
  908. continue;
  909. }
  910. std::string candidate = dir->d_name;
  911. for (const std::string &file_to_find : files_after_convert_in_dir) {
  912. if (candidate.find(file_to_find + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
  913. // we found a converted file for this op
  914. std::string found_file = dump_key + "/" + candidate;
  915. (void)result_list->insert(found_file);
  916. }
  917. }
  918. }
  919. (void)closedir(d_handle);
  920. }
  921. /*
  922. * Feature group: Offline debugger.
  923. * Target device group: Ascend, GPU.
  924. * Runtime category: Old runtime, MindRT.
  925. * Description: Node name string prefixes with scope and separates with slash "/". While the npy files in the tensor
  926. * dump path do not include scope in their name. The objective of this function is to remove scope from the node name to
  927. * match the file.
  928. */
  929. std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  930. if (dump_style_name.empty()) {
  931. return "";
  932. }
  933. std::size_t last_scope_marker;
  934. std::string delim = "/";
  935. last_scope_marker = dump_style_name.rfind(delim);
  936. if (last_scope_marker == std::string::npos) {
  937. return dump_style_name;
  938. }
  939. return dump_style_name.substr(last_scope_marker + delim.size());
  940. }
  941. /*
  942. * Feature group: Offline debugger.
  943. * Target device group: Ascend.
  944. * Runtime category: Old runtime, MindRT.
  945. * Description: This function is to search and prepare the target npy file to be read for each node. If the found file
  946. * is already npy format, push it to NPYFilePool; Otherwise, use conversion tool in convert_async.py to transfer it to
  947. * npy format beforehand.
  948. */
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id, NPYFilePool *const result_list) {
  // For each requested tensor, locate (or schedule conversion of) its dump file: already-npy
  // files go straight into result_list, device-format files are queued in dir_to_files_map and
  // converted in one batch at the end.
  DirMap dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
    // Dump layout: {dump_dir}/rank_{device}/{net_name}/{root_graph}/{iteration}
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // if node name is constant, skip
    if (prefix_dump_file_name.length() > (unsigned)strlen(constant_prefix) &&
        prefix_dump_file_name.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
      continue;
    }
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    // Pre-scan the dump dir once, splitting npy files from device-format (bin) files.
    auto preprocess_async_result = PreProcessDumpDirAsync(abspath);
    bool is_success = std::get<0>(preprocess_async_result);
    if (!is_success) {
      // directory does not exist
      return;
    }
    ProcessConvertList(std::get<1>(preprocess_async_result), prefix_dump_file_name, specific_dump_dir,
                       &dir_to_files_map, result_list);
  }
  // Batch-convert everything queued above and append the produced npy paths to result_list.
  ConvertToHostFormat(dir_to_files_map, result_list);
}
void DebugServices::ConvertWatchPointNodes(const DumpFileMap &dump_dir_mapped_files,
                                           const std::vector<ProtoDump> &proto_dump,
                                           const std::string &specific_dump_dir, NPYFilePool *const result_list) {
  // For every node in the dump proto, queue its device-format files for conversion (and collect
  // already-converted npy files), then convert the queued files in one batch.
  DirMap dir_to_files_map;
  for (const auto &node : proto_dump) {
    std::string dump_name = node.dump_name;
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    // opendir here only verifies the directory exists before processing this node.
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
      return;
    }
    ProcessConvertList(dump_dir_mapped_files, dump_name, specific_dump_dir, &dir_to_files_map, result_list);
    (void)closedir(d);
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
  998. /*
  999. * Feature group: Offline debugger.
  1000. * Target device group: Ascend.
  1001. * Runtime category: Old runtime, MindRT.
  1002. * Description: This function is to search the dump dir and separate npy files from bin files in async dump dir.
  1003. */
DebugServices::AsyncPreProcessResult DebugServices::PreProcessDumpDirAsync(const std::string &specific_dump_dir) {
  // DumpFileMap for each specific dump dir (including rank, graph_id and iteration)
  // Returns (success, map) where the map separates npy files (keyed by op name) from
  // device-format bin files for the given dump directory.
  DumpFileMap dump_dir_mapped_files;
  AsyncPreProcessResult async_result;
  DIR *d = opendir(specific_dump_dir.c_str());
  if (d == nullptr) {
    MS_LOG(ERROR) << "Specific dump dir does not exit for preprocessing: " << specific_dump_dir;
    std::get<0>(async_result) = false;
    std::get<1>(async_result) = dump_dir_mapped_files;
    return async_result;
  }
  struct dirent *dir = nullptr;
  while ((dir = readdir(d)) != nullptr) {
    std::string file_name = dir->d_name;
    std::string file_path = specific_dump_dir + std::string("/") + file_name;
    if (!IsRegFile(file_path)) {
      continue;
    }
    bool is_txt = file_name.rfind(".txt") != std::string::npos;
    if (is_txt) {
      // txt files in dump dir contain the list of failed converted npy files.
      MS_LOG(DEBUG) << "Skipping txt file: " << file_name;
      continue;
    }
    std::string op_name;
    bool is_npy = file_name.rfind(kNpyExt) != std::string::npos;
    auto first_dot = file_name.find('.');
    const int kSeventhFromRight = 7;
    // Walk back over the last 7 dots: npy names end with
    // .{task_id}.{stream_id}.{timestamp}.{in_out}.{slot}.{format}.npy (7 dot-fields).
    // NOTE(review): if the name has fewer dots, pos can become npos and `pos - 1` wraps;
    // rfind then clamps to the full string — confirm all dump file names carry enough dots.
    size_t pos = file_name.rfind(".");
    for (int cnt = 1; cnt < kSeventhFromRight; cnt++) {
      pos = file_name.rfind(".", pos - 1);
    }
    size_t seventh_last_dot = pos;
    if (seventh_last_dot != std::string::npos && first_dot != std::string::npos && seventh_last_dot > first_dot) {
      // name_to_match is between first dot and seventh last dot.
      // if op_type is parameter, the op_name can have dots.
      op_name = file_name.substr(first_dot + 1, seventh_last_dot - first_dot - 1);
    }
    if (is_npy) {
      // push back the file_name with specific dump dir
      (dump_dir_mapped_files[specific_dump_dir].npy_files[op_name]).push_back(file_path);
    } else {
      // push back the file_name without specific dump dir. dump dir is the map key.
      dump_dir_mapped_files[specific_dump_dir].bin_files.push_back(file_name);
    }
  }
  (void)closedir(d);
  std::get<0>(async_result) = true;
  std::get<1>(async_result) = dump_dir_mapped_files;
  return async_result;
}
  1055. /*
  1056. * Feature group: Offline debugger.
  1057. * Target device group: Ascend, GPU.
  1058. * Runtime category: Old runtime, MindRT.
  1059. * Description: This function is to search the dump dir for npy files.
  1060. */
  1061. DebugServices::NPYFilePool DebugServices::PreProcessDumpDirSync(const std::string &specific_dump_dir) {
  1062. // npy format:
  1063. // {dump_path}/{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{output_or_input_string}.{slot}.{format}.npy
  1064. NPYFilePool npy_files;
  1065. DIR *d = opendir(specific_dump_dir.c_str());
  1066. if (d == nullptr) {
  1067. MS_LOG(ERROR) << "Specific dump dir does not exit for preprocessing: " << specific_dump_dir;
  1068. return npy_files;
  1069. }
  1070. struct dirent *dir = nullptr;
  1071. while ((dir = readdir(d)) != nullptr) {
  1072. std::string file_name = dir->d_name;
  1073. std::string file_path = specific_dump_dir + std::string("/") + file_name;
  1074. if (!IsRegFile(file_path)) {
  1075. continue;
  1076. }
  1077. bool is_npy = file_name.rfind(kNpyExt) != std::string::npos;
  1078. if (is_npy) {
  1079. (void)npy_files.insert(file_path);
  1080. }
  1081. }
  1082. (void)closedir(d);
  1083. return npy_files;
  1084. }
// Builds the list of device-format (bin) files matching prefix_dump_file_name that still need conversion
// (into *dir_to_files_map) and copies the already-converted npy paths for that prefix into *result_list.
// Relies on PreProcessDumpDirAsync having populated dump_dir_mapped_files for specific_dump_dir.
void DebugServices::ProcessConvertList(const DumpFileMap &dump_dir_mapped_files,
                                       const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                                       DirMap *dir_to_files_map, NPYFilePool *const result_list) {
  MS_EXCEPTION_IF_NULL(dir_to_files_map);
  auto it = dump_dir_mapped_files.find(specific_dump_dir);
  if (it == dump_dir_mapped_files.end()) {
    // no matched file
    MS_LOG(ERROR) << "Pre-Process is not done correctly for :" << specific_dump_dir;
    return;
  }
  auto bin_files = (it->second).bin_files;
  auto npy_files = (it->second).npy_files;
  for (size_t i = 0; i < bin_files.size(); i++) {
    std::string file_name = bin_files[i];
    std::string file_name_w_o_perfix = file_name;
    // type_pos is the dot terminating the leading op_type component of the file name.
    auto type_pos = file_name.find('.');
    // adding dot to avoid problematic matching in the scope.
    if (type_pos == std::string::npos ||
        file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
      continue;
    }
    // second_dot is the first dot at/after the position where the prefix match begins;
    // the span (type_pos, second_dot) is replaced by the canonical prefix to normalize the name.
    std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
    (void)file_name_w_o_perfix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
    // if file matches prefix and is in device format add to candidate files to convert.
    (*dir_to_files_map)[specific_dump_dir].push_back(std::make_tuple(file_name, file_name_w_o_perfix));
  }
  // Add the already converted npy files to result_list
  if (npy_files.find(prefix_dump_file_name) != npy_files.end()) {
    (void)std::copy(npy_files[prefix_dump_file_name].begin(), npy_files[prefix_dump_file_name].end(),
                    std::inserter(*result_list, result_list->end()));
  }
}
  1117. void DebugServices::GetTensorDataInfoAsync(const std::vector<ProtoDump> &proto_dump,
  1118. const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
  1119. uint32_t root_graph_id, const ProcessedNPYFiles &processed_async_files,
  1120. std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  1121. auto it = processed_async_files.find(specific_dump_dir);
  1122. if (it == processed_async_files.end()) {
  1123. MS_LOG(DEBUG) << "no npy file was found for dump directory: " << specific_dump_dir;
  1124. return;
  1125. }
  1126. auto processed_files_for_dir = it->second;
  1127. for (auto &node : proto_dump) {
  1128. std::vector<size_t> slot_list;
  1129. std::string dump_name = node.dump_name;
  1130. bool output_flag = node.is_output;
  1131. for (const auto &dump_file_attr : processed_files_for_dir) {
  1132. if (dump_file_attr.name_to_match == dump_name && dump_file_attr.is_output == output_flag) {
  1133. slot_list.push_back(dump_file_attr.slot);
  1134. }
  1135. }
  1136. for (auto slot : slot_list) {
  1137. // add a TensorData entry (data will be read when needed)
  1138. std::vector<int64_t> shape;
  1139. std::string orig_name = node.origin_node_name;
  1140. auto tensor_data = std::make_shared<TensorData>();
  1141. tensor_data->SetName(orig_name);
  1142. tensor_data->SetExecutionOrder(0);
  1143. tensor_data->SetSlot(slot);
  1144. tensor_data->SetIteration(iteration);
  1145. tensor_data->SetDeviceId(device_id);
  1146. tensor_data->SetRootGraphId(root_graph_id);
  1147. tensor_data->SetDataPtr(nullptr);
  1148. tensor_data->SetByteSize(0);
  1149. tensor_data->SetType("");
  1150. tensor_data->SetShape(shape);
  1151. tensor_data->SetIsOutput(output_flag);
  1152. tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  1153. tensor_list->push_back(tensor_data);
  1154. }
  1155. }
  1156. }
  1157. /*
  1158. * Feature group: Offline debugger.
  1159. * Target device group: Ascend, GPU.
  1160. * Runtime category: Old runtime, MindRT.
  1161. * Description: This function extracts the attributes like op_name and time stamp from npy file name and is used for
  1162. * both sync and async dump.
  1163. */
DebugServices::ProcessedNPYFiles DebugServices::ProcessNPYFilePool(const NPYFilePool &npy_file_pool) {
  // npy file format: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
  ProcessedNPYFiles processed_files;
  if (npy_file_pool.empty()) {
    MS_LOG(WARNING) << "ProcessNPYFilePool was called for an empty NPYFilePool.";
    return processed_files;
  }
  for (const std::string &file_name : npy_file_pool) {
    std::string file_name_to_check = file_name;
    std::string specific_dump_dir;
    DumpFileAttr dump_file_attr;
    std::string output_str;
    std::string slot_str;
    // Split the full path into directory (map key) and bare file name.
    auto delim = file_name.rfind("/");
    if (delim != std::string::npos) {
      specific_dump_dir = file_name.substr(0, delim);
      file_name_to_check = file_name.substr(delim + 1);
    }
    std::vector<std::tuple<size_t, size_t, std::string *>> attr_to_match;
    // Locate the dots that delimit the fields, counting from the ".npy" extension backwards.
    size_t first_dot = file_name_to_check.find(".");
    size_t last_dot = file_name_to_check.rfind(kNpyExt);
    size_t second_last_dot = file_name_to_check.rfind(".", last_dot - 1);
    size_t third_last_dot = file_name_to_check.rfind(".", second_last_dot - 1);
    size_t fourth_last_dot = file_name_to_check.rfind(".", third_last_dot - 1);
    size_t fifth_last_dot = file_name_to_check.rfind(".", fourth_last_dot - 1);
    size_t sixth_last_dot = file_name_to_check.rfind(".", fifth_last_dot - 1);
    size_t seventh_last_dot = file_name_to_check.rfind(".", sixth_last_dot - 1);
    // name_to_match is between first dot and seventh last dot.
    // if op_type is parameter, the op_name can have dots.
    auto tuple = std::make_tuple(first_dot, seventh_last_dot, &dump_file_attr.name_to_match);
    attr_to_match.push_back(tuple);
    // slot is between second and third dot from end of the file name.
    tuple = std::make_tuple(third_last_dot, second_last_dot, &slot_str);
    attr_to_match.push_back(tuple);
    // time stamp is between fourth and fifth dot from end of the file name.
    tuple = std::make_tuple(fifth_last_dot, fourth_last_dot, &dump_file_attr.time_stamp);
    attr_to_match.push_back(tuple);
    // output is between third and fourth dot from end of the file name.
    tuple = std::make_tuple(fourth_last_dot, third_last_dot, &output_str);
    attr_to_match.push_back(tuple);
    // Extract each (start, end) substring into its destination string.
    for (auto &match_item : attr_to_match) {
      CheckStringMatch(std::get<DebugServices::START_POS>(match_item), std::get<DebugServices::END_POS>(match_item),
                       std::get<DebugServices::STR_POS>(match_item), file_name_to_check);
    }
    // A failed slot conversion is logged but the file is still recorded (slot keeps its default).
    if (!slot_str.empty() && !CheckStoull(&dump_file_attr.slot, slot_str)) {
      MS_LOG(INFO) << "Failed to get the slot from file_name: " << file_name_to_check
                   << ", error in convert the string " << slot_str << " into an integer.";
    }
    dump_file_attr.is_output = (output_str == "output");
    // NOTE(review): file_path here stores only the bare file name, not the full path;
    // callers rebuild the full path by prepending the map key (specific_dump_dir).
    dump_file_attr.file_path = file_name_to_check;
    processed_files[specific_dump_dir].push_back(dump_file_attr);
  }
  return processed_files;
}
  1218. /*
  1219. * Feature group: Offline debugger.
  1220. * Target device group: Ascend, GPU.
  1221. * Runtime category: Old runtime, MindRT.
  1222. * Description: For the two possible modes (rank and graph), this function returns the rank_id or graph_id extracted
  1223. * from the given directory name otherwise, it returns UINT32_MAX to identify an invalid rank or graph id.
  1224. */
  1225. uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
  1226. std::regex re;
  1227. if (mode == "rank") {
  1228. re = "^rank_([0-9]+)$";
  1229. } else if (mode == "graph") {
  1230. re = "^([0-9]+)$";
  1231. }
  1232. std::smatch tokens;
  1233. if (regex_match(name, tokens, re)) {
  1234. return std::stoi(tokens[1]);
  1235. } else {
  1236. return UINT32_MAX;
  1237. }
  1238. }
  1239. std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
  1240. std::vector<uint32_t> rank_id_list;
  1241. std::string dump_dir = GetDumpDir();
  1242. DIR *d_handle = opendir(dump_dir.c_str());
  1243. if (d_handle == nullptr) {
  1244. MS_LOG(ERROR) << "Dump directory does not exist.";
  1245. return rank_id_list;
  1246. }
  1247. struct dirent *dir = nullptr;
  1248. while ((dir = readdir(d_handle)) != nullptr) {
  1249. struct stat st;
  1250. std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
  1251. int ret = stat(name.c_str(), &st);
  1252. if (ret != 0) {
  1253. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  1254. (void)closedir(d_handle);
  1255. return rank_id_list;
  1256. }
  1257. if (S_ISDIR(st.st_mode)) {
  1258. std::string rank_dir_name = dir->d_name;
  1259. uint32_t rank_id = GetRankOrGraphId("rank", rank_dir_name);
  1260. if (rank_id != UINT32_MAX) {
  1261. rank_id_list.push_back(rank_id);
  1262. }
  1263. }
  1264. }
  1265. (void)closedir(d_handle);
  1266. return rank_id_list;
  1267. }
  1268. /*
  1269. * Feature group: Offline debugger.
  1270. * Target device group: Ascend, GPU.
  1271. * Runtime category: Old runtime, MindRT.
  1272. * Description: Searches the current dump directory and for each rank_id in rank_id_list extracts the existing
  1273. * graph_ids. Then the history file is read for all the extracted graph_ids.
  1274. */
void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
  std::string net_name = GetNetName();
  std::string dump_dir = GetDumpDir();
  // For every rank, scan {dump_dir}/rank_{id}/{net_name} for numeric graph-id
  // sub-directories and load the run history of each graph found.
  for (uint32_t rank_id : rank_id_list) {
    std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
    std::string abspath = RealPath(path);
    DIR *d_handle_rank = opendir(abspath.c_str());
    if (d_handle_rank == nullptr) {
      MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
      continue;
    }
    struct dirent *direc = nullptr;
    while ((direc = readdir(d_handle_rank)) != nullptr) {
      struct stat st;
      std::string name = abspath + std::string("/") + std::string(direc->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        // Abort the whole scan on a stat failure; the handle is closed before returning.
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d_handle_rank);
        return;
      }
      if (S_ISDIR(st.st_mode)) {
        std::string graph_dir = direc->d_name;
        if (graph_dir == "." || graph_dir == "..") {
          continue;
        }
        uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
        if (graph_id != UINT32_MAX) {
          ReadGraphsHistory(rank_id, graph_id);
        }
      }
    }
    (void)closedir(d_handle_rank);
  }
}
  1310. void DebugServices::SetGraphsHistory() {
  1311. // extract rank_id_list
  1312. std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
  1313. // for each rank_id extract the graph_id list and set the dump version
  1314. // and for each graph read the graph history file
  1315. CheckDumpGraphIdList(rank_id_list);
  1316. }
  1317. /*
  1318. * Feature group: Offline debugger.
  1319. * Target device group: Ascend, GPU.
  1320. * Runtime category: Old runtime, MindRT.
  1321. * Description: Reads the graph history file (containing iteration numbers in which the graph was executed) and stores
  1322. * the data in graphs_run_history_ for the given rank and graph id.
  1323. */
void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
  std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
  if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
    // graph history was already stored for this rank_id and graph_id
    return;
  }
  std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
  std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
  // opendir is used only as an existence check for the execution_order directory;
  // the entries themselves are not iterated.
  DIR *d_handle = opendir(exec_order_path.c_str());
  if (d_handle == nullptr) {
    MS_LOG(ERROR) << "Execution order directory does not exist.";
    return;
  }
  // read file and store the info
  std::string full_path = exec_order_path + "/" + file_to_check;
  std::string checked_path = RealPath(full_path);
  // An empty RealPath result means the history file does not exist; nothing is stored then.
  if (!checked_path.empty()) {
    ReadGraphRunIter(checked_path, rank_and_graph);
  }
  (void)closedir(d_handle);
}
  1345. /*
  1346. * Feature group: Offline debugger.
  1347. * Target device group: Ascend, GPU.
  1348. * Runtime category: Old runtime, MindRT.
  1349. * Description: Returns a map with a tuple as the key (rank, graph) and a vector as the value. This vector contains a
  1350. * tuple with two elements, the first element is the node name and the second element is whether the node is output or
  1351. * not.
  1352. */
  1353. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
  1354. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
  1355. for (auto w_table_item : watchpoint_table_) {
  1356. auto wp = std::get<1>(w_table_item);
  1357. unsigned int index = 0;
  1358. for (auto check_node : wp.check_node_list) {
  1359. std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
  1360. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1361. // graph represents root_graph for Ascend and kernel_graph for GPU
  1362. for (auto rank : ranks) {
  1363. for (auto graph : graphs) {
  1364. std::tuple<uint32_t, uint32_t> key(rank, graph);
  1365. (rank_and_graph_to_nodes)[key].push_back(check_node);
  1366. }
  1367. }
  1368. index++;
  1369. }
  1370. }
  1371. return rank_and_graph_to_nodes;
  1372. }
  1373. /*
  1374. * Feature group: Offline debugger.
  1375. * Target device group: Ascend, GPU.
  1376. * Runtime category: Old runtime, MindRT.
  1377. * Description: For the given graph and rank id, reads the graph history file, stores all the run iterations for the
  1378. * graph in a vector and inserts it to graphs_run_history_ map.
  1379. */
void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
  std::ifstream infile;
  std::string line;
  infile.open(file_path.c_str());
  if (!infile.is_open()) {
    MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
    const int kMaxFilenameLength = NAME_MAX;
    char err_info[kMaxFilenameLength];
    // NOTE(review): strerror_r is called twice here and its return value is compared
    // against kStrErrorNone; this relies on the XSI (int-returning) variant — the GNU
    // char*-returning variant would behave differently. TODO confirm on all toolchains.
    if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
      MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
    }
    return;
  }
  // Each line of the CSV holds one iteration number in which the graph was executed.
  std::vector<uint32_t> run_iters_vec;
  while (std::getline(infile, line)) {
    uint32_t iter;
    std::stringstream ss(line);
    ss >> iter;
    run_iters_vec.push_back(iter);
  }
  // emplace: an existing entry for this (rank, graph) is deliberately left untouched.
  (void)graphs_run_history_.emplace(
    std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
}
  1403. /*
  1404. * Feature group: Offline debugger.
  1405. * Target device group: Ascend, GPU.
  1406. * Runtime category: Old runtime, MindRT.
  1407. * Description: Creates a tensor_data object and sets its variables based on the function arguments and add the tensor
  1408. * to the tensor_list_map_.
  1409. */
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                    const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer,
                                    std::vector<std::shared_ptr<TensorData>> *const result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  tensor_data->SetIsOutput(is_output);
  // A null buffer produces a placeholder entry (no data yet); the tensor keeps a raw
  // pointer into *buffer, so the caller's buffer must outlive this tensor_data.
  if (buffer != nullptr) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(nullptr);
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  tensor_data->SetTimeStamp(time_stamp);
  // Must come after the identity fields above are set, since the previous iteration
  // is derived from them.
  tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  // Only tensors that actually carry data are loaded into the tensor cache.
  if (data_size) {
    (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  }
  // add to result_list
  result_list->push_back(tensor_data);
}
  1441. int GetNewestFileIndex(std::vector<std::string> matched_time_stamps) {
  1442. // given the vector of matched_time_stamps, get the index of the newest time stamp.
  1443. // this index is used to find the corresponding matched_path.
  1444. if (matched_time_stamps.empty()) {
  1445. return -1;
  1446. }
  1447. auto it = std::max_element(matched_time_stamps.begin(), matched_time_stamps.end());
  1448. int index = it - matched_time_stamps.begin();
  1449. return index;
  1450. }
  1451. /*
  1452. * Feature group: Offline debugger.
  1453. * Target device group: Ascend, GPU.
  1454. * Runtime category: Old runtime, MindRT.
 * Description: Search files in NPYFilePool (sync and async mode) for the one that meets the filename
  1456. * prefix and read the file into memory.
  1457. */
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     ProcessedNPYFiles *const processed_npy_files,
                                     std::vector<std::shared_ptr<TensorData>> *const result_list,
                                     bool *no_mem_to_read) {
  // All parameter vectors are parallel: entry i of each describes one tensor to read.
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string specific_dump_dir;
    bool is_cst = false;
    // prefix_dump_to_check is node name used to find corresponding dump file.
    std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
    // if node name has prefix of "Default--data-", consider as constant, search in cst folder
    if (prefix_dump_to_check.length() > (unsigned)strlen(constant_prefix) &&
        prefix_dump_to_check.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
      // Constants live in a per-graph "constants" directory instead of a per-iteration one.
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/constants";
      is_cst = true;
      const std::string prefix = "Default--";
      prefix_dump_to_check = prefix_dump_to_check.substr(prefix.length());
    } else {
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    }
    MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
    // Lazily (re)build the processed-file index for directories not seen yet.
    if ((is_sync_mode_ || is_cst) && processed_npy_files->find(specific_dump_dir) == processed_npy_files->end()) {
      // This case happens when ReadDumpedTensor is called from GetPrevTensor function.
      NPYFilePool npy_files = PreProcessDumpDirSync(specific_dump_dir);
      *processed_npy_files = ProcessNPYFilePool(npy_files);
    }
    ReadDumpedTensorUtils(specific_dump_dir, prefix_dump_to_check, backend_name[i], slot[i], device_id[i], iteration[i],
                          root_graph_id[i], is_output[i], *processed_npy_files, result_list, no_mem_to_read);
  }
}
  1496. /*
  1497. * Feature group: Offline debugger.
  1498. * Target device group: Ascend, GPU.
  1499. * Runtime category: Old runtime, MindRT.
  1500. * Description: For both sync and async dump, gets the newest matched file path and reads the npy file and add the
 * tensor_data object to tensor_list_map_. If there is no matched file, an empty tensor_data object is created with
  1502. * data_size = 0, empty shape and nullptr buffer.
  1503. */
void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
                                           const std::vector<std::string> &matched_time_stamps,
                                           const std::string &backend_name, const unsigned int device_id,
                                           const unsigned int root_graph_id, bool is_output, size_t slot,
                                           bool *no_mem_to_read, unsigned int iteration,
                                           std::vector<std::shared_ptr<TensorData>> *result_list) {
  std::string time_stamp = "";
  std::string result_path = "";
  std::string type_name = "";
  size_t data_size = 0;
  std::vector<int64_t> shape;
  std::vector<char> *buffer = nullptr;
  if (found) {
    // Among all files matching this tensor, read only the most recent one.
    int index = GetNewestFileIndex(matched_time_stamps);
    if (index >= 0) {
      result_path = matched_paths[index];
      time_stamp = matched_time_stamps[index];
    }
    // Cache key uniquely identifies the tensor: name:device:graph:direction:slot.
    std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
                                    std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
                                    std::to_string(slot);
    ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
                    type_name, shape, buffer, result_list);
  } else {
    // No matched file: register an empty placeholder (size 0, null buffer).
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
                    buffer, result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
  }
}
  1534. /*
  1535. * Feature group: Offline debugger.
  1536. * Target device group: Ascend.
  1537. * Runtime category: Old runtime, MindRT.
  1538. * Description: Iterates through all the processed npy files for the current specific_dump_dir and looks for the files
  1539. * that match the node_name for dump, read the newest file and add the related tensor_data object.
  1540. */
  1541. void DebugServices::ReadDumpedTensorUtils(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  1542. const std::string &backend_name, size_t slot, unsigned int device_id,
  1543. unsigned int iteration, unsigned int root_graph_id, bool is_output,
  1544. const ProcessedNPYFiles &processed_npy_files,
  1545. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1546. bool found = false;
  1547. std::vector<std::string> matched_paths;
  1548. std::vector<std::string> matched_time_stamps;
  1549. auto it = processed_npy_files.find(specific_dump_dir);
  1550. // If there is no npy file found we still need to add tensor data with size 0.
  1551. if (it == processed_npy_files.end()) {
  1552. MS_LOG(WARNING) << "no npy files was found for dump directory: " << specific_dump_dir;
  1553. } else {
  1554. auto processed_files_for_dir = it->second;
  1555. for (const auto &dump_file_attr : processed_files_for_dir) {
  1556. std::string file_name_to_check = dump_file_attr.file_path;
  1557. std::string full_path = specific_dump_dir + "/" + file_name_to_check;
  1558. if (dump_file_attr.name_to_match == prefix_dump_to_check && (dump_file_attr.slot == slot) &&
  1559. (is_output == dump_file_attr.is_output)) {
  1560. matched_paths.push_back(full_path);
  1561. matched_time_stamps.push_back(dump_file_attr.time_stamp);
  1562. found = true;
  1563. }
  1564. }
  1565. }
  1566. ReadFileAndAddToTensor(found, matched_paths, matched_time_stamps, backend_name, device_id, root_graph_id, is_output,
  1567. slot, no_mem_to_read, iteration, result_list);
  1568. }
  1569. /*
  1570. * Feature group: Offline debugger.
  1571. * Target device group: Ascend, GPU.
  1572. * Runtime category: Old runtime, MindRT.
  1573. * Description: Gets a list of the nodes that should be monitored, creates a vector called proto_to_dump with nodes'
  1574. * original names and dump style names. Then, for each node, it creates an empty tensor_data object with data_byte_size
  1575. * = 0 and data_ptr = nullptr and add it to the tensor_list (for both sync and async dump). This tensor_list is used for
  1576. * checkwatchpoint functions.
  1577. */
  1578. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  1579. unsigned int iteration, ProcessedNPYFiles *const processed_npy_files, bool error_on_no_value) {
  1580. // get a list of nodes and the devices they are on to monitor
  1581. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1582. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
  1583. GetAllWpNodes();
  1584. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  1585. // as they are found
  1586. for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
  1587. std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
  1588. uint32_t rank_id = std::get<0>(rank_and_graph);
  1589. uint32_t root_graph_id = std::get<1>(rank_and_graph);
  1590. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
  1591. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  1592. std::string real_dump_dir = RealPath(specific_dump_dir);
  1593. if (real_dump_dir.empty()) {
  1594. MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skit it.";
  1595. continue;
  1596. }
  1597. std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
  1598. std::vector<ProtoDump> proto_to_dump;
  1599. // convert node names to dump style
  1600. for (auto node : wp_nodes) {
  1601. std::string orig_name = std::get<0>(node);
  1602. // Remove the scope from the fully qualified name to compare for both sync and async case.
  1603. std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
  1604. bool node_is_out = std::get<1>(node);
  1605. ProtoDump dump_proto;
  1606. dump_proto.origin_node_name = orig_name;
  1607. dump_proto.dump_name = dump_style_name;
  1608. dump_proto.is_output = node_is_out;
  1609. if (std::find(proto_to_dump.begin(), proto_to_dump.end(), dump_proto) == proto_to_dump.end()) {
  1610. proto_to_dump.push_back(dump_proto);
  1611. }
  1612. }
  1613. if (is_sync_mode_) {
  1614. // search files in dir for the one that meets the filename prefix and read the file into memory
  1615. NPYFilePool npy_files = PreProcessDumpDirSync(real_dump_dir);
  1616. *processed_npy_files = ProcessNPYFilePool(npy_files);
  1617. ProcessTensorDataSync(proto_to_dump, real_dump_dir, *processed_npy_files, iteration, rank_id, root_graph_id,
  1618. &tensor_list, error_on_no_value);
  1619. } else {
  1620. auto preprocess_async_result = PreProcessDumpDirAsync(real_dump_dir);
  1621. // convert all files in proto_to_dump to npy and add to pool of async file names
  1622. NPYFilePool async_file_pool;
  1623. ConvertWatchPointNodes(std::get<1>(preprocess_async_result), proto_to_dump, real_dump_dir, &async_file_pool);
  1624. *processed_npy_files = ProcessNPYFilePool(async_file_pool);
  1625. GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *processed_npy_files,
  1626. &tensor_list);
  1627. }
  1628. }
  1629. return tensor_list;
  1630. }
  1631. /*
  1632. * Feature group: Offline debugger.
  1633. * Target device group: Ascend, GPU.
  1634. * Runtime category: Old runtime, MindRT.
  1635. * Description: Iterates through the dump directory and for each file it looks for a match in the file name with node
  1636. * names in proto_to_dump vector.
  1637. */
  1638. void DebugServices::ProcessTensorDataSync(const std::vector<ProtoDump> &proto_to_dump,
  1639. const std::string &specific_dump_dir, ProcessedNPYFiles processed_npy_files,
  1640. unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
  1641. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  1642. bool error_on_no_value) {
  1643. auto it = processed_npy_files.find(specific_dump_dir);
  1644. if (it == processed_npy_files.end()) {
  1645. MS_LOG(WARNING) << "no npy files was found for dump directory: " << specific_dump_dir;
  1646. return;
  1647. }
  1648. auto processed_files_for_dir = it->second;
  1649. for (const auto &dump_file_attr : processed_files_for_dir) {
  1650. for (auto &node : proto_to_dump) {
  1651. std::string dump_name = node.dump_name;
  1652. if (dump_name == dump_file_attr.name_to_match && node.is_output == dump_file_attr.is_output) {
  1653. size_t slot = dump_file_attr.slot;
  1654. std::vector<int64_t> shape;
  1655. std::string orig_name = node.origin_node_name;
  1656. bool output_flag = node.is_output;
  1657. AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, nullptr,
  1658. tensor_list);
  1659. break;
  1660. }
  1661. }
  1662. }
  1663. }
  1664. std::string DebugServices::IterationString(unsigned int iteration) {
  1665. std::string iteration_string;
  1666. bool init_dbg_suspend = (iteration == std::numeric_limits<unsigned int>::max());
  1667. if (init_dbg_suspend) {
  1668. iteration_string = "init";
  1669. } else {
  1670. iteration_string = std::to_string(iteration);
  1671. }
  1672. return iteration_string;
  1673. }
  1674. #endif
  1675. /*
  1676. * Feature group: Online debugger.
  1677. * Target device group: Ascend, GPU.
  1678. * Runtime category: Old runtime, MindRT.
  1679. * Description: Searches for tensor in the loaded tensors, if the tensor is found and tensor's root_graph_id is equal to
  1680. * current root_graph_id, it updates the given vectors.
  1681. */
  1682. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1683. std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1684. std::vector<unsigned int> *const dtype,
  1685. std::vector<std::vector<int64_t>> *const shape) {
  1686. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1687. tensor_loader_->SearchTensors(name, &result_list);
  1688. for (auto result : result_list) {
  1689. if (std::get<1>(result) == nullptr) {
  1690. continue;
  1691. }
  1692. #ifdef ONLINE_DBG_MODE
  1693. if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
  1694. MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
  1695. << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId()
  1696. << ".";
  1697. MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
  1698. }
  1699. #endif
  1700. (void)ret_name->emplace_back(std::get<0>(result));
  1701. (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
  1702. (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
  1703. (void)dtype->emplace_back(std::get<1>(result)->GetType());
  1704. (void)shape->emplace_back(std::get<1>(result)->GetShape());
  1705. }
  1706. }
  1707. void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
  1708. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  1709. if (result_list == nullptr) {
  1710. MS_LOG(DEBUG) << "result_list is nullptr.";
  1711. return;
  1712. }
  1713. tensor_loader_->SearchTensors(name, result_list);
  1714. }
  1715. #ifdef ONLINE_DBG_MODE
  1716. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1717. bool ret = false;
  1718. for (auto w_table_item : watchpoint_table_) {
  1719. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1720. for (auto check_node : check_node_list) {
  1721. std::string w_name = std::get<0>(check_node);
  1722. bool w_type = std::get<1>(check_node);
  1723. if ((w_type == true &&
  1724. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1725. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1726. ret = true;
  1727. return ret;
  1728. }
  1729. }
  1730. }
  1731. return ret;
  1732. }
  1733. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1734. if (kernel != nullptr && w_name.length() > 0) {
  1735. auto input_size = common::AnfAlgo::GetInputTensorNum(kernel);
  1736. for (size_t j = 0; j < input_size; ++j) {
  1737. auto input_kernel = kernel->input(j + 1);
  1738. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1739. auto found = w_name.find_last_of('/');
  1740. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1741. return true;
  1742. }
  1743. return false;
  1744. } else {
  1745. return false;
  1746. }
  1747. }
  1748. #endif
// Returns all tensors currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Looks up a single tensor by its full name (e.g. "node_name:slot", as built by
// GetNodeTensor below); delegation — returns whatever TensorLoader::GetTensor does
// for a missing name (callers such as GetNodeTensor treat nullptr as "not loaded").
std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
  return tensor_loader_->GetTensor(tensor_name);
}
// Clears the tensor loader's current-tensor store.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1754. #ifdef ONLINE_DBG_MODE
// Dumps one tensor (identified by tensor_name and slot) to `filepath`, delegating
// entirely to the tensor loader. The flag/format/shape/type parameters are passed
// through untouched.
// NOTE(review): parameter semantics are defined by TensorLoader::DumpTensorToFile — confirm there.
bool DebugServices::DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
                                     const std::string &addr_format, const std::string &tensor_name, size_t slot,
                                     const std::vector<int64_t> &host_shape, TypeId host_type) const {
  return tensor_loader_->DumpTensorToFile(filepath, trans_flag, host_fmt, addr_format, tensor_name, slot, host_shape,
                                          host_type);
}
  1761. #endif
// Stores `tensor` via the tensor loader; `keep_prev` is forwarded unchanged and
// controls retention of the previous copy (see TensorLoader::LoadNewTensor).
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
  1765. /*
  1766. * Feature group: Offline debugger.
  1767. * Target device group: Ascend, GPU.
  1768. * Runtime category: Old runtime, MindRT.
  1769. * Description: Returns the previous iteration in which tensor's graph was executed, if the current step is the first
  1770. * run iteration for the graph or graph history file is not available it returns UINT32_MAX to identify invalid
  1771. * prev_iteration.
  1772. */
  1773. uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  1774. uint32_t prev_iter;
  1775. uint32_t rank_id = tensor->GetDeviceId();
  1776. uint32_t root_graph_id = tensor->GetRootGraphId();
  1777. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  1778. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  1779. return UINT32_MAX;
  1780. }
  1781. auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
  1782. tensor->GetIteration());
  1783. if (it == graphs_run_history_[rank_and_graph].end()) {
  1784. // The graph is not executed in that iteration
  1785. return UINT32_MAX;
  1786. } else if (it == graphs_run_history_[rank_and_graph].begin()) {
  1787. // current iteration is the first iteration that the graph was run
  1788. // no prev iter is available
  1789. MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
  1790. << " is the first run iteration for tensor: " << tensor->GetName();
  1791. return UINT32_MAX;
  1792. }
  1793. (void)it--;
  1794. prev_iter = *it;
  1795. tensor->SetPrevIteration(prev_iter);
  1796. return prev_iter;
  1797. }
// Resets per-step tensor state: drops cached watchpoint ids and cached overflow scan
// results, and rotates the tensor loader's current/previous maps so parameters carry
// over to the next step.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1807. #ifdef ONLINE_DBG_MODE
  1808. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1809. MS_EXCEPTION_IF_NULL(kernel);
  1810. std::vector<std::shared_ptr<TensorData>> result;
  1811. auto output_size = common::AnfAlgo::GetOutputTensorNum(kernel);
  1812. auto kernel_name = GetKernelNodeName(kernel);
  1813. for (size_t j = 0; j < output_size; ++j) {
  1814. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1815. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1816. if (tensor != nullptr) {
  1817. result.push_back(tensor);
  1818. }
  1819. }
  1820. return result;
  1821. }
  1822. #endif
  1823. std::string GetOnlineOpOverflowDir() {
  1824. // only called for online debugger mode
  1825. // get operator overflow directory for current iteration
  1826. std::string overflow_bin_path = "";
  1827. #ifdef ONLINE_DBG_MODE
  1828. if (DumpJsonParser::GetInstance().path().empty()) {
  1829. MS_LOG(INFO) << "Dump config is not set.";
  1830. return "";
  1831. }
  1832. auto debugger = Debugger::GetInstance();
  1833. MS_EXCEPTION_IF_NULL(debugger);
  1834. auto cur_graph = debugger->GetGraphPtr();
  1835. if (cur_graph == nullptr) {
  1836. return "";
  1837. }
  1838. overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(cur_graph->root_graph_id());
  1839. auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
  1840. if (!realpath.has_value()) {
  1841. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1842. return "";
  1843. }
  1844. overflow_bin_path = realpath.value() + '/';
  1845. #endif
  1846. return overflow_bin_path;
  1847. }
  1848. void DebugServices::AddOpOverflowOpNames(const std::string &overflow_bin_path, std::vector<std::string> *op_names) {
  1849. MS_EXCEPTION_IF_NULL(op_names);
  1850. std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  1851. std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  1852. const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  1853. MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  1854. DIR *d = opendir(overflow_bin_path.c_str());
  1855. if (d == nullptr) {
  1856. MS_LOG(INFO) << "OverFlow bin directory does not exist!";
  1857. } else {
  1858. struct dirent *dir = nullptr;
  1859. while ((dir = readdir(d)) != nullptr) {
  1860. std::string file_name = dir->d_name;
  1861. std::string file_path = overflow_bin_path + std::string("/") + file_name;
  1862. if (IsRegFile(file_path)) {
  1863. // attempt to read the file
  1864. std::ifstream infile;
  1865. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  1866. if (!infile.is_open()) {
  1867. MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
  1868. continue;
  1869. }
  1870. std::string node_name;
  1871. uint64_t task_id = 0;
  1872. uint64_t stream_id = 0;
  1873. // detect overflow bin file
  1874. if (file_name.rfind(overflow_file_prefix, 0) == 0) {
  1875. if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
  1876. continue;
  1877. }
  1878. MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
  1879. << ".";
  1880. task_stream_hit.push_back(std::make_pair(task_id, stream_id));
  1881. } else {
  1882. // regular bin file or npy file
  1883. bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
  1884. if (success_parse) {
  1885. task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
  1886. }
  1887. }
  1888. infile.close();
  1889. }
  1890. }
  1891. (void)closedir(d);
  1892. }
  1893. // find the op_names with an overflow hit
  1894. for (auto &task_stream : task_stream_hit) {
  1895. auto op_name = task_stream_to_opname[task_stream];
  1896. if (!op_name.empty()) {
  1897. MS_LOG(INFO) << "Operation overflow detected in " << op_name;
  1898. op_names->push_back(op_name);
  1899. }
  1900. }
  1901. }
  1902. /*
  1903. * Feature group: Online debugger, Offline debugger.
  1904. * Target device group: Ascend.
  1905. * Runtime category: Old runtime, MindRT.
  1906. * Description: Checks whether for the given node the operator overflow happened or not by checking the overflow
  1907. * directory. This function is for async mode only.
  1908. */
  1909. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1910. unsigned int iteration) {
  1911. if (is_sync_mode_) {
  1912. return false;
  1913. }
  1914. std::string overflow_bin_path = "";
  1915. #ifdef ONLINE_DBG_MODE
  1916. overflow_bin_path = GetOnlineOpOverflowDir();
  1917. #else
  1918. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1919. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1920. overflow_bin_path = RealPath(overflow_bin_path);
  1921. #endif
  1922. if (overflow_bin_path.empty()) {
  1923. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1924. return false;
  1925. }
  1926. // remove kernel_graph_#
  1927. std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
  1928. std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
  1929. // remove path
  1930. size_t last_slash = node_name_to_find.rfind("/");
  1931. std::string op_name_find = "";
  1932. if (last_slash != std::string::npos) {
  1933. op_name_find = node_name_to_find.substr(last_slash + 1);
  1934. }
  1935. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1936. std::vector<std::string> op_names;
  1937. overflow_wp_lock_.lock();
  1938. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1939. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1940. if (found_overflows != overflow_ops_.end()) {
  1941. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1942. op_names = overflow_ops_[overflow_bin_path];
  1943. } else {
  1944. AddOpOverflowOpNames(overflow_bin_path, &op_names);
  1945. overflow_ops_[overflow_bin_path] = op_names;
  1946. }
  1947. overflow_wp_lock_.unlock();
  1948. // determine if overflow wp has been triggered for the op name with path (from bin file)
  1949. if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
  1950. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1951. return true;
  1952. }
  1953. // determine if overflow wp has been triggered for the op name (from npy file)
  1954. if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
  1955. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1956. return true;
  1957. }
  1958. return false;
  1959. }
  1960. std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) {
  1961. std::string op_name_to_find = node_name_to_find;
  1962. const std::string kernel_prefix = "kernel_graph_";
  1963. if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
  1964. auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
  1965. if (start_of_op_name != std::string::npos) {
  1966. op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
  1967. }
  1968. }
  1969. return op_name_to_find;
  1970. }
  1971. bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *task_id,
  1972. uint64_t *stream_id) {
  1973. size_t task_pos_start = overflow_file_prefix.length();
  1974. size_t task_pos_end = file_name.find(".", task_pos_start);
  1975. if (task_pos_end == std::string::npos) {
  1976. MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
  1977. return false;
  1978. }
  1979. size_t stream_pos_start = task_pos_end + 1;
  1980. size_t stream_pos_end = file_name.find(".", stream_pos_start);
  1981. if (stream_pos_end == std::string::npos) {
  1982. MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
  1983. return false;
  1984. }
  1985. std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
  1986. std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
  1987. if (!CheckStoull(task_id, task_id_str)) {
  1988. MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
  1989. << task_id_str << " into an integer.";
  1990. return false;
  1991. }
  1992. if (!CheckStoull(stream_id, stream_id_str)) {
  1993. MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
  1994. << stream_id_str << " into an integer.";
  1995. return false;
  1996. }
  1997. return true;
  1998. }
bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
                                         uint64_t *stream_id) {
  // get the node_name, task_id, and stream_id from dump filename in the following two formats:
  // 1. bin file: node_type.node_name.task_id.stream_id.timestamp
  // 2. npy file: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
  // Please note that node_name might contain dot (i.e. Parameter). So to search for the location of second dot, we need
  // to search the file name from right to left.
  size_t first_dot = file_name.find(".");
  size_t fourth_dot;
  if (file_name.rfind(kNpyExt) != std::string::npos) {
    // npy format file (converted file or A+M dump file)
    // Walk four dots leftwards from the last dot (before "npy") to land on the dot
    // that precedes the timestamp — the same position as the bin file's last dot.
    size_t pos = file_name.rfind(".");
    const int kFourthFromRight = 4;
    for (int cnt = 0; cnt < kFourthFromRight; cnt++) {
      pos = file_name.rfind(".", pos - 1);
    }
    fourth_dot = pos;
  } else {
    // bin format file: the last dot separates stream_id from the timestamp.
    fourth_dot = file_name.rfind(".");
  }
  // Scan right-to-left so dots inside node_name don't shift the field boundaries.
  size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  size_t second_dot = file_name.rfind(".", third_dot - 1);
  // check if dots were found
  if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
      fourth_dot == std::string::npos) {
    return false;
  }
  // get node_name (everything between the first and second dots, dots included in the name allowed)
  if (first_dot < second_dot) {
    *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  } else {
    MS_LOG(ERROR) << "filename parse error to get node_name.";
    return false;
  }
  // get task id (between second and third dots)
  if (second_dot < third_dot) {
    std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
    if (!CheckStoull(task_id, extracted_task_id)) {
      MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
                   << extracted_task_id << " into an integer.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get task_id.";
    return false;
  }
  // get stream id (between third and fourth dots)
  if (third_dot < fourth_dot) {
    std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
    if (!CheckStoull(stream_id, extracted_stream_id)) {
      MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
                   << extracted_stream_id << " into an integer.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get stream_id.";
    return false;
  }
  return true;
}
  2060. std::string DebugServices::RealPath(const std::string &input_path) {
  2061. if (input_path.length() >= PATH_MAX) {
  2062. MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  2063. }
  2064. size_t path_split_pos = input_path.find_last_of('/');
  2065. // get real path
  2066. char real_path[PATH_MAX] = {0};
  2067. // input_path is dir + file_name
  2068. if (path_split_pos != std::string::npos) {
  2069. std::string prefix_path = input_path.substr(0, path_split_pos);
  2070. std::string file_name = input_path.substr(path_split_pos);
  2071. if (file_name.length() > NAME_MAX) {
  2072. MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
  2073. }
  2074. if (realpath(prefix_path.c_str(), real_path) == nullptr) {
  2075. MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
  2076. return "";
  2077. }
  2078. return std::string(real_path) + file_name;
  2079. }
  2080. // input_path is only file_name
  2081. if (input_path.length() > NAME_MAX) {
  2082. MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  2083. }
  2084. if (realpath(input_path.c_str(), real_path) == nullptr) {
  2085. MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  2086. }
  2087. return std::string(real_path);
  2088. }
// Interprets the first 8 bytes of `buffer` as a little-endian uint64 and converts
// to host byte order.
// NOTE(review): assumes buffer holds at least 8 bytes and that the unaligned
// reinterpret_cast read is acceptable on the target platforms — confirm with callers.
uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
#if defined(__APPLE__)
  // macOS lacks le64toh; the raw bytes are used as-is.
  return *reinterpret_cast<const uint64_t *>(buffer.data());
#else
  return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
#endif
}
// True when `tensor_name` is present in the tensor loader's current-step store.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Moves one tensor from the tensor loader's current store to its previous store.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Queues `tensor_name` for cache eviction — a no-op unless memory control is enabled.
void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  if (tensor_loader_->EnableMemoryControl()) {
    tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  }
}
// Trivial setters/getters for debugger configuration state.
void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
std::string DebugServices::GetNetName() { return net_name_; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir_; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode_; }
// Forwards the configured memory cap to the tensor loader.
void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  2114. } // namespace mindspore