You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 100 kB

5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199
  1. /**
  2. * Copyright 2019-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <limits>
  27. #include <unordered_set>
  28. #include <utility>
  29. #include <regex>
  30. #include "pybind11/embed.h"
  31. #include "pybind11/stl.h"
  32. #ifdef ONLINE_DBG_MODE
  33. #include "debug/common.h"
  34. #include "debug/debugger/debugger.h"
  35. #include "debug/anf_ir_utils.h"
  36. #include "backend/common/session/anf_runtime_algorithm.h"
  37. #include "include/common/utils/anfalgo.h"
  38. #endif
  39. #include "debug/utils.h"
  40. #include "nlohmann/json.hpp"
  41. #include "debug/debugger/tensor_summary.h"
  42. #include "utils/file_utils.h"
  43. namespace mindspore {
  namespace {
  // File-local constants; the anonymous namespace already gives them internal linkage.
  constexpr const char *constant_prefix = "Default--data-";
  constexpr const char *kNpyExt = ".npy";
  // Platform-dependent "no error" sentinel: an int 0 on Apple targets, a null char pointer elsewhere.
  // NOTE(review): presumably compared against the result of strerror_r, whose return type differs between
  // platforms - confirm at the use site, which is not visible in this part of the file.
  #if defined(__APPLE__)
  constexpr int kStrErrorNone = 0;
  #else
  constexpr char *kStrErrorNone = nullptr;
  #endif
  }  // namespace
  53. bool IsRegFile(const std::string &file_path) {
  54. struct stat st;
  55. int ret = stat(file_path.c_str(), &st);
  56. if (ret != 0) {
  57. MS_LOG(ERROR) << "stat error for " << file_path << ", ret is: " << ret;
  58. return false;
  59. }
  60. return S_ISREG(st.st_mode);
  61. }
  62. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  63. DebugServices::DebugServices(const DebugServices &other) {
  64. wp_id_cache_ = other.wp_id_cache_;
  65. net_name_ = other.net_name_;
  66. dump_dir_ = other.dump_dir_;
  67. is_sync_mode_ = other.is_sync_mode_;
  68. tensor_loader_ = other.tensor_loader_;
  69. watchpoint_table_ = other.watchpoint_table_;
  70. }
  71. DebugServices &DebugServices::operator=(const DebugServices &other) {
  72. if (this != &other) {
  73. tensor_loader_ = other.tensor_loader_;
  74. watchpoint_table_ = other.watchpoint_table_;
  75. }
  76. return *this;
  77. }
  78. /*
  79. * Feature group: Online debugger, Offline debugger.
  80. * Target device group: Ascend, GPU.
  81. * Runtime category: Old runtime, MindRT.
  82. * Description: Create a watchpoint_t object and set the watchpoint's variables and add the watchpoint to the
  83. * watchpoint_table.
  84. */
  85. void DebugServices::AddWatchpoint(
  86. int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list,
  87. const std::vector<parameter_t> &parameter_list,
  88. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  89. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  90. std::lock_guard<std::mutex> lg(lock_);
  91. watchpoint_t watchpoint_item;
  92. watchpoint_item.id = id;
  93. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  94. watchpoint_item.condition.parameter = parameter;
  95. watchpoint_item.check_node_list = check_node_list;
  96. // For offline debugger check_node_device_list is not nullptr.
  97. if (check_node_device_list != nullptr) {
  98. watchpoint_item.check_node_device_list = *check_node_device_list;
  99. }
  100. // For offline debugger check_node_graph_list is not nullptr.
  101. if (check_node_graph_list != nullptr) {
  102. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  103. }
  104. watchpoint_item.parameter_list = parameter_list;
  105. watchpoint_table_[id] = watchpoint_item;
  106. }
  107. void DebugServices::RemoveWatchpoint(unsigned int id) {
  108. std::lock_guard<std::mutex> lg(lock_);
  109. (void)watchpoint_table_.erase(id);
  110. }
  111. /*
  112. * Feature group: Online debugger, Offline debugger.
  113. * Target device group: Ascend, GPU.
  114. * Runtime category: Old runtime, MindRT.
  115. * Description: Returns a tensor summary unique pointer based on the given tensor_dtype, returns nullptr if the type is
  116. * not supported.
  117. */
  118. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  119. const void *const previous_tensor_ptr, uint64_t num_elements,
  120. uint64_t prev_num_elements, int tensor_dtype) {
  121. MS_EXCEPTION_IF_NULL(tensor);
  122. switch (tensor_dtype) {
  123. case DbgDataType::DT_UINT8: {
  124. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  125. prev_num_elements);
  126. }
  127. case DbgDataType::DT_INT8: {
  128. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  129. prev_num_elements);
  130. }
  131. case DbgDataType::DT_UINT16: {
  132. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  133. prev_num_elements);
  134. }
  135. case DbgDataType::DT_INT16: {
  136. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  137. prev_num_elements);
  138. }
  139. case DbgDataType::DT_UINT32: {
  140. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  141. prev_num_elements);
  142. }
  143. case DbgDataType::DT_INT32:
  144. case DbgDataType::DT_BASE_INT: {
  145. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  146. prev_num_elements);
  147. }
  148. case DbgDataType::DT_UINT64: {
  149. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  150. prev_num_elements);
  151. }
  152. case DbgDataType::DT_INT64: {
  153. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  154. prev_num_elements);
  155. }
  156. case DbgDataType::DT_FLOAT16: {
  157. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  158. prev_num_elements);
  159. }
  160. case DbgDataType::DT_FLOAT32:
  161. case DbgDataType::DT_BASE_FLOAT: {
  162. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  163. prev_num_elements);
  164. }
  165. case DbgDataType::DT_FLOAT64: {
  166. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  167. prev_num_elements);
  168. }
  169. case DbgDataType::DT_BOOL: {
  170. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  171. prev_num_elements);
  172. }
  173. default:
  174. MS_LOG(INFO) << "Unsupported tensor type";
  175. // return a null pointer
  176. return std::unique_ptr<TensorSummary<int32_t>>{};
  177. }
  178. }
  179. /*
  180. * Feature group: Online debugger, Offline debugger.
  181. * Target device group: Ascend, GPU.
  182. * Runtime category: Old runtime, MindRT.
  183. * Description: Returns TensorStat for the given tensor based on the base_summary_ptr.
  184. */
  185. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  186. if (tensor == nullptr) {
  187. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  188. TensorStat empty_tensor_stat_data;
  189. return empty_tensor_stat_data;
  190. }
  191. std::unique_ptr<ITensorSummary> base_summary_ptr;
  192. void *previous_tensor_ptr = nullptr;
  193. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  194. if (base_summary_ptr == nullptr) {
  195. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  196. TensorStat empty_tensor_stat_data;
  197. return empty_tensor_stat_data;
  198. }
  199. base_summary_ptr->TensorStatistics(tensor->GetType());
  200. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  201. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  202. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  203. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  204. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  205. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  206. return tensor_stat_data;
  207. }
  #ifdef OFFLINE_DBG_MODE
  /*
   * Feature group: Offline debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: Returns a pointer to the data of the tensor's previous iteration, or nullptr when it is not
   * available. The previous tensor is only read when the graph run history for the tensor's (device id, root graph
   * id) pair is found, previous_iter_tensor_needed is true and GetPrevIteration(tensor) is not UINT32_MAX.
   * Outputs: *history_not_found is set to true when no graph run history entry exists for the tensor's graph;
   * *prev_num_elements is set to the element count of the previous tensor only when a non-null pointer is returned.
   * Raises through MS_EXCEPTION_IF_NULL when tensor is null.
   * NOTE(review): the returned pointer refers to data owned by the TensorData produced by ReadDumpedTensor; the local
   * shared_ptr is released on return, so the pointer stays valid only if another owner (presumably the tensor
   * loader cache) keeps that TensorData alive - confirm.
   */
  const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
                                           uint64_t *prev_num_elements, bool *history_not_found) {
    MS_EXCEPTION_IF_NULL(tensor);
    const std::tuple<uint32_t, uint32_t> rank_and_graph =
      std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
    if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
      *history_not_found = true;
      MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
      return nullptr;
    }
    // When the previous tensor is not available, the previous iteration is set to UINT32_MAX.
    if (!previous_iter_tensor_needed || GetPrevIteration(tensor) == UINT32_MAX) {
      return nullptr;
    }

    // Read the previous-iteration data in offline mode.
    AsyncFilePool file_paths;
    if (!is_sync_mode_) {
      ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                         std::vector<unsigned int>{tensor->GetDeviceId()},
                         std::vector<unsigned int>{tensor->GetPrevIteration()},
                         std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
    }
    std::vector<std::shared_ptr<TensorData>> result_list_prev;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetPrevIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     file_paths, &result_list_prev);

    // Guard against an empty result or a null entry before touching the first element.
    if (result_list_prev.empty() || result_list_prev[0] == nullptr) {
      MS_LOG(DEBUG) << "No previous iteration tensor was read for: " << tensor->GetName();
      return nullptr;
    }
    const std::shared_ptr<TensorData> &tensor_prev = result_list_prev[0];
    // A zero byte size means that no data was loaded for the previous iteration.
    if (!tensor_prev->GetByteSize()) {
      return nullptr;
    }
    *prev_num_elements = tensor_prev->GetNumElements();
    return tensor_prev->GetDataPtr();
  }
  #endif
  252. /*
  253. * Feature group: Offline debugger, Online debugger.
  254. * Target device group: Ascend, GPU.
  255. * Runtime category: Old runtime, MindRT.
  256. * Description: Goes through all the watchpoints in the watchpoint table. If the current tensor is in the list of
  257. * check_nodes, that watchpoint is added to the vector of watchpoint_to_check (vector of watchpoints that should be
  258. * checked for the current tensor) .
  259. */
  260. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  261. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  262. std::string *const qualified_tensor_name,
  263. std::vector<watchpoint_t> *const watchpoints_to_check) {
  264. if (tensor == nullptr) {
  265. MS_LOG(DEBUG) << "tensor is nullptr.";
  266. return;
  267. }
  268. const auto tensor_name = tensor->GetName();
  269. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  270. const auto tensor_device_id = tensor->GetDeviceId();
  271. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  272. for (auto w_table_item : watchpoint_table_) {
  273. auto wp = std::get<1>(w_table_item);
  274. // check ONLY init conditions on initial suspended state.
  275. // skip other conditions on initial suspended state
  276. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  277. continue;
  278. }
  279. // skip init condition if not init suspend
  280. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  281. continue;
  282. }
  283. // check change conditions only on step end.
  284. if (wp.change_condition() && !step_end) {
  285. continue;
  286. }
  287. // if recheck, ignore the cache results and reanalyze everything.
  288. // if not a recheck, check only unanalyzed tensors
  289. if (!recheck) {
  290. wp_lock_.lock();
  291. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  292. wp_lock_.unlock();
  293. if (wp_cache_hit) {
  294. continue;
  295. }
  296. }
  297. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  298. if (!found.empty()) {
  299. *qualified_tensor_name = found;
  300. watchpoints_to_check->push_back(w_table_item.second);
  301. #ifdef OFFLINE_DBG_MODE
  302. if (wp.change_condition()) {
  303. *previous_iter_tensor_needed = true;
  304. }
  305. #endif
  306. }
  307. }
  308. }
  309. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  310. const std::string &tensor_name) {
  311. // add analyzed tensor to cache
  312. if (!recheck) {
  313. wp_lock_.lock();
  314. (void)wp_id_cache_[tensor_name].insert(id);
  315. wp_lock_.unlock();
  316. }
  317. }
  318. void DebugServices::SetCheckWatchpointsResult(
  319. const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  320. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  321. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  322. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  323. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  324. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  325. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  326. const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  327. const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  328. (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  329. (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  330. (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  331. (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  332. (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  333. if (device_id != nullptr) {
  334. (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  335. }
  336. if (root_graph_id != nullptr) {
  337. (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  338. }
  339. (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  340. (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  341. (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
  342. }
  343. #ifdef OFFLINE_DBG_MODE
  344. /*
  345. * Feature group: Offline debugger.
  346. * Target device group: Ascend, GPU.
  347. * Runtime category: Old runtime, MindRT.
  348. * Description: Sets and checks the OUT_OF_MEMORY error_code (for memory limit feature) and NO_VALUE error_code (for
  349. * new python API feature). Sets checkwatchpoint results.
  350. */
  351. void DebugServices::CheckOutofMemoryandNoValue(
  352. const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
  353. int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  354. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  355. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  356. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  357. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  358. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  359. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  360. const unsigned int device_id_val, const unsigned int root_graph_id_val,
  361. const std::vector<parameter_t> &parameter_list) {
  362. bool set_is_needed = no_mem_to_read || error_on_no_value;
  363. int32_t error_code_to_set = 0;
  364. if (no_mem_to_read) {
  365. // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
  366. error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
  367. } else if (error_on_no_value) {
  368. error_code_to_set = ITensorSummary::NO_VALUE;
  369. }
  370. if (set_is_needed) {
  371. for (auto &wp : watchpoints_to_check) {
  372. SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
  373. chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
  374. chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
  375. qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
  376. parameter_list, error_code_to_set);
  377. }
  378. }
  379. }
  380. /*
  381. * Feature group: Offline debugger.
  382. * Target device group: Ascend, GPU.
  383. * Runtime category: Old runtime, MindRT.
  384. * Description: After finishing checking watchpoint, set the tensor to not-in-use status (for memory control
  385. * feature) by pushing it to eviction candidate queue. So it can be evicted from memory anytime if the memory is
  386. * required by other nodes' checking. If previous_tensor exists, change their status in a pair.
  387. */
  388. void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
  389. // set the tensor into not-in-use status in tensor_loader.
  390. auto tensor_name = tensor->GetName();
  391. std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
  392. std::to_string(tensor->GetRootGraphId()) + ":" +
  393. std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
  394. AppendToCacheEvictQueue(key_name_in_cache);
  395. if (previous_tensor_ptr != nullptr) {
  396. AppendToCacheEvictQueue(key_name_in_cache + ":prev");
  397. }
  398. }
  399. #endif
  400. #ifdef ONLINE_DBG_MODE
  401. /*
  402. * Feature group: Online debugger.
  403. * Target device group: Ascend, GPU.
  404. * Runtime category: Old runtime, MindRT.
  405. * Description: Compares the current root graph id with the given graph id and returns false if they are not equal
  406. * for GPU mindRT and Ascend. Otherwise, it returns true. The objectives of this function are: 1) Check if tensor's
  407. * root_graph_id is different from current_root_graph_id and skip checkwatchpoint for the tensor if these values are
  408. * different. 2) Set prev_tensor_ptr to nullptr if current_root_graph_id is different from prev_root_graph_id. 3) Skip
  409. * reading tensor if tensor's root_graph_id is different from current_root_graph_id.
  410. */
  411. bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
  412. auto debugger = Debugger::GetInstance();
  413. auto ms_context = MsContext::GetInstance();
  414. MS_EXCEPTION_IF_NULL(ms_context);
  415. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  416. auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
  417. if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  418. device_target == kAscendDevice) {
  419. if (cur_root_graph_id != id) {
  420. return false;
  421. }
  422. }
  423. return true;
  424. }
  425. /*
  426. * Feature group: Online debugger.
  427. * Target device group: Ascend, GPU.
  428. * Runtime category: Old runtime, MindRT.
  429. * Description: Returns the previous tensor pointer if the current root graph id is equal to previous root graph id and
  430. * prev_tensor_data is not nullptr.
  431. */
  432. const void *DebugServices::PreparePrevTensor(uint64_t *prev_num_elements, const std::string &tensor_name) {
  433. std::shared_ptr<TensorData> prev_tensor_data;
  434. if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
  435. // not supporting watchpoints that need prev tensor for multi root graph networks.
  436. MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
  437. prev_tensor_data = nullptr;
  438. } else {
  439. prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
  440. }
  441. if (prev_tensor_data) {
  442. *prev_num_elements = prev_tensor_data->GetNumElements();
  443. return prev_tensor_data->GetDataPtr();
  444. }
  445. return nullptr;
  446. }
  447. #endif
  448. void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
  449. // check history error_code only for offline debugger
  450. if (history_not_found) {
  451. *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
  452. }
  453. }
  454. /*
  455. * Feature group: Offline debugger, Online debugger.
  456. * Target device group: Ascend, GPU.
  457. * Runtime category: Old runtime, MindRT.
   * Description: For all the tensors in the given chunk, reads the tensors, checks all the watchpoints and sets the
   * watchpoint hit result. The checkwatchpoint process might be affected by the memory limit, by whether the tensor
   * was read successfully, and by whether we have a multi root graph scenario. All of the aforementioned checks are
   * done in this function.
  461. */
  462. void DebugServices::CheckWatchpointsForTensor(
  463. partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  464. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  465. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  466. const std::vector<std::string> &op_overflows, const AsyncFilePool &async_file_pool,
  467. partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  468. int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  469. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  470. std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  471. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  472. int list_size = tensor_list->size();
  473. if (end > list_size) {
  474. end = list_size;
  475. }
  476. for (int i = begin; i < end; i++) {
  477. auto &tensor = (*tensor_list)[i];
  478. const auto tensor_name = tensor->GetName();
  479. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  480. const auto tensor_slot = std::to_string(tensor->GetSlot());
  481. std::vector<watchpoint_t> watchpoints_to_check;
  482. std::string qualified_tensor_name;
  483. bool previous_iter_tensor_needed = false;
  484. AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
  485. &qualified_tensor_name, &watchpoints_to_check);
  486. // no wp set on current tensor
  487. if (watchpoints_to_check.empty()) {
  488. continue;
  489. }
  490. #ifdef OFFLINE_DBG_MODE
  491. // read data in offline mode
  492. bool no_mem_to_read = false;
  493. std::vector<std::shared_ptr<TensorData>> result_list;
  494. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  495. std::vector<unsigned int>{tensor->GetDeviceId()},
  496. std::vector<unsigned int>{tensor->GetIteration()},
  497. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  498. async_file_pool, &result_list, &no_mem_to_read);
  499. tensor = result_list[0];
  500. if (!tensor->GetByteSize()) {
  501. CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_names,
  502. chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
  503. chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id,
  504. chunk_root_graph_id, device_id, root_graph_id, tensor->GetExecutionOrder(),
  505. tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, tensor->GetDeviceId(),
  506. tensor->GetRootGraphId(), std::vector<parameter_t>());
  507. tensor.reset();
  508. continue;
  509. }
  510. #endif
  511. // no elements to analyze
  512. if (tensor->GetByteSize() == 0) {
  513. continue;
  514. }
  515. (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
  516. int tensor_dtype = tensor->GetType();
  517. uint64_t num_elements = tensor->GetNumElements();
  518. uint64_t prev_num_elements = 0;
  519. const void *previous_tensor_ptr = nullptr;
  520. #ifdef OFFLINE_DBG_MODE
  521. bool history_not_found = 0;
  522. previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
  523. #else
  524. if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
  525. MS_LOG(DEBUG)
  526. << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
  527. << tensor->GetName();
  528. continue;
  529. }
  530. previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
  531. #endif
  532. std::unique_ptr<ITensorSummary> base_summary_ptr;
  533. if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
  534. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
  535. if (base_summary_ptr != nullptr) {
  536. base_summary_ptr->SummarizeTensor(watchpoints_to_check);
  537. }
  538. }
  539. for (auto &wp : watchpoints_to_check) {
  540. bool is_hit = false;
  541. int error_code = 0;
  542. std::vector<parameter_t> parameter_list = {};
  543. if (wp.condition.type == IS_OVERFLOW) {
  544. is_hit =
  545. CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
  546. } else if (base_summary_ptr != nullptr) {
  547. auto item = base_summary_ptr->IsWatchpointHit(wp);
  548. is_hit = std::get<ITensorSummary::eHitPos>(item);
  549. error_code = std::get<ITensorSummary::eErrorCodePos>(item);
  550. #ifdef OFFLINE_DBG_MODE
  551. CheckHistoryErrorCode(&error_code, history_not_found);
  552. #endif
  553. parameter_list = std::get<ITensorSummary::eParamListPos>(item);
  554. }
  555. AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
  556. if (is_hit || error_code) {
  557. SetCheckWatchpointsResult(
  558. chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
  559. chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
  560. root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
  561. tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
  562. }
  563. }
  564. #ifdef OFFLINE_DBG_MODE
  565. SetTensorToNotInUse(tensor, previous_tensor_ptr);
  566. // in offline mode remove the need for the data
  567. tensor.reset();
  568. #endif
  569. }
  570. }
  571. /*
  572. * Feature group: Offline debugger, Online debugger.
  573. * Target device group: Ascend, GPU.
  574. * Runtime category: Old runtime, MindRT.
  575. * Description: This function checks the watchpoints for the given tensor list by dividing the tensor list into chunks.
  576. * Each chunk is handled by a separate thread and then the result of check watchpoint for each thread is gathered and
  577. * sorted. In the end, the time for checking the watchpoint in the current step is reported.
  578. */
  579. void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
  580. std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
  581. std::vector<std::vector<parameter_t>> *const parameters,
  582. std::vector<int32_t> *const error_codes,
  583. const std::vector<std::string> &op_overflows, const AsyncFilePool &async_file_pool,
  584. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  585. const bool init_dbg_suspend, const bool step_end, const bool recheck,
  586. std::vector<unsigned int> *const device_id,
  587. std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  588. std::lock_guard<std::mutex> lg(lock_);
  589. auto t1 = std::chrono::high_resolution_clock::now();
  590. if (watchpoint_table_.empty()) {
  591. return;
  592. }
  593. // vector to store execution order of tensors hit
  594. std::vector<int> exec_order;
  595. std::vector<std::string> time_stamps;
  596. int tensor_list_size = tensor_list->size();
  597. uint64_t tensor_list_byte_size = 0;
  598. MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  599. if (tensor_list_size <= 0) {
  600. return;
  601. }
  602. // default value for number of threads
  603. const int default_thread_num = 16;
  604. int max_thread_num = default_thread_num;
  605. if (max_thread_num > tensor_list_size) {
  606. max_thread_num = tensor_list_size;
  607. }
  608. MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  609. int chunk_size = tensor_list_size / max_thread_num;
  610. int remainder = tensor_list_size % max_thread_num;
  611. partitioned_numbers chunk_exec_orders(max_thread_num);
  612. partitioned_names chunk_names(max_thread_num);
  613. partitioned_names chunk_slots(max_thread_num);
  614. partitioned_numbers chunk_conditions(max_thread_num);
  615. partitioned_id chunk_watchpoint_id(max_thread_num);
  616. partitioned_parameters chunk_parameters(max_thread_num);
  617. partitioned_error_code chunk_error_codes(max_thread_num);
  618. partitioned_id chunk_device_id(max_thread_num);
  619. partitioned_id chunk_root_graph_id(max_thread_num);
  620. std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  621. partitioned_names chunk_time_stamp(max_thread_num);
  622. std::vector<std::future<void>> tensor_future_vec;
  623. int begin = 0;
  624. int end = begin;
  625. for (int i = 0; i < max_thread_num; i++) {
  626. end += chunk_size;
  627. if (remainder > 0) {
  628. end++;
  629. remainder--;
  630. }
  631. (void)tensor_future_vec.emplace_back(std::async(
  632. std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
  633. &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
  634. &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
  635. &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id, error_on_no_value));
  636. begin = end;
  637. }
  638. SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
  639. watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
  640. &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
  641. &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
  642. root_graph_id);
  643. auto t2 = std::chrono::high_resolution_clock::now();
  644. std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  645. MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  646. MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
  647. }
  648. /*
  649. * Feature group: Offline debugger, Online debugger.
  650. * Target device group: Ascend, GPU.
  651. * Runtime category: Old runtime, MindRT.
  652. * Description: Sorts the result of watchpoint hit for the online and offline debugger. This sorting for the online
  653. * debugger is based on the execution order and for the offline debugger is based on the time stamp.
  654. */
  655. void DebugServices::SortWatchpointsInfo(
  656. std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  657. std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  658. std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  659. std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  660. std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  661. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  662. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  663. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  664. std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  665. partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  666. std::vector<unsigned int> *const root_graph_id) {
  667. for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
  668. (*tensor_future_vec)[i].wait();
  669. (*tensor_future_vec)[i].get();
  670. for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
  671. #ifdef ONLINE_DBG_MODE
  672. // if the execution order is repeated,inserts the new one before the others with same execution order.
  673. std::vector<int>::iterator iter =
  674. std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
  675. int position = iter - exec_order->begin();
  676. (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
  677. #endif
  678. #ifdef OFFLINE_DBG_MODE
  679. std::vector<std::string>::iterator iter =
  680. std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
  681. int position = iter - time_stamps->begin();
  682. (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
  683. #endif
  684. (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
  685. (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
  686. (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
  687. (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
  688. if (device_id != nullptr) {
  689. (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
  690. }
  691. if (root_graph_id != nullptr) {
  692. (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
  693. }
  694. (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
  695. (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
  696. }
  697. // free the memory for used vectors
  698. std::vector<int>().swap((*chunk_exec_orders)[i]);
  699. std::vector<std::string>().swap((*chunk_time_stamp)[i]);
  700. std::vector<std::string>().swap((*chunk_names)[i]);
  701. std::vector<std::string>().swap((*chunk_slots)[i]);
  702. std::vector<int>().swap((*chunk_conditions)[i]);
  703. std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
  704. std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
  705. std::vector<int32_t>().swap((*chunk_error_codes)[i]);
  706. std::vector<unsigned int>().swap((*chunk_device_id)[i]);
  707. std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
  708. if ((*tensor_list_byte_size) > UINT64_MAX - (*chunk_tensor_byte_size)[i]) {
  709. MS_LOG(WARNING) << (*tensor_list_byte_size) << " + " << (*chunk_tensor_byte_size)[i]
  710. << " would lead to integer overflow!";
  711. (*tensor_list_byte_size) = UINT64_MAX;
  712. } else {
  713. (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  714. }
  715. }
  716. }
  717. #ifdef OFFLINE_DBG_MODE
  718. /*
  719. * Feature group: Offline debugger.
  720. * Target device group: Ascend, GPU.
  721. * Runtime category: Old runtime, MindRT.
  722. * Description: Read tensor info from the given file. If memory control feature is configured to be enabled, it checks
  723. * if the tensor can fit in memory before reading. There are two situations to return false: 1)tensor size is greater
  724. * than the total preset memory limit. 2) Evicting all NOT-In-USE tensors from tensor_list_map_ cannot make enough room
  725. * for the tensor.
  726. */
  727. void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
  728. std::string *const tensor_type, std::size_t *const size,
  729. std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
  730. bool *no_mem_to_read) {
  731. std::ifstream infile;
  732. std::string file_path = file_name;
  733. MS_LOG(INFO) << "Reading in file: " << file_path;
  734. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  735. if (!infile.is_open()) {
  736. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
  737. const int kMaxFilenameLength = 128;
  738. char err_info[kMaxFilenameLength];
  739. auto ret = strerror_r(errno, err_info, sizeof(err_info));
  740. if (ret != kStrErrorNone) {
  741. MS_LOG(ERROR) << " ErrInfo:" << ret;
  742. }
  743. return;
  744. }
  745. const int substr_len = 2;
  746. const int header_len_offset = 8;
  747. const int header_offset = 9;
  748. const int header_len_buffer_size = 2;
  749. const int type_offset = 10;
  750. // get header length
  751. (void)infile.seekg(0, std::ios::beg);
  752. auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  753. if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
  754. MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
  755. return;
  756. }
  757. uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  758. header_len_buffer.reset();
  759. // read in header
  760. (void)infile.seekg(0, std::ios::beg);
  761. auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  762. if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
  763. MS_LOG(ERROR) << "Failed to read header from " << file_path;
  764. return;
  765. }
  766. std::string header(header_buffer->data() + header_offset, header_len);
  767. header_buffer.reset();
  768. std::size_t type_i = header.find("descr") + type_offset;
  769. if (header.length() < type_i + substr_len) {
  770. MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
  771. return;
  772. }
  773. *tensor_type = header.substr(type_i, substr_len);
  774. std::size_t shape_i_open = header.find("(");
  775. std::size_t shape_i_close = header.find(")");
  776. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  777. std::string intermediate;
  778. std::stringstream check_shape(shape_str);
  779. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  780. while (getline(check_shape, intermediate, ',')) {
  781. int64_t shape_d = 0;
  782. if (!CheckStoi(&shape_d, intermediate)) {
  783. MS_LOG(INFO) << "Failed to get the shape from file: " << file_name << ", error in convert the string "
  784. << intermediate << " into an integer.";
  785. return;
  786. }
  787. shape->push_back(shape_d);
  788. }
  789. std::size_t word_size = 0;
  790. if (!CheckStoul(&word_size, std::string(1, (*tensor_type)[1]))) {
  791. MS_LOG(INFO) << "Failed to get the word_size from file: " << file_name << ", error in convert the string "
  792. << (*tensor_type)[1] << " into an integer.";
  793. return;
  794. }
  795. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  796. std::size_t data_size = data_len * word_size;
  797. if (!data_size) {
  798. return;
  799. }
  800. // Check memory available before loading tensor into host.
  801. bool has_enough_memory = true;
  802. if (tensor_loader_->EnableMemoryControl()) {
  803. has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  804. }
  805. if (!has_enough_memory) {
  806. MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
  807. *no_mem_to_read = true;
  808. } else {
  809. (void)infile.seekg(header_len + type_offset);
  810. *data_buffer = new std::vector<char>(data_size);
  811. if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  812. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  813. }
  814. *size = data_size;
  815. }
  816. }
  817. /*
  818. * Feature group: Offline debugger.
  819. * Target device group: Ascend.
  820. * Runtime category: Old runtime, MindRT.
  821. * Description: This function is to convert files in each directory from device format to host format and append the
  822. * converted npy file name into AsyncFilePool. It's for Ascend async dump only.
  823. */
  824. void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list) {
  825. for (auto const &d : dir_to_files_map) {
  826. std::vector<std::string> files_to_convert_in_dir;
  827. std::vector<std::string> files_after_convert_in_dir;
  828. std::string dump_key = d.first;
  829. for (auto const &pair : d.second) {
  830. std::string file_name = pair.first;
  831. std::string file_name_without_scope = pair.second;
  832. // skip the file that was converted to npy already.
  833. if (std::all_of(result_list->begin(), result_list->end(), [&file_name_without_scope](std::string file_found) {
  834. return file_found.find(file_name_without_scope) == std::string::npos;
  835. })) {
  836. (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
  837. (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
  838. }
  839. }
  840. MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
  841. if (!files_to_convert_in_dir.empty()) {
  842. // Look for the installation path to the convert_async package. If not found, throw exception and terminate the
  843. // later task.
  844. {
  845. pybind11::gil_scoped_acquire acquire;
  846. try {
  847. auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
  848. auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
  849. (void)convert_obj.attr("convert_files")();
  850. } catch (pybind11::error_already_set &e) {
  851. MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
  852. }
  853. }
  854. ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list);
  855. }
  856. }
  857. }
  858. /*
  859. * Feature group: Offline debugger.
  860. * Target device group: Ascend.
  861. * Runtime category: Old runtime, MindRT.
  862. * Description: This function is to iterate through dump directory (dump_key) and search all the converted npy files and
  863. * append into AsyncFilePool. It's for Ascend async dump only.
  864. */
  865. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  866. const std::string &dump_key, AsyncFilePool *const result_list) {
  867. std::string real_dump_iter_dir = RealPath(dump_key);
  868. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  869. if (d_handle == nullptr) {
  870. MS_LOG(INFO) << "Directory " << real_dump_iter_dir << " does not exist in ConvertToHostFormat.";
  871. return;
  872. }
  873. struct dirent *dir = nullptr;
  874. while ((dir = readdir(d_handle)) != nullptr) {
  875. std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
  876. if (!IsRegFile(name)) {
  877. continue;
  878. }
  879. std::string candidate = dir->d_name;
  880. for (const std::string &file_to_find : files_after_convert_in_dir) {
  881. std::string file_n = file_to_find;
  882. auto last_slash_pos = file_to_find.find_last_of("\\/");
  883. if (last_slash_pos != std::string::npos) {
  884. file_n = file_to_find.substr(last_slash_pos + 1);
  885. }
  886. if (candidate.find(file_n + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
  887. // we found a converted file for this op
  888. std::string found_file = dump_key + "/" + candidate;
  889. (void)result_list->insert(found_file);
  890. }
  891. }
  892. }
  893. (void)closedir(d_handle);
  894. }
  /*
   * Feature group: Offline debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: Node name string prefixes with scope and separates with slash "/". While the npy files in the tensor
   * dump path do not include scope in their name. The objective of this function is to remove scope from the node
   * name to match the file: everything up to and including the last slash is dropped, and a name without a slash
   * (or an empty name) is returned unchanged.
   */
  std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
    const std::size_t scope_end = dump_style_name.find_last_of('/');
    const bool has_scope = (scope_end != std::string::npos);
    return has_scope ? dump_style_name.substr(scope_end + 1) : dump_style_name;
  }
  915. /*
  916. * Feature group: Offline debugger.
  917. * Target device group: Ascend.
  918. * Runtime category: Old runtime, MindRT.
  919. * Description: This function is to search and prepare the target npy file to be read for each node. If the found file
  920. * is already npy format, push it to AsyncFilePool; Otherwise, use conversion tool in convert_async.py to transfer it to
  921. * npy format beforehand.
  922. */
  923. void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
  924. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  925. std::vector<unsigned int> root_graph_id, AsyncFilePool *const result_list) {
  926. DirMap dir_to_files_map;
  927. for (unsigned int i = 0; i < backend_name.size(); i++) {
  928. // form prefix of the tensor file to read from graph pb node name
  929. std::string dump_style_kernel_name = backend_name[i];
  930. // remove slot from name
  931. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  932. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  933. std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
  934. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
  935. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  936. // if node name is constant, skip
  937. if (prefix_dump_file_name.length() > (unsigned)strlen(constant_prefix) &&
  938. prefix_dump_file_name.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
  939. continue;
  940. }
  941. // search files in dir for the one that meets the filename prefix and read the file into memory
  942. std::string abspath = RealPath(specific_dump_dir);
  943. DIR *d = opendir(abspath.c_str());
  944. if (d == nullptr) {
  945. MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
  946. return;
  947. }
  948. ProcessConvertList(prefix_dump_file_name, specific_dump_dir, &dir_to_files_map, result_list);
  949. (void)closedir(d);
  950. }
  951. ConvertToHostFormat(dir_to_files_map, result_list);
  952. }
  953. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  954. const std::string &specific_dump_dir, AsyncFilePool *const result_list) {
  955. DirMap dir_to_files_map;
  956. for (const auto &node : proto_dump) {
  957. std::string dump_name = std::get<1>(node);
  958. dump_name = dump_name.substr(0, dump_name.rfind("."));
  959. // search files in dir for the one that meets the filename prefix and read the file into memory
  960. std::string abspath = RealPath(specific_dump_dir);
  961. DIR *d = opendir(abspath.c_str());
  962. if (d == nullptr) {
  963. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
  964. return;
  965. }
  966. ProcessConvertList(dump_name, specific_dump_dir, &dir_to_files_map, result_list);
  967. (void)closedir(d);
  968. }
  969. ConvertToHostFormat(dir_to_files_map, result_list);
  970. }
  971. void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
  972. DirMap *dir_to_files_map, AsyncFilePool *const result_list) {
  973. MS_EXCEPTION_IF_NULL(dir_to_files_map);
  974. DIR *d = opendir(specific_dump_dir.c_str());
  975. struct dirent *dir = nullptr;
  976. while ((dir = readdir(d)) != nullptr) {
  977. std::string file_name = dir->d_name;
  978. std::string file_path = specific_dump_dir + std::string("/") + file_name;
  979. if (!IsRegFile(file_path)) {
  980. continue;
  981. }
  982. std::string file_name_w_o_perfix = file_name;
  983. auto type_pos = file_name.find('.');
  984. // adding dot to avoid problematic matching in the scope.
  985. if (type_pos == std::string::npos ||
  986. file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
  987. continue;
  988. }
  989. if (file_name.rfind(kNpyExt) == std::string::npos) {
  990. std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
  991. (void)file_name_w_o_perfix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
  992. // if file matches prefix and is in device format add to candidate files to convert.
  993. (*dir_to_files_map)[specific_dump_dir].push_back(std::make_pair(file_name, file_name_w_o_perfix));
  994. } else {
  995. // otherwise, if file matches prefix and already has been converted to host format
  996. // add to result of converted files.
  997. (void)result_list->insert(file_path);
  998. }
  999. }
  1000. (void)closedir(d);
  1001. }
  1002. void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  1003. const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
  1004. uint32_t root_graph_id, const AsyncFilePool &async_file_pool,
  1005. std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  1006. for (auto &node : proto_dump) {
  1007. std::vector<size_t> slot_list;
  1008. std::string dump_style_name = std::get<1>(node);
  1009. // Get dump_name and output_str from the second element of tuple
  1010. std::size_t found_dot = dump_style_name.rfind(".");
  1011. std::string dump_name = dump_style_name.substr(0, found_dot);
  1012. std::string output_str = dump_style_name.substr(found_dot + 1);
  1013. bool output_flag = (output_str == "output");
  1014. for (const std::string &file_name : async_file_pool) {
  1015. std::string file_name_to_check = file_name;
  1016. auto delim = file_name.rfind("/");
  1017. if (delim != std::string::npos) {
  1018. file_name_to_check = file_name.substr(delim + 1);
  1019. }
  1020. std::size_t found = file_name_to_check.find("." + dump_name + ".");
  1021. std::size_t found_out = file_name_to_check.find(output_str, found + dump_name.length());
  1022. std::size_t found_dot_start = file_name_to_check.find(".", found_out);
  1023. std::size_t found_dot_end = file_name_to_check.find(".", found_dot_start);
  1024. if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
  1025. found_out != std::string::npos) {
  1026. std::string slot_str = file_name_to_check.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1);
  1027. size_t slot = 0;
  1028. if (!CheckStoul(&slot, slot_str)) {
  1029. MS_LOG(INFO) << "Failed to get the slot_id from file_name: " << file_name << ", error in convert the string "
  1030. << slot_str << " into an integer.";
  1031. continue;
  1032. }
  1033. slot_list.push_back(slot);
  1034. }
  1035. }
  1036. for (auto slot : slot_list) {
  1037. // add a TensorData entry (data will be read when needed)
  1038. std::vector<int64_t> shape;
  1039. std::string orig_name = std::get<0>(node);
  1040. auto tensor_data = std::make_shared<TensorData>();
  1041. tensor_data->SetName(orig_name);
  1042. tensor_data->SetExecutionOrder(0);
  1043. tensor_data->SetSlot(slot);
  1044. tensor_data->SetIteration(iteration);
  1045. tensor_data->SetDeviceId(device_id);
  1046. tensor_data->SetRootGraphId(root_graph_id);
  1047. tensor_data->SetDataPtr(nullptr);
  1048. tensor_data->SetByteSize(0);
  1049. tensor_data->SetType("");
  1050. tensor_data->SetShape(shape);
  1051. tensor_data->SetIsOutput(output_flag);
  1052. tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  1053. tensor_list->push_back(tensor_data);
  1054. }
  1055. }
  1056. }
  /*
   * Feature group: Offline debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: For the two possible modes (rank and graph), this function returns the rank_id or graph_id extracted
   * from the given directory name otherwise, it returns UINT32_MAX to identify an invalid rank or graph id. In rank
   * mode the name has to be "rank_" followed by decimal digits only, in graph mode it has to consist of decimal digits
   * only. Any other mode, any other name and any number that does not fit below UINT32_MAX gives UINT32_MAX; the
   * function never throws for such input.
   */
  uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
    std::size_t digits_begin = 0;
    if (mode == "rank") {
      const std::string rank_prefix = "rank_";
      if (name.compare(0, rank_prefix.size(), rank_prefix) != 0) {
        return UINT32_MAX;
      }
      digits_begin = rank_prefix.size();
    } else if (mode != "graph") {
      return UINT32_MAX;
    }
    // at least one digit is required
    if (digits_begin >= name.size()) {
      return UINT32_MAX;
    }
    // The value is accumulated in 64 bits and checked after every digit, so it can neither overflow nor reach
    // UINT32_MAX, which is reserved for "invalid".
    uint64_t value = 0;
    for (std::size_t pos = digits_begin; pos < name.size(); ++pos) {
      const char c = name[pos];
      if (c < '0' || c > '9') {
        return UINT32_MAX;
      }
      value = value * 10 + static_cast<uint64_t>(c - '0');
      if (value >= UINT32_MAX) {
        return UINT32_MAX;
      }
    }
    return static_cast<uint32_t>(value);
  }
  1078. std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
  1079. std::vector<uint32_t> rank_id_list;
  1080. std::string dump_dir = GetDumpDir();
  1081. DIR *d_handle = opendir(dump_dir.c_str());
  1082. if (d_handle == nullptr) {
  1083. MS_LOG(ERROR) << "Dump directory does not exist.";
  1084. return rank_id_list;
  1085. }
  1086. struct dirent *dir = nullptr;
  1087. while ((dir = readdir(d_handle)) != nullptr) {
  1088. struct stat st;
  1089. std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
  1090. int ret = stat(name.c_str(), &st);
  1091. if (ret != 0) {
  1092. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  1093. (void)closedir(d_handle);
  1094. return rank_id_list;
  1095. }
  1096. if (S_ISDIR(st.st_mode)) {
  1097. std::string rank_dir_name = dir->d_name;
  1098. uint32_t rank_id = GetRankOrGraphId("rank", rank_dir_name);
  1099. if (rank_id != UINT32_MAX) {
  1100. rank_id_list.push_back(rank_id);
  1101. }
  1102. }
  1103. }
  1104. (void)closedir(d_handle);
  1105. return rank_id_list;
  1106. }
  1107. /*
  1108. * Feature group: Offline debugger.
  1109. * Target device group: Ascend, GPU.
  1110. * Runtime category: Old runtime, MindRT.
  1111. * Description: Searches the current dump directory and for each rank_id in rank_id_list extracts the existing
  1112. * graph_ids. Then the history file is read for all the extracted graph_ids.
  1113. */
  1114. void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
  1115. std::string net_name = GetNetName();
  1116. std::string dump_dir = GetDumpDir();
  1117. for (uint32_t rank_id : rank_id_list) {
  1118. std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
  1119. std::string abspath = RealPath(path);
  1120. DIR *d_handle_rank = opendir(abspath.c_str());
  1121. if (d_handle_rank == nullptr) {
  1122. MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
  1123. continue;
  1124. }
  1125. struct dirent *direc = nullptr;
  1126. while ((direc = readdir(d_handle_rank)) != nullptr) {
  1127. struct stat st;
  1128. std::string name = abspath + std::string("/") + std::string(direc->d_name);
  1129. int ret = stat(name.c_str(), &st);
  1130. if (ret != 0) {
  1131. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  1132. (void)closedir(d_handle_rank);
  1133. return;
  1134. }
  1135. if (S_ISDIR(st.st_mode)) {
  1136. std::string graph_dir = direc->d_name;
  1137. if (graph_dir == "." || graph_dir == "..") {
  1138. continue;
  1139. }
  1140. uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
  1141. if (graph_id != UINT32_MAX) {
  1142. ReadGraphsHistory(rank_id, graph_id);
  1143. }
  1144. }
  1145. }
  1146. (void)closedir(d_handle_rank);
  1147. }
  1148. }
  1149. void DebugServices::SetGraphsHistory() {
  1150. // extract rank_id_list
  1151. std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
  1152. // for each rank_id extract the graph_id list and set the dump version
  1153. // and for each graph read the graph history file
  1154. CheckDumpGraphIdList(rank_id_list);
  1155. }
  1156. /*
  1157. * Feature group: Offline debugger.
  1158. * Target device group: Ascend, GPU.
  1159. * Runtime category: Old runtime, MindRT.
  1160. * Description: Reads the graph history file (containing iteration numbers in which the graph was executed) and stores
  1161. * the data in graphs_run_history_ for the given rank and graph id.
  1162. */
  1163. void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
  1164. std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
  1165. if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
  1166. // graph history was already stored for this rank_id and graph_id
  1167. return;
  1168. }
  1169. std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
  1170. std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
  1171. DIR *d_handle = opendir(exec_order_path.c_str());
  1172. if (d_handle == nullptr) {
  1173. MS_LOG(ERROR) << "Execution order directory does not exist.";
  1174. return;
  1175. }
  1176. // read file and store the info
  1177. std::string full_path = exec_order_path + "/" + file_to_check;
  1178. std::string checked_path = RealPath(full_path);
  1179. if (!checked_path.empty()) {
  1180. ReadGraphRunIter(checked_path, rank_and_graph);
  1181. }
  1182. (void)closedir(d_handle);
  1183. }
  1184. /*
  1185. * Feature group: Offline debugger.
  1186. * Target device group: Ascend, GPU.
  1187. * Runtime category: Old runtime, MindRT.
  1188. * Description: Returns a map with a tuple as the key (rank, graph) and a vector as the value. This vector contains a
  1189. * tuple with two elements, the first element is the node name and the second element is whether the node is output or
  1190. * not.
  1191. */
  1192. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
  1193. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
  1194. for (auto w_table_item : watchpoint_table_) {
  1195. auto wp = std::get<1>(w_table_item);
  1196. unsigned int index = 0;
  1197. for (auto check_node : wp.check_node_list) {
  1198. std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
  1199. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1200. // graph represents root_graph for Ascend and kernel_graph for GPU
  1201. for (auto rank : ranks) {
  1202. for (auto graph : graphs) {
  1203. std::tuple<uint32_t, uint32_t> key(rank, graph);
  1204. (rank_and_graph_to_nodes)[key].push_back(check_node);
  1205. }
  1206. }
  1207. index++;
  1208. }
  1209. }
  1210. return rank_and_graph_to_nodes;
  1211. }
  1212. /*
  1213. * Feature group: Offline debugger.
  1214. * Target device group: Ascend, GPU.
  1215. * Runtime category: Old runtime, MindRT.
  1216. * Description: For the given graph and rank id, reads the graph history file, stores all the run iterations for the
  1217. * graph in a vector and inserts it to graphs_run_history_ map.
  1218. */
  1219. void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
  1220. std::ifstream infile;
  1221. std::string line;
  1222. infile.open(file_path.c_str());
  1223. if (!infile.is_open()) {
  1224. MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
  1225. const int kMaxFilenameLength = NAME_MAX;
  1226. char err_info[kMaxFilenameLength];
  1227. if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
  1228. MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
  1229. }
  1230. return;
  1231. }
  1232. std::vector<uint32_t> run_iters_vec;
  1233. while (std::getline(infile, line)) {
  1234. uint32_t iter;
  1235. std::stringstream ss(line);
  1236. ss >> iter;
  1237. run_iters_vec.push_back(iter);
  1238. }
  1239. (void)graphs_run_history_.emplace(
  1240. std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
  1241. }
  1242. /*
  1243. * Feature group: Offline debugger.
  1244. * Target device group: Ascend, GPU.
  1245. * Runtime category: Old runtime, MindRT.
  1246. * Description: Creates a tensor_data object and sets its variables based on the function arguments and add the tensor
  1247. * to the tensor_list_map_.
  1248. */
  1249. void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
  1250. const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
  1251. const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
  1252. const std::string &type_name, const std::vector<int64_t> &shape,
  1253. std::vector<char> *buffer,
  1254. std::vector<std::shared_ptr<TensorData>> *const result_list) {
  1255. // call LoadNewTensor to store tensor in internal cache
  1256. auto tensor_data = std::make_shared<TensorData>();
  1257. tensor_data->SetName(backend_name);
  1258. tensor_data->SetExecutionOrder(0);
  1259. tensor_data->SetSlot(slot);
  1260. tensor_data->SetIteration(iteration);
  1261. tensor_data->SetDeviceId(device_id);
  1262. tensor_data->SetRootGraphId(root_graph_id);
  1263. tensor_data->SetIsOutput(is_output);
  1264. if (buffer != nullptr) {
  1265. tensor_data->SetDataPtr(buffer->data());
  1266. } else {
  1267. tensor_data->SetDataPtr(nullptr);
  1268. }
  1269. tensor_data->SetByteSize(data_size);
  1270. tensor_data->SetType(type_name);
  1271. tensor_data->SetShape(shape);
  1272. tensor_data->SetTimeStamp(time_stamp);
  1273. tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  1274. if (data_size) {
  1275. (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  1276. }
  1277. // add to result_list
  1278. result_list->push_back(tensor_data);
  1279. }
  1280. /*
  1281. * Feature group: Offline debugger.
  1282. * Target device group: Ascend, GPU.
  1283. * Runtime category: Old runtime, MindRT.
  1284. * Description: Generate a string in format of {no-scope-op-name}.{input-output}.{slot} to check and match files to
  1285. * read.
  1286. */
  1287. void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
  1288. std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
  1289. std::string dump_style_name_part = *dump_style_kernel_name;
  1290. dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
  1291. std::string slot_str;
  1292. if (is_output) {
  1293. slot_str = ".output." + std::to_string(slot);
  1294. } else {
  1295. slot_str = ".input." + std::to_string(slot);
  1296. }
  1297. dump_style_name_part += slot_str;
  1298. *prefix_dump_file_name = dump_style_name_part;
  1299. *slot_string_to_check = slot_str;
  1300. }
  1301. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  1302. // get file with the newest timestamp from the list.
  1303. if (file_list.empty()) {
  1304. return "";
  1305. }
  1306. std::sort(file_list.begin(), file_list.end());
  1307. return file_list.back();
  1308. }
  1309. std::string GetTimeStampStr(std::string file_path) {
  1310. // get the file_name from file_path.
  1311. size_t pos = file_path.rfind("/");
  1312. std::string file_name = file_path.substr(pos + 1);
  1313. size_t first_dot = file_name.rfind(".");
  1314. size_t second_dot = file_name.rfind(".", first_dot - 1);
  1315. size_t third_dot = file_name.rfind(".", second_dot - 1);
  1316. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  1317. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  1318. if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
  1319. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  1320. return time_stamp;
  1321. }
  1322. return "";
  1323. }
  1324. /*
  1325. * Feature group: Offline debugger.
  1326. * Target device group: Ascend, GPU.
  1327. * Runtime category: Old runtime, MindRT.
  1328. * Description: Search files in dir (sync mode) or in AsyncFilePool (async mode) for the one that meets the filename
  1329. * prefix and read the file into memory.
  1330. */
  1331. void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
  1332. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  1333. std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
  1334. const AsyncFilePool &async_file_pool,
  1335. std::vector<std::shared_ptr<TensorData>> *const result_list,
  1336. bool *no_mem_to_read) {
  1337. for (unsigned int i = 0; i < backend_name.size(); i++) {
  1338. // form prefix of the tensor file to read from graph pb node name
  1339. std::string dump_style_kernel_name = backend_name[i];
  1340. // remove slot from name
  1341. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  1342. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  1343. std::string slot_string_to_check;
  1344. std::string prefix_dump_file_name;
  1345. std::string specific_dump_dir;
  1346. bool is_cst = false;
  1347. SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
  1348. // prefix_dump_to_check is node name used to find corresponding dump file
  1349. std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
  1350. // if node name has prefix of "Default--data-", consider as constant, search in cst folder
  1351. if (prefix_dump_to_check.length() > (unsigned)strlen(constant_prefix) &&
  1352. prefix_dump_to_check.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
  1353. specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
  1354. std::to_string(root_graph_id[i]) + "/constants";
  1355. is_cst = true;
  1356. const std::string prefix = "Default--";
  1357. prefix_dump_file_name = prefix_dump_file_name.substr(prefix.length());
  1358. prefix_dump_to_check = prefix_dump_to_check.substr(prefix.length());
  1359. } else {
  1360. specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
  1361. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  1362. }
  1363. MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
  1364. if (is_sync_mode_ || is_cst) {
  1365. ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
  1366. iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
  1367. } else {
  1368. ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
  1369. device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
  1370. no_mem_to_read);
  1371. }
  1372. }
  1373. }
  1374. /*
  1375. * Feature group: Offline debugger.
  1376. * Target device group: Ascend, GPU.
  1377. * Runtime category: Old runtime, MindRT.
  1378. * Description: For both sync and async dump, gets the newest matched file path and reads the npy file and add the
  1379. * tenosr_data object to tensor_list_map_. If there is no matched file, an empty tensor_data object is created with
  1380. * data_size = 0, empty shape and nullptr buffer.
  1381. */
  1382. void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
  1383. const std::string &backend_name, const unsigned int device_id,
  1384. const unsigned int root_graph_id, const bool &is_output, size_t slot,
  1385. bool *no_mem_to_read, unsigned int iteration,
  1386. std::vector<std::shared_ptr<TensorData>> *result_list) {
  1387. std::string time_stamp = "";
  1388. std::string type_name = "";
  1389. size_t data_size = 0;
  1390. std::vector<int64_t> shape;
  1391. std::vector<char> *buffer = nullptr;
  1392. if (found) {
  1393. std::string result_path = GetNewestFilePath(matched_paths);
  1394. time_stamp = GetTimeStampStr(result_path);
  1395. std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
  1396. std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
  1397. std::to_string(slot);
  1398. ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
  1399. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
  1400. type_name, shape, buffer, result_list);
  1401. } else {
  1402. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
  1403. buffer, result_list);
  1404. MS_LOG(INFO) << "Target tensor has not been found.";
  1405. }
  1406. }
  1407. /*
  1408. * Feature group: Offline debugger.
  1409. * Target device group: Ascend, GPU.
  1410. * Runtime category: Old runtime, MindRT.
  1411. * Description: Looks for the files that match the node_name (in the dump directory) for sync dump, read the newest file
  1412. * and add the related tensor_data object.
  1413. */
  1414. void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
  1415. const std::string &backend_name, size_t slot, const unsigned int device_id,
  1416. unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
  1417. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1418. std::string abspath = RealPath(specific_dump_dir);
  1419. DIR *d = opendir(abspath.c_str());
  1420. bool found_file = false;
  1421. std::vector<std::string> matched_paths;
  1422. if (d == nullptr) {
  1423. MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  1424. } else {
  1425. struct dirent *dir = nullptr;
  1426. while ((dir = readdir(d)) != nullptr) {
  1427. std::string file_name = dir->d_name;
  1428. std::string file_path = abspath + std::string("/") + file_name;
  1429. if (IsRegFile(file_path)) {
  1430. std::string stripped_file_name = GetStrippedFilename(file_name);
  1431. if (stripped_file_name.empty()) {
  1432. continue;
  1433. }
  1434. std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
  1435. if (found != 0) {
  1436. continue;
  1437. }
  1438. matched_paths.push_back(file_path);
  1439. found_file = true;
  1440. }
  1441. }
  1442. (void)closedir(d);
  1443. }
  1444. ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
  1445. no_mem_to_read, iteration, result_list);
  1446. }
  1447. /*
  1448. * Feature group: Offline debugger.
  1449. * Target device group: Ascend.
  1450. * Runtime category: Old runtime, MindRT.
  1451. * Description: Iterates through all the file paths in the async_file_pool and looks for the files that match the
  1452. * node_name for async dump, read the newest file and add the related tensor_data object.
  1453. */
  1454. void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  1455. const std::string &slot_string_to_check, const std::string &backend_name,
  1456. size_t slot, unsigned int device_id, unsigned int iteration,
  1457. unsigned int root_graph_id, const bool &is_output,
  1458. const AsyncFilePool &async_file_pool,
  1459. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1460. bool found = false;
  1461. std::vector<std::string> matched_paths;
  1462. // if async mode
  1463. for (const std::string &file_path : async_file_pool) {
  1464. std::string file_name_to_check = file_path;
  1465. auto delim = file_path.rfind("/");
  1466. if (delim != std::string::npos) {
  1467. file_name_to_check = file_path.substr(delim + 1);
  1468. }
  1469. if (file_path.find(specific_dump_dir) != std::string::npos &&
  1470. file_name_to_check.find("." + prefix_dump_to_check + ".") != std::string::npos &&
  1471. file_name_to_check.find(slot_string_to_check + ".") != std::string::npos) {
  1472. matched_paths.push_back(file_path);
  1473. found = true;
  1474. }
  1475. }
  1476. ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
  1477. iteration, result_list);
  1478. }
  1479. /*
  1480. * Feature group: Offline debugger.
  1481. * Target device group: Ascend, GPU.
  1482. * Runtime category: Old runtime, MindRT.
  1483. * Description: Obtain opname, output_str and slot from the npy file. Make sure its return value is the same as
  1484. * SetPrefixToCheck(). The input/output examples look like:
  1485. * input: {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{output_or_input_string}.{slot}.{format}.npy
  1486. * output: {op_name}.{output_or_input_string}.{slot}
  1487. */
  1488. std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  1489. // strip off the task_id, stream_id, and timestamp, then compare
  1490. size_t first_dot = file_name.find(".");
  1491. size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  1492. size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  1493. if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
  1494. return std::string();
  1495. }
  1496. // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  1497. size_t second_dot = fifth_dot;
  1498. const int8_t kSecondDotPosition = 2;
  1499. for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
  1500. second_dot = file_name.rfind(".", second_dot - 1);
  1501. }
  1502. if (second_dot == std::string::npos || second_dot <= first_dot) {
  1503. return std::string();
  1504. }
  1505. std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  1506. std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  1507. std::string stripped_file_name = start_string + end_string;
  1508. return stripped_file_name;
  1509. }
  1510. /*
  1511. * Feature group: Offline debugger.
  1512. * Target device group: Ascend, GPU.
  1513. * Runtime category: Old runtime, MindRT.
  1514. * Description: Gets a list of the nodes that should be monitored, creates a vector called proto_to_dump with nodes'
  1515. * original names and dump style names. Then, for each node, it creates an empty tensor_data object with data_byte_size
  1516. * = 0 and data_ptr = nullptr and add it to the tensor_list (for both sync and async dump). This tensor_list is used for
  1517. * checkwatchpoint functions.
  1518. */
  1519. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(unsigned int iteration,
  1520. AsyncFilePool *const async_file_pool,
  1521. bool error_on_no_value) {
  1522. // get a list of nodes and the devices they are on to monitor
  1523. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1524. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
  1525. GetAllWpNodes();
  1526. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  1527. // as they are found
  1528. for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
  1529. std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
  1530. uint32_t rank_id = std::get<0>(rank_and_graph);
  1531. uint32_t root_graph_id = std::get<1>(rank_and_graph);
  1532. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
  1533. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  1534. std::string real_dump_dir = RealPath(specific_dump_dir);
  1535. if (real_dump_dir.empty()) {
  1536. MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skit it.";
  1537. continue;
  1538. }
  1539. std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
  1540. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  1541. // convert node names to dump style
  1542. for (auto node : wp_nodes) {
  1543. std::string orig_name = std::get<0>(node);
  1544. // Remove the scope from the fully qualified name to compare for both sync and async case.
  1545. std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
  1546. bool node_is_out = std::get<1>(node);
  1547. if (node_is_out) {
  1548. dump_style_name += ".output";
  1549. } else {
  1550. dump_style_name += ".input";
  1551. }
  1552. if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
  1553. std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
  1554. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  1555. }
  1556. }
  1557. if (is_sync_mode_) {
  1558. // search files in dir for the one that meets the filename prefix and read the file into memory
  1559. ProcessTensorDataSync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
  1560. error_on_no_value);
  1561. } else {
  1562. // convert all files in proto_to_dump to npy and add to pool of async file names
  1563. ConvertWatchPointNodes(proto_to_dump, real_dump_dir, async_file_pool);
  1564. GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
  1565. &tensor_list);
  1566. }
  1567. }
  1568. return tensor_list;
  1569. }
  1570. /*
  1571. * Feature group: Offline debugger.
  1572. * Target device group: Ascend, GPU.
  1573. * Runtime category: Old runtime, MindRT.
  1574. * Description: Iterates through the dump directory and for each file it looks for a match in the file name with node
  1575. * names in proto_to_dump vector.
  1576. */
  1577. void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
  1578. const std::string &specific_dump_dir, unsigned int iteration,
  1579. unsigned int device_id, unsigned int root_graph_id,
  1580. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  1581. bool error_on_no_value) {
  1582. DIR *d = opendir(specific_dump_dir.c_str());
  1583. if (d == nullptr) {
  1584. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ProcessTensorDataSync.";
  1585. return;
  1586. }
  1587. struct dirent *dir = nullptr;
  1588. while ((dir = readdir(d)) != nullptr) {
  1589. std::string file_name = dir->d_name;
  1590. std::string file_path = specific_dump_dir + std::string("/") + file_name;
  1591. if (IsRegFile(file_path)) {
  1592. for (auto &node : proto_to_dump) {
  1593. std::string dump_name = std::get<1>(node);
  1594. std::string stripped_file_name = GetStrippedFilename(file_name);
  1595. if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
  1596. continue;
  1597. }
  1598. std::size_t found = stripped_file_name.rfind(dump_name + ".", 0);
  1599. if (found == 0) {
  1600. size_t slot = 0;
  1601. if (!CheckStoul(&slot, stripped_file_name.substr(dump_name.length() + 1))) {
  1602. MS_LOG(INFO) << "Failed to get the slot from file_name: " << file_name << ", error in convert the string "
  1603. << stripped_file_name.substr(dump_name.length() + 1) << " into an integer.";
  1604. continue;
  1605. }
  1606. std::vector<int64_t> shape;
  1607. std::string orig_name = std::get<0>(node);
  1608. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  1609. bool output_flag = (output_str == "output");
  1610. AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, nullptr,
  1611. tensor_list);
  1612. break;
  1613. }
  1614. }
  1615. }
  1616. }
  1617. (void)closedir(d);
  1618. }
  1619. std::string DebugServices::IterationString(unsigned int iteration) {
  1620. std::string iteration_string;
  1621. bool init_dbg_suspend = (iteration == std::numeric_limits<unsigned int>::max());
  1622. if (init_dbg_suspend) {
  1623. iteration_string = "init";
  1624. } else {
  1625. iteration_string = std::to_string(iteration);
  1626. }
  1627. return iteration_string;
  1628. }
  1629. #endif
  1630. /*
  1631. * Feature group: Online debugger.
  1632. * Target device group: Ascend, GPU.
  1633. * Runtime category: Old runtime, MindRT.
  1634. * Description: Searches for tensor in the loaded tensors, if the tensor is found and tensor's root_graph_id is equal to
  1635. * current root_graph_id, it updates the given vectors.
  1636. */
  1637. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1638. std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1639. std::vector<unsigned int> *const dtype,
  1640. std::vector<std::vector<int64_t>> *const shape) {
  1641. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1642. tensor_loader_->SearchTensors(name, &result_list);
  1643. for (auto result : result_list) {
  1644. if (std::get<1>(result) == nullptr) {
  1645. continue;
  1646. }
  1647. #ifdef ONLINE_DBG_MODE
  1648. if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
  1649. MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
  1650. << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId()
  1651. << ".";
  1652. MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
  1653. }
  1654. #endif
  1655. (void)ret_name->emplace_back(std::get<0>(result));
  1656. (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
  1657. (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
  1658. (void)dtype->emplace_back(std::get<1>(result)->GetType());
  1659. (void)shape->emplace_back(std::get<1>(result)->GetShape());
  1660. }
  1661. }
  1662. void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
  1663. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  1664. if (result_list == nullptr) {
  1665. MS_LOG(DEBUG) << "result_list is nullptr.";
  1666. return;
  1667. }
  1668. tensor_loader_->SearchTensors(name, result_list);
  1669. }
  1670. #ifdef ONLINE_DBG_MODE
  1671. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1672. bool ret = false;
  1673. for (auto w_table_item : watchpoint_table_) {
  1674. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1675. for (auto check_node : check_node_list) {
  1676. std::string w_name = std::get<0>(check_node);
  1677. bool w_type = std::get<1>(check_node);
  1678. if ((w_type == true &&
  1679. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1680. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1681. ret = true;
  1682. return ret;
  1683. }
  1684. }
  1685. }
  1686. return ret;
  1687. }
  1688. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1689. if (kernel != nullptr && w_name.length() > 0) {
  1690. auto input_size = common::AnfAlgo::GetInputTensorNum(kernel);
  1691. for (size_t j = 0; j < input_size; ++j) {
  1692. auto input_kernel = kernel->input(j + 1);
  1693. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1694. auto found = w_name.find_last_of('/');
  1695. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1696. return true;
  1697. }
  1698. return false;
  1699. } else {
  1700. return false;
  1701. }
  1702. }
  1703. #endif
  1704. std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
  1705. std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
  1706. return tensor_loader_->GetTensor(tensor_name);
  1707. }
  1708. void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1709. #ifdef ONLINE_DBG_MODE
  1710. bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
  1711. const std::string &host_fmt, const std::vector<int64_t> &host_shape,
  1712. TypeId host_type, TypeId device_type, const std::string &addr_format,
  1713. size_t slot) const {
  1714. return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
  1715. device_type, addr_format, slot);
  1716. }
  1717. #endif
  1718. bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  1719. return tensor_loader_->LoadNewTensor(tensor, keep_prev);
  1720. }
  1721. /*
  1722. * Feature group: Offline debugger.
  1723. * Target device group: Ascend, GPU.
  1724. * Runtime category: Old runtime, MindRT.
  1725. * Description: Returns the previous iteration in which tensor's graph was executed, if the current step is the first
  1726. * run iteration for the graph or graph history file is not available it returns UINT32_MAX to identify invalid
  1727. * prev_iteration.
  1728. */
  1729. uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  1730. uint32_t prev_iter;
  1731. uint32_t rank_id = tensor->GetDeviceId();
  1732. uint32_t root_graph_id = tensor->GetRootGraphId();
  1733. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  1734. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  1735. return UINT32_MAX;
  1736. }
  1737. auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
  1738. tensor->GetIteration());
  1739. if (it == graphs_run_history_[rank_and_graph].end()) {
  1740. // The graph is not executed in that iteration
  1741. return UINT32_MAX;
  1742. } else if (it == graphs_run_history_[rank_and_graph].begin()) {
  1743. // current iteration is the first iteration that the graph was run
  1744. // no prev iter is available
  1745. MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
  1746. << " is the first run iteration for tensor: " << tensor->GetName();
  1747. return UINT32_MAX;
  1748. }
  1749. (void)it--;
  1750. prev_iter = *it;
  1751. tensor->SetPrevIteration(prev_iter);
  1752. return prev_iter;
  1753. }
  1754. void DebugServices::ResetLoadedTensors() {
  1755. wp_id_cache_.clear();
  1756. MS_LOG(INFO) << "Resetting loaded tensors";
  1757. tensor_loader_->MoveParametersCurrentToPrev();
  1758. tensor_loader_->EmptyCurrentTensor();
  1759. // will move parameters from previous to current map
  1760. tensor_loader_->SwapCurrentPrev();
  1761. overflow_ops_.clear();
  1762. }
  1763. #ifdef ONLINE_DBG_MODE
  1764. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1765. MS_EXCEPTION_IF_NULL(kernel);
  1766. std::vector<std::shared_ptr<TensorData>> result;
  1767. auto output_size = common::AnfAlgo::GetOutputTensorNum(kernel);
  1768. auto kernel_name = GetKernelNodeName(kernel);
  1769. for (size_t j = 0; j < output_size; ++j) {
  1770. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1771. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1772. if (tensor != nullptr) {
  1773. result.push_back(tensor);
  1774. }
  1775. }
  1776. return result;
  1777. }
  1778. #endif
  /*
   * Description: Returns the operator overflow directory for the debugger's current graph, resolved to a real path
   * with a trailing '/'. Only meaningful in online debugger mode; when ONLINE_DBG_MODE is not defined the result is
   * always an empty string. An empty string is also returned when the dump config path is empty, when there is no
   * current graph, or when the real path cannot be resolved.
   */
  std::string GetOnlineOpOverflowDir() {
    std::string overflow_dir;
  #ifdef ONLINE_DBG_MODE
    if (DumpJsonParser::GetInstance().path().empty()) {
      MS_LOG(INFO) << "Dump config is not set.";
      return "";
    }
    auto debugger_inst = Debugger::GetInstance();
    MS_EXCEPTION_IF_NULL(debugger_inst);
    auto current_graph = debugger_inst->GetGraphPtr();
    if (current_graph == nullptr) {
      return "";
    }
    overflow_dir = DumpJsonParser::GetInstance().GetOpOverflowBinPath(current_graph->root_graph_id());
    auto resolved = FileUtils::GetRealPath(overflow_dir.c_str());
    if (!resolved.has_value()) {
      MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
      return "";
    }
    overflow_dir = resolved.value() + '/';
  #endif
    return overflow_dir;
  }
  1804. void DebugServices::AddOpOverflowOpNames(const std::string &overflow_bin_path, std::vector<std::string> *op_names) {
  1805. MS_EXCEPTION_IF_NULL(op_names);
  1806. std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  1807. std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  1808. const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  1809. MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  1810. DIR *d = opendir(overflow_bin_path.c_str());
  1811. if (d == nullptr) {
  1812. MS_LOG(INFO) << "OverFlow bin directory does not exist!";
  1813. } else {
  1814. struct dirent *dir = nullptr;
  1815. while ((dir = readdir(d)) != nullptr) {
  1816. std::string file_name = dir->d_name;
  1817. std::string file_path = overflow_bin_path + std::string("/") + file_name;
  1818. if (IsRegFile(file_path)) {
  1819. // attempt to read the file
  1820. std::ifstream infile;
  1821. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  1822. if (!infile.is_open()) {
  1823. MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
  1824. continue;
  1825. }
  1826. std::string node_name;
  1827. uint64_t task_id = 0;
  1828. uint64_t stream_id = 0;
  1829. // detect overflow bin file
  1830. if (file_name.rfind(overflow_file_prefix, 0) == 0) {
  1831. if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
  1832. continue;
  1833. }
  1834. MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
  1835. << ".";
  1836. task_stream_hit.push_back(std::make_pair(task_id, stream_id));
  1837. } else {
  1838. // regular bin file or npy file
  1839. bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
  1840. if (success_parse) {
  1841. task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
  1842. }
  1843. }
  1844. infile.close();
  1845. }
  1846. }
  1847. (void)closedir(d);
  1848. }
  1849. // find the op_names with an overflow hit
  1850. for (auto &task_stream : task_stream_hit) {
  1851. auto op_name = task_stream_to_opname[task_stream];
  1852. if (!op_name.empty()) {
  1853. MS_LOG(INFO) << "Operation overflow detected in " << op_name;
  1854. op_names->push_back(op_name);
  1855. }
  1856. }
  1857. }
  1858. /*
  1859. * Feature group: Online debugger, Offline debugger.
  1860. * Target device group: Ascend.
  1861. * Runtime category: Old runtime, MindRT.
  1862. * Description: Checks whether for the given node the operator overflow happened or not by checking the overflow
  1863. * directory. This function is for async mode only.
  1864. */
  1865. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1866. unsigned int iteration) {
  1867. if (is_sync_mode_) {
  1868. return false;
  1869. }
  1870. std::string overflow_bin_path = "";
  1871. #ifdef ONLINE_DBG_MODE
  1872. overflow_bin_path = GetOnlineOpOverflowDir();
  1873. #else
  1874. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1875. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1876. overflow_bin_path = RealPath(overflow_bin_path);
  1877. #endif
  1878. if (overflow_bin_path.empty()) {
  1879. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1880. return false;
  1881. }
  1882. // remove kernel_graph_#
  1883. std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
  1884. std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
  1885. // remove path
  1886. size_t last_slash = node_name_to_find.rfind("/");
  1887. std::string op_name_find = "";
  1888. if (last_slash != std::string::npos) {
  1889. op_name_find = node_name_to_find.substr(last_slash + 1);
  1890. }
  1891. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1892. std::vector<std::string> op_names;
  1893. overflow_wp_lock_.lock();
  1894. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1895. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1896. if (found_overflows != overflow_ops_.end()) {
  1897. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1898. op_names = overflow_ops_[overflow_bin_path];
  1899. } else {
  1900. AddOpOverflowOpNames(overflow_bin_path, &op_names);
  1901. overflow_ops_[overflow_bin_path] = op_names;
  1902. }
  1903. overflow_wp_lock_.unlock();
  1904. // determine if overflow wp has been triggered for the op name with path (from bin file)
  1905. if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
  1906. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1907. return true;
  1908. }
  1909. // determine if overflow wp has been triggered for the op name (from npy file)
  1910. if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
  1911. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1912. return true;
  1913. }
  1914. return false;
  1915. }
  1916. std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) {
  1917. std::string op_name_to_find = node_name_to_find;
  1918. const std::string kernel_prefix = "kernel_graph_";
  1919. if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
  1920. auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
  1921. if (start_of_op_name != std::string::npos) {
  1922. op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
  1923. }
  1924. }
  1925. return op_name_to_find;
  1926. }
  1927. bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *task_id,
  1928. uint64_t *stream_id) {
  1929. size_t task_pos_start = overflow_file_prefix.length();
  1930. size_t task_pos_end = file_name.find(".", task_pos_start);
  1931. if (task_pos_end == std::string::npos) {
  1932. MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
  1933. return false;
  1934. }
  1935. size_t stream_pos_start = task_pos_end + 1;
  1936. size_t stream_pos_end = file_name.find(".", stream_pos_start);
  1937. if (stream_pos_end == std::string::npos) {
  1938. MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
  1939. return false;
  1940. }
  1941. std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
  1942. std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
  1943. if (!CheckStoull(task_id, task_id_str)) {
  1944. MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
  1945. << task_id_str << " into an integer.";
  1946. return false;
  1947. }
  1948. if (!CheckStoull(stream_id, stream_id_str)) {
  1949. MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
  1950. << stream_id_str << " into an integer.";
  1951. return false;
  1952. }
  1953. return true;
  1954. }
  1955. bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
  1956. uint64_t *stream_id) {
  1957. // get the node_name, task_id, and stream_id from dump filename in the following two formats:
  1958. // 1. bin file: node_type.node_name.task_id.stream_id.timestamp
  1959. // 2. npy file: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
  1960. // Please note that node_name might contain dot (i.e. Parameter). So to search for the location of second dot, we need
  1961. // to search the file name from right to left.
  1962. size_t first_dot = file_name.find(".");
  1963. size_t fourth_dot;
  1964. if (file_name.rfind(kNpyExt) != std::string::npos) {
  1965. // npy format file (converted file or A+M dump file)
  1966. size_t pos = file_name.rfind(".");
  1967. const int kFourthFromRight = 4;
  1968. for (int cnt = 0; cnt < kFourthFromRight; cnt++) {
  1969. pos = file_name.rfind(".", pos - 1);
  1970. }
  1971. fourth_dot = pos;
  1972. } else {
  1973. // bin format file
  1974. fourth_dot = file_name.rfind(".");
  1975. }
  1976. size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  1977. size_t second_dot = file_name.rfind(".", third_dot - 1);
  1978. // check if dots were found
  1979. if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
  1980. fourth_dot == std::string::npos) {
  1981. return false;
  1982. }
  1983. // get node_name
  1984. if (first_dot < second_dot) {
  1985. *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  1986. } else {
  1987. MS_LOG(ERROR) << "filename parse error to get node_name.";
  1988. return false;
  1989. }
  1990. // get task id
  1991. if (second_dot < third_dot) {
  1992. std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
  1993. if (!CheckStoull(task_id, extracted_task_id)) {
  1994. MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
  1995. << extracted_task_id << " into an integer.";
  1996. return false;
  1997. }
  1998. } else {
  1999. MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get task_id.";
  2000. return false;
  2001. }
  2002. // get stream id
  2003. if (third_dot < fourth_dot) {
  2004. std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
  2005. if (!CheckStoull(stream_id, extracted_stream_id)) {
  2006. MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
  2007. << extracted_stream_id << " into an integer.";
  2008. return false;
  2009. }
  2010. } else {
  2011. MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get stream_id.";
  2012. return false;
  2013. }
  2014. return true;
  2015. }
  2016. std::string DebugServices::RealPath(const std::string &input_path) {
  2017. if (input_path.length() >= PATH_MAX) {
  2018. MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  2019. }
  2020. size_t path_split_pos = input_path.find_last_of('/');
  2021. // get real path
  2022. char real_path[PATH_MAX] = {0};
  2023. // input_path is dir + file_name
  2024. if (path_split_pos != std::string::npos) {
  2025. std::string prefix_path = input_path.substr(0, path_split_pos);
  2026. std::string file_name = input_path.substr(path_split_pos);
  2027. if (file_name.length() > NAME_MAX) {
  2028. MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
  2029. }
  2030. if (realpath(prefix_path.c_str(), real_path) == nullptr) {
  2031. MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
  2032. return "";
  2033. }
  2034. return std::string(real_path) + file_name;
  2035. }
  2036. // input_path is only file_name
  2037. if (input_path.length() > NAME_MAX) {
  2038. MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  2039. }
  2040. if (realpath(input_path.c_str(), real_path) == nullptr) {
  2041. MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  2042. }
  2043. return std::string(real_path);
  2044. }
  2045. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  2046. #if defined(__APPLE__)
  2047. return *reinterpret_cast<const uint64_t *>(buffer.data());
  2048. #else
  2049. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  2050. #endif
  2051. }
  2052. bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  2053. return tensor_loader_->TensorExistsInCurrent(tensor_name);
  2054. }
  2055. void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  2056. tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
  2057. }
  2058. void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  2059. if (tensor_loader_->EnableMemoryControl()) {
  2060. tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  2061. }
  2062. }
  2063. void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
  2064. std::string DebugServices::GetNetName() { return net_name_; }
  2065. void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
  2066. std::string DebugServices::GetDumpDir() { return dump_dir_; }
  2067. void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
  2068. bool DebugServices::GetSyncMode() { return is_sync_mode_; }
  2069. void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  2070. } // namespace mindspore