You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

debug_services.cc 86 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include <regex>
  29. #include "pybind11/embed.h"
  30. #include "pybind11/stl.h"
  31. #ifdef ONLINE_DBG_MODE
  32. #include "debug/common.h"
  33. #include "debug/debugger/debugger.h"
  34. #include "debug/anf_ir_utils.h"
  35. #include "backend/session/anf_runtime_algorithm.h"
  36. #endif
  37. #include "nlohmann/json.hpp"
  38. #include "debug/debugger/tensor_summary.h"
  39. #include "utils/file_utils.h"
  40. #include "climits"
  41. #ifdef ONLINE_DBG_MODE
  42. namespace mindspore {
  43. #endif
namespace {
#ifdef __APPLE__
// On macOS the XSI-compliant strerror_r returns an int; 0 means success.
// NOTE(review): presumably compared against a strerror_r result elsewhere in
// this file — confirm at the use site.
constexpr int kStrErrorNone = 0;
#else
// On glibc the GNU strerror_r returns a char*; a non-null pointer is the
// message buffer, so nullptr represents "no error string".
constexpr char *kStrErrorNone = nullptr;
#endif
}  // namespace
  51. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  52. DebugServices::DebugServices(const DebugServices &other) {
  53. wp_id_cache_ = other.wp_id_cache_;
  54. net_name_ = other.net_name_;
  55. dump_dir_ = other.dump_dir_;
  56. is_sync_mode_ = other.is_sync_mode_;
  57. tensor_loader_ = other.tensor_loader_;
  58. watchpoint_table_ = other.watchpoint_table_;
  59. }
// Copy-assignment operator.
// NOTE(review): unlike the copy constructor (which also copies wp_id_cache_,
// net_name_, dump_dir_ and is_sync_mode_), assignment copies ONLY the tensor
// loader and the watchpoint table — confirm this asymmetry is intentional.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
  67. void DebugServices::AddWatchpoint(
  68. unsigned int id, unsigned int watch_condition, float parameter,
  69. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  70. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  71. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  72. std::lock_guard<std::mutex> lg(lock_);
  73. watchpoint_t watchpoint_item;
  74. watchpoint_item.id = id;
  75. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  76. watchpoint_item.condition.parameter = parameter;
  77. watchpoint_item.check_node_list = check_node_list;
  78. if (check_node_device_list != nullptr) {
  79. watchpoint_item.check_node_device_list = *check_node_device_list;
  80. }
  81. if (check_node_graph_list != nullptr) {
  82. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  83. }
  84. watchpoint_item.parameter_list = parameter_list;
  85. watchpoint_table_[id] = watchpoint_item;
  86. }
  87. void DebugServices::RemoveWatchpoint(unsigned int id) {
  88. std::lock_guard<std::mutex> lg(lock_);
  89. (void)watchpoint_table_.erase(id);
  90. }
  91. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  92. const void *const previous_tensor_ptr, uint32_t num_elements,
  93. uint32_t prev_num_elements, int tensor_dtype) {
  94. MS_EXCEPTION_IF_NULL(tensor);
  95. switch (tensor_dtype) {
  96. case DbgDataType::DT_UINT8: {
  97. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  98. prev_num_elements);
  99. }
  100. case DbgDataType::DT_INT8: {
  101. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  102. prev_num_elements);
  103. }
  104. case DbgDataType::DT_UINT16: {
  105. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  106. prev_num_elements);
  107. }
  108. case DbgDataType::DT_INT16: {
  109. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  110. prev_num_elements);
  111. }
  112. case DbgDataType::DT_UINT32: {
  113. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  114. prev_num_elements);
  115. }
  116. case DbgDataType::DT_INT32:
  117. case DbgDataType::DT_BASE_INT: {
  118. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  119. prev_num_elements);
  120. }
  121. case DbgDataType::DT_UINT64: {
  122. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  123. prev_num_elements);
  124. }
  125. case DbgDataType::DT_INT64: {
  126. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  127. prev_num_elements);
  128. }
  129. case DbgDataType::DT_FLOAT16: {
  130. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  131. prev_num_elements);
  132. }
  133. case DbgDataType::DT_FLOAT32:
  134. case DbgDataType::DT_BASE_FLOAT: {
  135. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  136. prev_num_elements);
  137. }
  138. case DbgDataType::DT_FLOAT64: {
  139. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  140. prev_num_elements);
  141. }
  142. case DbgDataType::DT_BOOL: {
  143. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  144. prev_num_elements);
  145. }
  146. default:
  147. MS_LOG(INFO) << "Unsupported tensor type";
  148. // return a null pointer
  149. return std::unique_ptr<TensorSummary<int32_t>>{};
  150. }
  151. }
  152. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  153. if (tensor == nullptr) {
  154. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  155. TensorStat empty_tensor_stat_data;
  156. return empty_tensor_stat_data;
  157. }
  158. std::unique_ptr<ITensorSummary> base_summary_ptr;
  159. void *previous_tensor_ptr = nullptr;
  160. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  161. if (base_summary_ptr == nullptr) {
  162. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  163. TensorStat empty_tensor_stat_data;
  164. return empty_tensor_stat_data;
  165. }
  166. base_summary_ptr->TensorStatistics(tensor->GetType());
  167. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  168. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  169. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  170. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  171. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  172. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  173. return tensor_stat_data;
  174. }
  175. #ifdef OFFLINE_DBG_MODE
  176. const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
  177. uint32_t *prev_num_elements, bool *history_not_found) {
  178. MS_EXCEPTION_IF_NULL(tensor);
  179. const void *previous_tensor_ptr = nullptr;
  180. std::shared_ptr<TensorData> tensor_prev;
  181. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
  182. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  183. *history_not_found = 1;
  184. MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
  185. } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
  186. // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
  187. // read data in offline mode
  188. std::vector<std::string> file_paths;
  189. if (!is_sync_mode_) {
  190. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  191. std::vector<unsigned int>{tensor->GetDeviceId()},
  192. std::vector<unsigned int>{tensor->GetPrevIteration()},
  193. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  194. }
  195. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  196. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  197. std::vector<unsigned int>{tensor->GetDeviceId()},
  198. std::vector<unsigned int>{tensor->GetPrevIteration()},
  199. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  200. file_paths, &result_list_prev);
  201. tensor_prev = result_list_prev[0];
  202. if (!tensor_prev->GetByteSize()) {
  203. tensor_prev.reset();
  204. } else {
  205. previous_tensor_ptr = tensor_prev->GetDataPtr();
  206. *prev_num_elements = tensor_prev->GetNumElements();
  207. }
  208. }
  209. return previous_tensor_ptr;
  210. }
  211. #endif
  212. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  213. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  214. std::string *const qualified_tensor_name,
  215. std::vector<watchpoint_t> *const watchpoints_to_check) {
  216. if (tensor == nullptr) {
  217. MS_LOG(DEBUG) << "tensor is nullptr.";
  218. return;
  219. }
  220. const auto tensor_name = tensor->GetName();
  221. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  222. const auto tensor_device_id = tensor->GetDeviceId();
  223. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  224. for (auto w_table_item : watchpoint_table_) {
  225. auto wp = std::get<1>(w_table_item);
  226. // check ONLY init conditions on initial suspended state.
  227. // skip other conditions on initial suspended state
  228. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  229. continue;
  230. }
  231. // skip init condition if not init suspend
  232. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  233. continue;
  234. }
  235. // check change conditions only on step end.
  236. if (wp.change_condition() && !step_end) {
  237. continue;
  238. }
  239. // if recheck, ignore the cache results and reanalyze everything.
  240. // if not a recheck, check only unanalyzed tensors
  241. if (!recheck) {
  242. wp_lock_.lock();
  243. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  244. wp_lock_.unlock();
  245. if (wp_cache_hit) {
  246. continue;
  247. }
  248. }
  249. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  250. if (!found.empty()) {
  251. *qualified_tensor_name = found;
  252. watchpoints_to_check->push_back(w_table_item.second);
  253. #ifdef OFFLINE_DBG_MODE
  254. if (wp.change_condition()) {
  255. *previous_iter_tensor_needed = true;
  256. }
  257. #endif
  258. }
  259. }
  260. }
  261. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  262. const std::string &tensor_name) {
  263. // add analyzed tensor to cache
  264. if (!recheck) {
  265. wp_lock_.lock();
  266. (void)wp_id_cache_[tensor_name].insert(id);
  267. wp_lock_.unlock();
  268. }
  269. }
  270. void DebugServices::SetCheckWatchpointsResult(
  271. const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  272. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  273. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  274. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  275. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  276. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  277. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  278. const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  279. const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  280. (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  281. (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  282. (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  283. (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  284. (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  285. if (device_id != nullptr) {
  286. (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  287. }
  288. if (root_graph_id != nullptr) {
  289. (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  290. }
  291. (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  292. (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  293. (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
  294. }
  295. #ifdef OFFLINE_DBG_MODE
  296. void DebugServices::CheckOutofMemoryandNoValue(
  297. const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
  298. int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  299. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  300. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  301. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  302. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  303. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  304. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  305. const unsigned int device_id_val, const unsigned int root_graph_id_val,
  306. const std::vector<parameter_t> &parameter_list) {
  307. bool set_is_needed = no_mem_to_read || error_on_no_value;
  308. int32_t error_code_to_set = 0;
  309. if (no_mem_to_read) {
  310. // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
  311. error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
  312. } else if (error_on_no_value) {
  313. error_code_to_set = ITensorSummary::NO_VALUE;
  314. }
  315. if (set_is_needed) {
  316. for (auto &wp : watchpoints_to_check) {
  317. SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
  318. chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
  319. chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
  320. qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
  321. parameter_list, error_code_to_set);
  322. }
  323. }
  324. }
  325. void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
  326. // set the tensor into not-in-use status in tensor_loader.
  327. auto tensor_name = tensor->GetName();
  328. std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
  329. std::to_string(tensor->GetRootGraphId()) + ":" +
  330. std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
  331. AppendToCacheEvictQueue(key_name_in_cache);
  332. if (previous_tensor_ptr != nullptr) {
  333. AppendToCacheEvictQueue(key_name_in_cache + ":prev");
  334. }
  335. }
  336. #endif
  337. #ifdef ONLINE_DBG_MODE
  338. bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
  339. auto debugger = Debugger::GetInstance();
  340. auto ms_context = MsContext::GetInstance();
  341. MS_EXCEPTION_IF_NULL(ms_context);
  342. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  343. auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
  344. if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  345. device_target == kAscendDevice) {
  346. if (cur_root_graph_id != id) {
  347. return false;
  348. }
  349. }
  350. return true;
  351. }
  352. const void *DebugServices::PreparePrevTensor(uint32_t *prev_num_elements, const std::string &tensor_name) {
  353. std::shared_ptr<TensorData> prev_tensor_data;
  354. if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
  355. // not supporting watchpoints that need prev tensor for multi root graph networks.
  356. MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
  357. prev_tensor_data = nullptr;
  358. } else {
  359. prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
  360. }
  361. if (prev_tensor_data) {
  362. *prev_num_elements = prev_tensor_data->GetNumElements();
  363. return prev_tensor_data->GetDataPtr();
  364. }
  365. return nullptr;
  366. }
  367. #endif
  368. void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
  369. // check history error_code only for offline debugger
  370. if (history_not_found) {
  371. *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
  372. }
  373. }
// Worker body for CheckWatchpoints: evaluates every applicable watchpoint for
// the tensors in tensor_list[begin, end) and appends hits/errors into the
// chunk_* vectors owned by this worker (chunk_id). In offline mode the tensor
// data is (re)read from dump files here; in online mode it is taken from the
// tensor loader. Results are later merged across chunks by the caller.
// NOTE(review): statement order is load-bearing (tensor reassignment/reset,
// cache eviction), so this block is documented in place rather than restyled.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  const std::vector<std::string> &op_overflows, const std::vector<std::string> &async_file_pool,
  partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // Clamp the chunk's end index — the last chunk may have been given a range
  // past the end of the list.
  int list_size = tensor_list->size();
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read);
    // Replace the list entry with the freshly-read tensor; a zero byte size
    // means the read failed (OOM or no dumped value) — record the error for
    // every targeted watchpoint and move on.
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_names,
                                 chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
                                 chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id,
                                 chunk_root_graph_id, device_id, root_graph_id, tensor->GetExecutionOrder(),
                                 tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, tensor->GetDeviceId(),
                                 tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    // Account this tensor's bytes against the worker's running total.
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // Offline: load the previous iteration's data from dump files when a
    // change-condition watchpoint needs it.
    bool history_not_found = 0;
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
#else
    // Online: skip tensors from a different root graph; prev data comes from
    // the tensor loader.
    if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
      MS_LOG(DEBUG)
        << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
        << tensor->GetName();
      continue;
    }
    previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // Overflow-only watchpoints don't need element statistics, so the summary
    // pass is skipped in that case.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
#ifdef OFFLINE_DBG_MODE
        // Missing run history downgrades the result to HISTORY_NOT_FOUND.
        CheckHistoryErrorCode(&error_code, history_not_found);
#endif
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    SetTensorToNotInUse(tensor, previous_tensor_ptr);
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Checks every registered watchpoint against every tensor in *tensor_list, in parallel.
// The tensor list is split into up to 16 contiguous chunks and one async task per chunk
// runs CheckWatchpointsForTensor; per-chunk hit results are then merged in sorted order
// by SortWatchpointsInfo into the flat output vectors.
// All output vectors (name/slot/condition/watchpoint_id/parameters/error_codes and the
// optional device_id/root_graph_id) are parallel arrays: index i describes one hit.
// init_dbg_suspend/step_end/recheck are forwarded unchanged to the per-tensor checker.
void DebugServices::CheckWatchpoints(
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // serializes whole watchpoint checks against concurrent debugger operations
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size <= 0) {
    return;
  }
  // default value for number of threads
  const int default_thread_num = 16;
  int max_thread_num = default_thread_num;
  // never spawn more tasks than there are tensors
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // each task gets chunk_size tensors; the first `remainder` tasks get one extra
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // per-chunk result buffers: slot i is written only by task i, so no locking is needed
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    // launch one async task for tensors [begin, end) of this chunk (index i)
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id, error_on_no_value));
    begin = end;
  }
  // waits on all futures and merges the per-chunk results into the output vectors
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
// Waits for every per-chunk watchpoint task and merges its hits into the flat
// output vectors, keeping them sorted by execution order (online build) or by
// dump-file timestamp (offline build). All output vectors stay parallel: for
// each hit, every attribute is inserted at the same `position`.
// Chunk buffers are swapped with empty vectors afterwards to release memory.
void DebugServices::SortWatchpointsInfo(
  std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    // block until chunk i has finished; get() also rethrows any stored exception
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    // NOTE(review): the loop bound is chunk_exec_orders[i].size() in both builds —
    // assumes chunk_time_stamp[i] has the same length in the offline build; confirm.
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
#ifdef ONLINE_DBG_MODE
      // if the execution order is repeated,inserts the new one before the others with same execution order.
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // offline mode keeps hits ordered by dump-file timestamp instead
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
#endif
      // insert every parallel attribute of this hit at the same sorted position
      (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
      (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
      (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
      (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
      (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    // total bytes of tensor data examined, reported by the caller
    (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  }
}
  606. #ifdef OFFLINE_DBG_MODE
  607. void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
  608. std::string *const tensor_type, std::size_t *const size,
  609. std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
  610. bool *no_mem_to_read) {
  611. std::ifstream infile;
  612. std::string file_path = file_name;
  613. MS_LOG(INFO) << "Reading in file: " << file_path;
  614. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  615. if (!infile.is_open()) {
  616. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
  617. const int kMaxFilenameLength = 128;
  618. char err_info[kMaxFilenameLength];
  619. auto ret = strerror_r(errno, err_info, sizeof(err_info));
  620. if (ret != kStrErrorNone) {
  621. MS_LOG(ERROR) << " ErrInfo:" << ret;
  622. }
  623. return;
  624. }
  625. const int substr_len = 2;
  626. const int header_len_offset = 8;
  627. const int header_offset = 9;
  628. const int header_len_buffer_size = 2;
  629. const int type_offset = 10;
  630. // get header length
  631. (void)infile.seekg(0, std::ios::beg);
  632. auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  633. if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
  634. MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
  635. return;
  636. }
  637. uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  638. header_len_buffer.reset();
  639. // read in header
  640. (void)infile.seekg(0, std::ios::beg);
  641. auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  642. if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
  643. MS_LOG(ERROR) << "Failed to read header from " << file_path;
  644. return;
  645. }
  646. std::string header(header_buffer->data() + header_offset, header_len);
  647. header_buffer.reset();
  648. std::size_t type_i = header.find("descr") + type_offset;
  649. if (header.length() < type_i + substr_len) {
  650. MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
  651. return;
  652. }
  653. *tensor_type = header.substr(type_i, substr_len);
  654. std::size_t shape_i_open = header.find("(");
  655. std::size_t shape_i_close = header.find(")");
  656. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  657. std::string intermediate;
  658. std::stringstream check_shape(shape_str);
  659. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  660. while (getline(check_shape, intermediate, ',')) {
  661. shape->push_back(std::stoi(intermediate));
  662. }
  663. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  664. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  665. std::size_t data_size = data_len * word_size;
  666. if (!data_size) {
  667. return;
  668. }
  669. // Check memory available before loading tensor into host.
  670. bool has_enough_memory = true;
  671. if (tensor_loader_->EnableMemoryControl()) {
  672. has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  673. }
  674. if (!has_enough_memory) {
  675. MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
  676. *no_mem_to_read = true;
  677. } else {
  678. (void)infile.seekg(header_len + type_offset);
  679. *data_buffer = new std::vector<char>(data_size);
  680. if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  681. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  682. }
  683. *size = data_size;
  684. }
  685. }
  686. void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
  687. std::vector<std::string> *const result_list) {
  688. std::string file_format = "npy";
  689. for (auto const &d : dir_to_files_map) {
  690. std::vector<std::string> files_to_convert_in_dir;
  691. std::vector<std::string> files_after_convert_in_dir;
  692. std::string dump_key = d.first;
  693. for (auto const &file_name : d.second) {
  694. bool already_converted = false;
  695. // Remove scope from the file_name for matching files converted by mindinsight tool.
  696. std::size_t found_first_dot = file_name.find(".");
  697. std::size_t found_last_underscore = file_name.find_last_of("_");
  698. std::string file_name_without_scope = file_name;
  699. if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
  700. file_name_without_scope =
  701. file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
  702. }
  703. for (std::string &file_found : *result_list) {
  704. if (file_found.find(file_name_without_scope) != std::string::npos) {
  705. already_converted = true;
  706. break;
  707. }
  708. }
  709. if (!already_converted) {
  710. (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
  711. (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
  712. }
  713. }
  714. MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
  715. if (!files_to_convert_in_dir.empty()) {
  716. // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
  717. // later task.
  718. try {
  719. auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
  720. auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
  721. (void)convert_obj.attr("convert_files")();
  722. } catch (pybind11::error_already_set &e) {
  723. MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
  724. }
  725. ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
  726. }
  727. }
  728. }
  729. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  730. const std::string &dump_key, std::vector<std::string> *const result_list,
  731. const std::string &file_format) {
  732. std::string real_dump_iter_dir = RealPath(dump_key);
  733. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  734. if (d_handle == nullptr) {
  735. MS_LOG(INFO) << "Directory does not exist in ConvertToHostFormat.";
  736. return;
  737. }
  738. struct dirent *dir = nullptr;
  739. while ((dir = readdir(d_handle)) != nullptr) {
  740. struct stat st;
  741. std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
  742. int ret = stat(name.c_str(), &st);
  743. if (ret != 0) {
  744. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  745. (void)closedir(d_handle);
  746. return;
  747. } else if (S_ISREG(st.st_mode)) {
  748. std::string candidate = dir->d_name;
  749. for (const std::string &file_to_find : files_after_convert_in_dir) {
  750. std::string file_n = file_to_find;
  751. auto last_slash_pos = file_to_find.find_last_of("\\/");
  752. if (last_slash_pos != std::string::npos) {
  753. file_n = file_to_find.substr(last_slash_pos + 1);
  754. }
  755. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  756. // we found a converted file for this op
  757. std::string found_file = dump_key + "/" + candidate;
  758. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  759. result_list->push_back(found_file);
  760. }
  761. }
  762. }
  763. }
  764. }
  765. (void)closedir(d_handle);
  766. }
  767. std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  768. if (dump_style_name.empty()) {
  769. return "";
  770. }
  771. std::size_t last_scope_marker;
  772. std::string delim = "/";
  773. last_scope_marker = dump_style_name.rfind(delim);
  774. if (last_scope_marker == std::string::npos) {
  775. return dump_style_name;
  776. }
  777. return dump_style_name.substr(last_scope_marker + delim.size());
  778. }
  779. void ReplaceSrcFileName(std::string *dump_style_name) {
  780. if (dump_style_name == nullptr) {
  781. return;
  782. }
  783. const std::string strsrc = "/";
  784. std::string strdst = "_";
  785. std::string::size_type pos = 0;
  786. std::string::size_type srclen = strsrc.size();
  787. std::string::size_type dstlen = strdst.size();
  788. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  789. (void)dump_style_name->replace(pos, srclen, strdst);
  790. pos += dstlen;
  791. }
  792. }
// Gathers the device-format dump files for the requested tensors and converts
// them to host-format .npy files in one pass; converted file paths are appended
// to *result_list. For tensor i the dump directory is derived from device_id[i],
// root_graph_id[i] and iteration[i].
// NOTE(review): the `slot` parameter is not read in this body — confirm whether
// it is kept only for interface symmetry with ReadDumpedTensor.
// NOTE(review): if any tensor's dump directory is missing the function returns
// immediately, skipping conversion for the remaining tensors too — presumably
// fine because all tensors share one dump root; confirm.
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id,
                                       std::vector<std::string> *const result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
      return;
    }
    // collect candidate files for this tensor into dir_to_files_map / result_list
    ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
    (void)closedir(d);
  }
  // convert all collected device-format files in one batch
  ConvertToHostFormat(dir_to_files_map, result_list);
}
  820. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  821. const std::string &specific_dump_dir,
  822. std::vector<std::string> *const result_list) {
  823. std::string file_format = "npy";
  824. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  825. for (const auto &node : proto_dump) {
  826. std::string dump_name = std::get<1>(node);
  827. dump_name = dump_name.substr(0, dump_name.rfind("."));
  828. // search files in dir for the one that meets the filename prefix and read the file into memory
  829. std::string abspath = RealPath(specific_dump_dir);
  830. DIR *d = opendir(abspath.c_str());
  831. if (d == nullptr) {
  832. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
  833. return;
  834. }
  835. ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
  836. (void)closedir(d);
  837. }
  838. ConvertToHostFormat(dir_to_files_map, result_list);
  839. }
  840. void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
  841. const std::string &specific_dump_dir,
  842. std::map<std::string, std::vector<std::string>> *dir_to_files_map,
  843. std::vector<std::string> *const result_list) {
  844. MS_EXCEPTION_IF_NULL(dir_to_files_map);
  845. DIR *d = opendir(specific_dump_dir.c_str());
  846. struct dirent *dir = nullptr;
  847. while ((dir = readdir(d)) != nullptr) {
  848. struct stat st;
  849. std::string name = specific_dump_dir + std::string("/") + std::string(dir->d_name);
  850. int ret = stat(name.c_str(), &st);
  851. if (ret != 0) {
  852. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  853. (void)closedir(d);
  854. return;
  855. } else if (!(S_ISREG(st.st_mode))) {
  856. continue;
  857. }
  858. std::string file_name = dir->d_name;
  859. std::string file_name_w_o_perfix = file_name;
  860. auto type_pos = file_name.find('.');
  861. if (type_pos == std::string::npos || file_name.find(prefix_dump_file_name, type_pos + 1) == std::string::npos) {
  862. continue;
  863. }
  864. if (file_name.rfind(file_format) == std::string::npos) {
  865. // if file matches prefix and is in device format add to candidate files to convert.
  866. (*dir_to_files_map)[specific_dump_dir].push_back(file_name);
  867. } else {
  868. // otherwise, if file matches prefix and already has been converted to host format
  869. // add to result of converted files.
  870. std::string found_file = specific_dump_dir + "/" + file_name;
  871. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  872. result_list->push_back(found_file);
  873. }
  874. }
  875. }
  876. (void)closedir(d);
  877. }
// Creates placeholder TensorData entries (metadata only, no payload loaded) for
// every dumped slot of every node in proto_dump that has a matching file in
// async_file_pool, and appends them to *tensor_list.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    // collect slot numbers of the pool files that belong to this node; the slot is
    // the number right after "output"/"input" in the file name, e.g. "...output.0..."
    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      std::size_t found_dot_start = file_name.find(".", found_out);
      // NOTE(review): find() is inclusive of its start position, so found_dot_end
      // equals found_dot_start and the substr below actually spans to the end of the
      // string; the slot parse works only because std::stoul stops at the first
      // non-digit. Consider find(".", found_dot_start + 1) — confirm before changing.
      std::size_t found_dot_end = file_name.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      // payload fields stay empty until the tensor is actually read
      tensor_data->SetDataPtr(nullptr);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
      tensor_list->push_back(tensor_data);
    }
  }
}
  921. uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
  922. std::regex re;
  923. if (mode == "rank") {
  924. re = "^rank_([0-9]+)$";
  925. } else if (mode == "graph") {
  926. re = "^([0-9]+)$";
  927. }
  928. std::smatch tokens;
  929. if (regex_match(name, tokens, re)) {
  930. return std::stoi(tokens[1]);
  931. } else {
  932. return UINT32_MAX;
  933. }
  934. }
  935. std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
  936. std::vector<uint32_t> rank_id_list;
  937. std::string dump_dir = GetDumpDir();
  938. DIR *d_handle = opendir(dump_dir.c_str());
  939. if (d_handle == nullptr) {
  940. MS_LOG(ERROR) << "Dump directory does not exist.";
  941. return rank_id_list;
  942. }
  943. struct dirent *dir = nullptr;
  944. while ((dir = readdir(d_handle)) != nullptr) {
  945. struct stat st;
  946. std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
  947. int ret = stat(name.c_str(), &st);
  948. if (ret != 0) {
  949. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  950. (void)closedir(d_handle);
  951. return rank_id_list;
  952. } else if (S_ISDIR(st.st_mode)) {
  953. std::string rank_dir_name = dir->d_name;
  954. if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
  955. rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
  956. }
  957. }
  958. }
  959. (void)closedir(d_handle);
  960. return rank_id_list;
  961. }
  962. void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
  963. std::string net_name = GetNetName();
  964. std::string dump_dir = GetDumpDir();
  965. for (uint32_t rank_id : rank_id_list) {
  966. std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
  967. std::string abspath = RealPath(path);
  968. DIR *d_handle_rank = opendir(abspath.c_str());
  969. if (d_handle_rank == nullptr) {
  970. MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
  971. continue;
  972. }
  973. struct dirent *direc = nullptr;
  974. while ((direc = readdir(d_handle_rank)) != nullptr) {
  975. struct stat st;
  976. std::string name = abspath + std::string("/") + std::string(direc->d_name);
  977. int ret = stat(name.c_str(), &st);
  978. if (ret != 0) {
  979. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  980. (void)closedir(d_handle_rank);
  981. return;
  982. } else if (S_ISDIR(st.st_mode)) {
  983. std::string graph_dir = direc->d_name;
  984. if (graph_dir == "." || graph_dir == "..") {
  985. continue;
  986. }
  987. if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
  988. uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
  989. ReadGraphsHistory(rank_id, graph_id);
  990. }
  991. }
  992. }
  993. (void)closedir(d_handle_rank);
  994. }
  995. }
// Populates graphs_run_history_ for every (rank, graph) pair found under the
// dump directory by reading each graph's execution-order history file.
void DebugServices::SetGraphsHistory() {
  // extract rank_id_list
  std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
  // for each rank_id extract the graph_id list and set the dump version
  // and for each graph read the graph history file
  CheckDumpGraphIdList(rank_id_list);
}
  1003. void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
  1004. std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
  1005. if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
  1006. // graph history was already stored for this rank_id and graph_id
  1007. return;
  1008. }
  1009. std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
  1010. std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
  1011. DIR *d_handle = opendir(exec_order_path.c_str());
  1012. if (d_handle == nullptr) {
  1013. MS_LOG(ERROR) << "Execution order directory does not exist.";
  1014. return;
  1015. }
  1016. // read file and store the info
  1017. std::string full_path = exec_order_path + "/" + file_to_check;
  1018. std::string checked_path = RealPath(full_path);
  1019. if (!checked_path.empty()) {
  1020. ReadGraphRunIter(checked_path, rank_and_graph);
  1021. }
  1022. (void)closedir(d_handle);
  1023. }
  1024. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
  1025. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
  1026. for (auto w_table_item : watchpoint_table_) {
  1027. auto wp = std::get<1>(w_table_item);
  1028. unsigned int index = 0;
  1029. for (auto check_node : wp.check_node_list) {
  1030. std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
  1031. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1032. // graph represents root_graph for Ascend and kernel_graph for GPU
  1033. for (auto rank : ranks) {
  1034. for (auto graph : graphs) {
  1035. std::tuple<uint32_t, uint32_t> key(rank, graph);
  1036. (rank_and_graph_to_nodes)[key].push_back(check_node);
  1037. }
  1038. }
  1039. index++;
  1040. }
  1041. }
  1042. return rank_and_graph_to_nodes;
  1043. }
// Reads one graph's execution-history CSV (one iteration number per line) and
// caches the resulting list in graphs_run_history_ under the (rank, graph) key.
void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
  std::ifstream infile;
  std::string line;
  infile.open(file_path.c_str());
  if (!infile.is_open()) {
    MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
    const int kMaxFilenameLength = NAME_MAX;
    char err_info[kMaxFilenameLength];
    // NOTE(review): the comparison with kStrErrorNone assumes a particular
    // strerror_r variant (XSI int-return vs GNU char*-return); the second call
    // logs the raw return value, not err_info — confirm which variant the
    // toolchain selects and whether err_info should be logged instead.
    if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
      MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
    }
    return;
  }
  std::vector<uint32_t> run_iters_vec;
  // each CSV line holds a single iteration number
  while (std::getline(infile, line)) {
    uint32_t iter;
    std::stringstream ss(line);
    ss >> iter;
    run_iters_vec.push_back(iter);
  }
  // emplace is a no-op if an entry for this (rank, graph) already exists
  (void)graphs_run_history_.emplace(
    std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
}
// Wraps one dumped tensor into a TensorData object, registers it with the
// tensor loader when it actually carries data, and appends it to *result_list.
// `buffer` may be nullptr for metadata-only entries.
// NOTE(review): TensorData stores the raw pointer buffer->data(), so the caller
// must keep *buffer alive for as long as the TensorData is used — confirm the
// ownership contract with the tensor loader.
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                    const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer,
                                    std::vector<std::shared_ptr<TensorData>> *const result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  tensor_data->SetIsOutput(is_output);
  if (buffer != nullptr) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(nullptr);
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  tensor_data->SetTimeStamp(time_stamp);
  tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  // only tensors with actual data are registered in the loader cache
  if (data_size) {
    (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  }
  // add to result_list
  result_list->push_back(tensor_data);
}
  1098. void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
  1099. std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
  1100. std::string dump_style_name_part = *dump_style_kernel_name;
  1101. dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
  1102. std::string slot_str;
  1103. if (is_output) {
  1104. slot_str = ".output." + std::to_string(slot);
  1105. } else {
  1106. slot_str = ".input." + std::to_string(slot);
  1107. }
  1108. dump_style_name_part += slot_str;
  1109. *prefix_dump_file_name = dump_style_name_part;
  1110. *slot_string_to_check = slot_str;
  1111. }
  1112. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  1113. // get file with the newest timestamp from the list.
  1114. if (file_list.empty()) {
  1115. return "";
  1116. }
  1117. std::sort(file_list.begin(), file_list.end());
  1118. return file_list.back();
  1119. }
  1120. std::string GetTimeStampStr(std::string file_path) {
  1121. // get the file_name from file_path.
  1122. size_t pos = file_path.rfind("/");
  1123. std::string file_name = file_path.substr(pos + 1);
  1124. size_t first_dot = file_name.rfind(".");
  1125. size_t second_dot = file_name.rfind(".", first_dot - 1);
  1126. size_t third_dot = file_name.rfind(".", second_dot - 1);
  1127. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  1128. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  1129. if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
  1130. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  1131. return time_stamp;
  1132. }
  1133. return "";
  1134. }
// Reads the dumped data for each requested tensor and appends the results to
// *result_list. All argument vectors are parallel arrays indexed by tensor.
// Per tensor, the dump directory is derived from device_id/root_graph_id/
// iteration, then the read is dispatched to the sync- or async-dump reader
// depending on is_sync_mode_. *no_mem_to_read is set by the readers when host
// memory is insufficient.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *const result_list,
                                     bool *no_mem_to_read) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    // prefix_dump_file_name = "<node>.output.<slot>" (or .input.), slot_string_to_check = suffix only
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    if (is_sync_mode_) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
                            no_mem_to_read);
    }
  }
}
  1164. void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
  1165. const std::string &backend_name, const unsigned int device_id,
  1166. const unsigned int root_graph_id, const bool &is_output, size_t slot,
  1167. bool *no_mem_to_read, unsigned int iteration,
  1168. std::vector<std::shared_ptr<TensorData>> *result_list) {
  1169. std::string time_stamp = "";
  1170. std::string type_name = "";
  1171. size_t data_size = 0;
  1172. std::vector<int64_t> shape;
  1173. std::vector<char> *buffer = nullptr;
  1174. if (found) {
  1175. std::string result_path = GetNewestFilePath(matched_paths);
  1176. time_stamp = GetTimeStampStr(result_path);
  1177. std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
  1178. std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
  1179. std::to_string(slot);
  1180. ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
  1181. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
  1182. type_name, shape, buffer, result_list);
  1183. } else {
  1184. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
  1185. buffer, result_list);
  1186. MS_LOG(INFO) << "Target tensor has not been found.";
  1187. }
  1188. }
// Sync-dump mode reader: scan `specific_dump_dir` for regular files whose
// stripped name (task/stream/timestamp removed) starts with
// `prefix_dump_file_name`, then hand all matches to ReadFileAndAddToTensor,
// which loads the newest one (or records a placeholder when none matched).
void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                                         const std::string &backend_name, size_t slot, const unsigned int device_id,
                                         unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                                         std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  std::string abspath = RealPath(specific_dump_dir);
  DIR *d = opendir(abspath.c_str());
  bool found_file = false;
  std::vector<std::string> matched_paths;
  if (d == nullptr) {
    MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      struct stat st;
      std::string name = abspath + std::string("/") + std::string(dir->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        // On stat failure, abort the whole scan (directory handle is closed
        // before the early return to avoid a leak).
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d);
        return;
      } else if (S_ISREG(st.st_mode)) {
        std::string file_name = dir->d_name;
        // Strip task/stream ids and the timestamp so the prefix comparison
        // only sees the node-name and slot parts of the file name.
        std::string stripped_file_name = GetStrippedFilename(file_name);
        if (stripped_file_name.empty()) {
          continue;
        }
        // rfind(prefix, 0) == 0  <=>  stripped name starts with the prefix.
        std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
        if (found != 0) {
          continue;
        }
        std::string full_path = specific_dump_dir + "/" + file_name;
        matched_paths.push_back(full_path);
        found_file = true;
      }
    }
    (void)closedir(d);
  }
  // Always called: when no file matched it records an empty placeholder.
  ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
                         no_mem_to_read, iteration, result_list);
}
  1229. void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  1230. const std::string &slot_string_to_check, const std::string &backend_name,
  1231. size_t slot, unsigned int device_id, unsigned int iteration,
  1232. unsigned int root_graph_id, const bool &is_output,
  1233. const std::vector<std::string> &async_file_pool,
  1234. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1235. bool found = false;
  1236. std::vector<std::string> matched_paths;
  1237. // if async mode
  1238. for (const std::string &file_path : async_file_pool) {
  1239. if (file_path.find(specific_dump_dir) != std::string::npos &&
  1240. file_path.find(prefix_dump_to_check) != std::string::npos &&
  1241. file_path.find(slot_string_to_check) != std::string::npos) {
  1242. matched_paths.push_back(file_path);
  1243. found = true;
  1244. }
  1245. }
  1246. ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
  1247. iteration, result_list);
  1248. }
// Strip the task_id, stream_id and timestamp fields out of a dump file name
// so it can be compared against a "node_name.{input|output}.slot" prefix.
// Canonical layout (8 dots): op_type.op_name.task.stream.timestamp.io.slot.format.npy
// The local names (second/fifth/seventh_dot) refer to front-counted positions
// in that canonical layout, but the dots are located by counting from the END
// of the string, because op_name itself may contain dots.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // seventh_dot = 2nd dot from the back (before the format field).
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  // fifth_dot = 4th dot from the back (before the "output"/"input" field).
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  // Walk three more dots back from fifth_dot to land on the dot that ends
  // op_name (the "second dot" of the canonical layout).
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  if (second_dot == std::string::npos || second_dot <= first_dot) {
    return std::string();
  }
  // Keep op_name (between the first dot and the dot ending op_name) ...
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  // ... plus ".{input|output}.slot", dropping task/stream/timestamp between.
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
  1271. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  1272. unsigned int iteration, std::vector<std::string> *const async_file_pool, bool error_on_no_value) {
  1273. // get a list of nodes and the devices they are on to monitor
  1274. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1275. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
  1276. GetAllWpNodes();
  1277. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  1278. // as they are found
  1279. for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
  1280. std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
  1281. uint32_t rank_id = std::get<0>(rank_and_graph);
  1282. uint32_t root_graph_id = std::get<1>(rank_and_graph);
  1283. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
  1284. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  1285. std::string real_dump_dir = RealPath(specific_dump_dir);
  1286. if (real_dump_dir.empty()) {
  1287. MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skit it.";
  1288. continue;
  1289. }
  1290. std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
  1291. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  1292. // convert node names to dump style
  1293. for (auto node : wp_nodes) {
  1294. std::string orig_name = std::get<0>(node);
  1295. // Remove the scope from the fully qualified name to compare for both sync and async case.
  1296. std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
  1297. bool node_is_out = std::get<1>(node);
  1298. if (node_is_out) {
  1299. dump_style_name += ".output";
  1300. } else {
  1301. dump_style_name += ".input";
  1302. }
  1303. if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
  1304. std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
  1305. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  1306. }
  1307. }
  1308. if (is_sync_mode_) {
  1309. // search files in dir for the one that meets the filename prefix and read the file into memory
  1310. ProcessTensorDataSync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
  1311. error_on_no_value);
  1312. } else {
  1313. // convert all files in proto_to_dump to npy and add to pool of async file names
  1314. ConvertWatchPointNodes(proto_to_dump, real_dump_dir, async_file_pool);
  1315. GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
  1316. &tensor_list);
  1317. }
  1318. }
  1319. return tensor_list;
  1320. }
  1321. void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
  1322. const std::string &specific_dump_dir, unsigned int iteration,
  1323. unsigned int device_id, unsigned int root_graph_id,
  1324. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  1325. bool error_on_no_value) {
  1326. DIR *d = opendir(specific_dump_dir.c_str());
  1327. if (d == nullptr) {
  1328. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ProcessTensorDataSync.";
  1329. } else {
  1330. struct dirent *dir = nullptr;
  1331. while ((dir = readdir(d)) != nullptr) {
  1332. struct stat st;
  1333. std::string name = specific_dump_dir + std::string("/") + std::string(dir->d_name);
  1334. int ret = stat(name.c_str(), &st);
  1335. if (ret != 0) {
  1336. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  1337. (void)closedir(d);
  1338. return;
  1339. } else if (S_ISREG(st.st_mode)) {
  1340. std::string file_name = dir->d_name;
  1341. for (auto &node : proto_to_dump) {
  1342. std::string dump_name = std::get<1>(node);
  1343. std::string stripped_file_name = GetStrippedFilename(file_name);
  1344. if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
  1345. continue;
  1346. }
  1347. std::size_t found = stripped_file_name.rfind(dump_name, 0);
  1348. if (found == 0) {
  1349. size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
  1350. std::vector<int64_t> shape;
  1351. std::string orig_name = std::get<0>(node);
  1352. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  1353. bool output_flag = (output_str == "output");
  1354. AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
  1355. nullptr, tensor_list);
  1356. break;
  1357. }
  1358. }
  1359. }
  1360. }
  1361. (void)closedir(d);
  1362. }
  1363. }
  1364. std::string DebugServices::IterationString(unsigned int iteration) {
  1365. std::string iteration_string;
  1366. bool init_dbg_suspend = (iteration == UINT_MAX);
  1367. if (init_dbg_suspend) {
  1368. iteration_string = "init";
  1369. } else {
  1370. iteration_string = std::to_string(iteration);
  1371. }
  1372. return iteration_string;
  1373. }
  1374. #endif
  1375. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1376. std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1377. std::vector<unsigned int> *const dtype,
  1378. std::vector<std::vector<int64_t>> *const shape) {
  1379. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1380. tensor_loader_->SearchTensors(name, &result_list);
  1381. for (auto result : result_list) {
  1382. if (std::get<1>(result) == nullptr) {
  1383. continue;
  1384. }
  1385. #ifdef ONLINE_DBG_MODE
  1386. if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
  1387. MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
  1388. << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId()
  1389. << ".";
  1390. MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
  1391. }
  1392. #endif
  1393. (void)ret_name->emplace_back(std::get<0>(result));
  1394. (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
  1395. (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
  1396. (void)dtype->emplace_back(std::get<1>(result)->GetType());
  1397. (void)shape->emplace_back(std::get<1>(result)->GetShape());
  1398. }
  1399. }
// Guarded delegate to the tensor loader's search: a null output vector is a
// caller-side programming error, so log at DEBUG and return without work.
void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
                                       std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  if (result_list == nullptr) {
    MS_LOG(DEBUG) << "result_list is nullptr.";
    return;
  }
  tensor_loader_->SearchTensors(name, result_list);
}
  1408. #ifdef ONLINE_DBG_MODE
  1409. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1410. bool ret = false;
  1411. for (auto w_table_item : watchpoint_table_) {
  1412. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1413. for (auto check_node : check_node_list) {
  1414. std::string w_name = std::get<0>(check_node);
  1415. bool w_type = std::get<1>(check_node);
  1416. if ((w_type == true &&
  1417. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1418. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1419. ret = true;
  1420. return ret;
  1421. }
  1422. }
  1423. }
  1424. return ret;
  1425. }
  1426. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1427. if (kernel != nullptr && w_name.length() > 0) {
  1428. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  1429. for (size_t j = 0; j < input_size; ++j) {
  1430. auto input_kernel = kernel->input(j + 1);
  1431. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1432. auto found = w_name.find_last_of('/');
  1433. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1434. return true;
  1435. }
  1436. return false;
  1437. } else {
  1438. return false;
  1439. }
  1440. }
  1441. #endif
// Return a snapshot of all tensors currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Look up a single tensor by its cache name (e.g. "node_name:slot", as built
// by GetNodeTensor); returns nullptr when absent.
std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
  return tensor_loader_->GetTensor(tensor_name);
}
// Drop all entries from the loader's "current" tensor map.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1447. #ifdef ONLINE_DBG_MODE
// Delegate tensor-to-file dumping to the tensor loader.
// NOTE(review): parameters mirror TensorLoader::DumpTensorToFile; see that
// declaration for their exact semantics (not visible from this file).
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  1455. #endif
// Insert a freshly loaded tensor into the loader cache; `keep_prev` is
// forwarded to the loader unchanged. Returns the loader's success flag.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
  1459. uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  1460. uint32_t prev_iter;
  1461. uint32_t rank_id = tensor->GetDeviceId();
  1462. uint32_t root_graph_id = tensor->GetRootGraphId();
  1463. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  1464. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  1465. return UINT32_MAX;
  1466. }
  1467. auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
  1468. tensor->GetIteration());
  1469. if (it == graphs_run_history_[rank_and_graph].end()) {
  1470. // The graph is not executed in that iteration
  1471. return UINT32_MAX;
  1472. } else if (it == graphs_run_history_[rank_and_graph].begin()) {
  1473. // current iteration is the first iteration that the graph was run
  1474. // no prev iter is available
  1475. MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
  1476. << " is the first run iteration for tensor: " << tensor->GetName();
  1477. return UINT32_MAX;
  1478. }
  1479. it--;
  1480. prev_iter = *it;
  1481. tensor->SetPrevIteration(prev_iter);
  1482. return prev_iter;
  1483. }
// Reset per-iteration debugger state: clear watchpoint-hit ids and cached
// overflow results, and rotate the tensor loader's current/previous maps so
// parameters survive into the next iteration.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1493. #ifdef ONLINE_DBG_MODE
  1494. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1495. MS_EXCEPTION_IF_NULL(kernel);
  1496. std::vector<std::shared_ptr<TensorData>> result;
  1497. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1498. auto kernel_name = GetKernelNodeName(kernel);
  1499. for (size_t j = 0; j < output_size; ++j) {
  1500. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1501. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1502. if (tensor != nullptr) {
  1503. result.push_back(tensor);
  1504. }
  1505. }
  1506. return result;
  1507. }
  1508. #endif
// Resolve the operator-overflow dump directory for the current iteration.
// Returns "" when dump is unconfigured, no graph is active, or the path
// cannot be resolved. In offline builds (ONLINE_DBG_MODE undefined) this is
// always the empty string.
std::string GetOnlineOpOverflowDir() {
  // only called for online debugger mode
  // get operator overflow directory for current iteration
  std::string overflow_bin_path = "";
#ifdef ONLINE_DBG_MODE
  if (DumpJsonParser::GetInstance().path().empty()) {
    MS_LOG(INFO) << "Dump config is not set.";
    return "";
  }
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  auto cur_graph = debugger->GetGraphPtr();
  if (cur_graph == nullptr) {
    return "";
  }
  overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(cur_graph->root_graph_id());
  auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
  if (!realpath.has_value()) {
    MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
    return "";
  }
  // Trailing '/' so callers can append file names directly.
  overflow_bin_path = realpath.value() + '/';
#endif
  return overflow_bin_path;
}
// Scan the overflow bin directory and collect the names of operators that
// overflowed. Two file kinds are correlated by (task_id, stream_id):
//  - "Opdebug.Node_OpDebug.*" files mark that an overflow HIT occurred on a
//    task/stream pair;
//  - regular dump bin files map a task/stream pair to an operator name.
// Every hit whose pair maps to a known name is appended to *op_names.
void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, std::vector<std::string> *op_names) {
  MS_EXCEPTION_IF_NULL(op_names);
  std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  DIR *d = opendir(overflow_bin_path.c_str());
  if (d == nullptr) {
    MS_LOG(INFO) << "OverFlow bin directory does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      struct stat st;
      std::string name = overflow_bin_path + std::string("/") + std::string(dir->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d);
        return;
      } else if (S_ISREG(st.st_mode)) {
        // form fully qualified filename
        std::string file_path = name;
        std::string file_name = dir->d_name;
        // attempt to read the file
        // NOTE(review): the file is opened but its contents are never read —
        // the open presumably just verifies readability before the name is
        // parsed. Confirm whether the open can be dropped.
        std::ifstream infile;
        infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
        if (!infile.is_open()) {
          MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
          continue;
        }
        std::string node_name;
        uint64_t task_id = 0;
        uint64_t stream_id = 0;
        // detect overflow bin file
        if (file_name.rfind(overflow_file_prefix, 0) == 0) {
          // Overflow marker: record the hit's (task, stream) pair.
          if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
            continue;
          }
          MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
                       << ".";
          task_stream_hit.push_back(std::make_pair(task_id, stream_id));
        } else {
          // regular bin file: map the (task, stream) pair to its op name.
          bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
          if (success_parse) {
            task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
          }
        }
        infile.close();
      }
    }
    (void)closedir(d);
  }
  // find the op_names with an overflow hit
  for (auto &task_stream : task_stream_hit) {
    auto op_name = task_stream_to_opname[task_stream];
    if (!op_name.empty()) {
      MS_LOG(INFO) << "Operation overflow detected in " << op_name;
      op_names->push_back(op_name);
    }
  }
}
  1596. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1597. unsigned int iteration) {
  1598. std::string overflow_bin_path = "";
  1599. #ifdef ONLINE_DBG_MODE
  1600. overflow_bin_path = GetOnlineOpOverflowDir();
  1601. #else
  1602. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1603. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1604. overflow_bin_path = RealPath(overflow_bin_path);
  1605. #endif
  1606. if (overflow_bin_path.empty()) {
  1607. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1608. return false;
  1609. }
  1610. // remove kernel_graph_#
  1611. std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
  1612. std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
  1613. // remove path
  1614. size_t last_slash = node_name_to_find.rfind("/");
  1615. std::string op_name_find = "";
  1616. if (last_slash != std::string::npos) {
  1617. op_name_find = node_name_to_find.substr(last_slash + 1);
  1618. }
  1619. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1620. std::vector<std::string> op_names;
  1621. overflow_wp_lock_.lock();
  1622. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1623. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1624. if (found_overflows != overflow_ops_.end()) {
  1625. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1626. op_names = overflow_ops_[overflow_bin_path];
  1627. } else {
  1628. AddOpOverflowOpNames(overflow_bin_path, &op_names);
  1629. overflow_ops_[overflow_bin_path] = op_names;
  1630. }
  1631. overflow_wp_lock_.unlock();
  1632. // determine if overflow wp has been triggered for the op name with path (from bin file)
  1633. if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
  1634. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1635. return true;
  1636. }
  1637. // determine if overflow wp has been triggered for the op name (from npy file)
  1638. if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
  1639. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1640. return true;
  1641. }
  1642. return false;
  1643. }
  1644. std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) {
  1645. std::string op_name_to_find = node_name_to_find;
  1646. const std::string kernel_prefix = "kernel_graph_";
  1647. if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
  1648. auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
  1649. if (start_of_op_name != std::string::npos) {
  1650. op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
  1651. }
  1652. }
  1653. return op_name_to_find;
  1654. }
  1655. bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *task_id,
  1656. uint64_t *stream_id) {
  1657. size_t task_pos_start = overflow_file_prefix.length();
  1658. size_t task_pos_end = file_name.find(".", task_pos_start);
  1659. if (task_pos_end == std::string::npos) {
  1660. MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
  1661. return false;
  1662. }
  1663. size_t stream_pos_start = task_pos_end + 1;
  1664. size_t stream_pos_end = file_name.find(".", stream_pos_start);
  1665. if (stream_pos_end == std::string::npos) {
  1666. MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
  1667. return false;
  1668. }
  1669. std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
  1670. std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
  1671. *task_id = std::stoull(task_id_str);
  1672. *stream_id = std::stoull(stream_id_str);
  1673. return true;
  1674. }
  1675. bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
  1676. uint64_t *stream_id) {
  1677. // get the node_name, task_id, and stream_id from dump filename
  1678. // node_type.node_name.task_id.stream_id.{etcetera}
  1679. size_t first_dot = file_name.find(".");
  1680. size_t second_dot = file_name.find(".", first_dot + 1);
  1681. size_t third_dot = file_name.find(".", second_dot + 1);
  1682. size_t fourth_dot = file_name.find(".", third_dot + 1);
  1683. // check if dots were found
  1684. if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
  1685. fourth_dot == std::string::npos) {
  1686. return false;
  1687. }
  1688. // get node_name
  1689. if (first_dot < second_dot) {
  1690. *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  1691. } else {
  1692. MS_LOG(ERROR) << "filename parse error to get node_name.";
  1693. return false;
  1694. }
  1695. // get task id
  1696. if (second_dot < third_dot) {
  1697. std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
  1698. try {
  1699. *task_id = std::stoull(extracted_task_id);
  1700. } catch (std::invalid_argument &e) {
  1701. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
  1702. return false;
  1703. } catch (std::out_of_range &e) {
  1704. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
  1705. return false;
  1706. }
  1707. } else {
  1708. MS_LOG(ERROR) << "filename parse error to get task_id.";
  1709. return false;
  1710. }
  1711. // get stream id
  1712. if (third_dot < fourth_dot) {
  1713. std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
  1714. try {
  1715. *stream_id = std::stoull(extracted_stream_id);
  1716. } catch (std::invalid_argument &e) {
  1717. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
  1718. return false;
  1719. } catch (std::out_of_range &e) {
  1720. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
  1721. return false;
  1722. }
  1723. } else {
  1724. MS_LOG(ERROR) << "filename parse error to get stream_id.";
  1725. return false;
  1726. }
  1727. return true;
  1728. }
// Canonicalize `input_path` via realpath(3). The directory part must exist;
// the final file component need not (it is re-appended after resolving the
// directory), which lets callers resolve paths for files about to be created.
// Returns "" when the directory does not exist; throws via MS_LOG(EXCEPTION)
// on over-long paths or file names.
std::string DebugServices::RealPath(const std::string &input_path) {
  if (input_path.length() >= PATH_MAX) {
    MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  }
  size_t path_split_pos = input_path.find_last_of('/');
  // get real path
  char real_path[PATH_MAX] = {0};
  // input_path is dir + file_name
  if (path_split_pos != std::string::npos) {
    std::string prefix_path = input_path.substr(0, path_split_pos);
    // file_name keeps its leading '/', so it can be appended directly below.
    std::string file_name = input_path.substr(path_split_pos);
    if (file_name.length() > NAME_MAX) {
      MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
    }
    // Only the directory is resolved; the file itself may not exist yet.
    if (realpath(prefix_path.c_str(), real_path) == nullptr) {
      MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
      return "";
    }
    return std::string(real_path) + file_name;
  }
  // input_path is only file_name
  if (input_path.length() > NAME_MAX) {
    MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  }
  // On failure real_path stays zero-filled, so an empty string is returned.
  if (realpath(input_path.c_str(), real_path) == nullptr) {
    MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  }
  return std::string(real_path);
}
  1758. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  1759. #if defined(__APPLE__)
  1760. return *reinterpret_cast<const uint64_t *>(buffer.data());
  1761. #else
  1762. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  1763. #endif
  1764. }
// Whether `tensor_name` is present in the loader's "current" tensor map.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Demote a tensor from the loader's "current" map to its "previous" map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Queue a tensor for cache eviction — a no-op unless the loader's memory
// control is enabled.
void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  if (tensor_loader_->EnableMemoryControl()) {
    tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  }
}
  1776. void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
  1777. std::string DebugServices::GetNetName() { return net_name_; }
  1778. void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
  1779. std::string DebugServices::GetDumpDir() { return dump_dir_; }
  1780. void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
  1781. bool DebugServices::GetSyncMode() { return is_sync_mode_; }
  1782. void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  1783. #ifdef ONLINE_DBG_MODE
  1784. } // namespace mindspore
  1785. #endif