You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 99 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173
  1. /**
  2. * Copyright 2019-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include <regex>
  29. #include "pybind11/embed.h"
  30. #include "pybind11/stl.h"
  31. #ifdef ONLINE_DBG_MODE
  32. #include "debug/common.h"
  33. #include "debug/debugger/debugger.h"
  34. #include "debug/anf_ir_utils.h"
  35. #include "backend/common/session/anf_runtime_algorithm.h"
  36. #endif
  37. #include "nlohmann/json.hpp"
  38. #include "debug/debugger/tensor_summary.h"
  39. #include "utils/file_utils.h"
  40. #include "climits"
  41. #ifdef ONLINE_DBG_MODE
  42. namespace mindspore {
  43. #endif
  44. static constexpr const char *constant_prefix = "Default--data-";
  45. static constexpr const char *kNpyExt = ".npy";
  46. namespace {
  47. #ifdef __APPLE__
  48. constexpr int kStrErrorNone = 0;
  49. #else
  50. constexpr char *kStrErrorNone = nullptr;
  51. #endif
  52. } // namespace
  53. bool IsRegFile(const std::string &file_path) {
  54. struct stat st;
  55. int ret = stat(file_path.c_str(), &st);
  56. if (ret != 0) {
  57. MS_LOG(ERROR) << "stat error for " << file_path << ", ret is: " << ret;
  58. return false;
  59. }
  60. return S_ISREG(st.st_mode);
  61. }
  62. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  63. DebugServices::DebugServices(const DebugServices &other) {
  64. wp_id_cache_ = other.wp_id_cache_;
  65. net_name_ = other.net_name_;
  66. dump_dir_ = other.dump_dir_;
  67. is_sync_mode_ = other.is_sync_mode_;
  68. tensor_loader_ = other.tensor_loader_;
  69. watchpoint_table_ = other.watchpoint_table_;
  70. }
  71. DebugServices &DebugServices::operator=(const DebugServices &other) {
  72. if (this != &other) {
  73. tensor_loader_ = other.tensor_loader_;
  74. watchpoint_table_ = other.watchpoint_table_;
  75. }
  76. return *this;
  77. }
  78. /*
  79. * Feature group: Online debugger, Offline debugger.
  80. * Target device group: Ascend, GPU.
  81. * Runtime category: Old runtime, MindRT.
  82. * Description: Create a watchpoint_t object and set the watchpoint's variables and add the watchpoint to the
  83. * watchpoint_table.
  84. */
  85. void DebugServices::AddWatchpoint(
  86. unsigned int id, int watch_condition, float parameter,
  87. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  88. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  89. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  90. std::lock_guard<std::mutex> lg(lock_);
  91. watchpoint_t watchpoint_item;
  92. watchpoint_item.id = id;
  93. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  94. watchpoint_item.condition.parameter = parameter;
  95. watchpoint_item.check_node_list = check_node_list;
  96. // For offline debugger check_node_device_list is not nullptr.
  97. if (check_node_device_list != nullptr) {
  98. watchpoint_item.check_node_device_list = *check_node_device_list;
  99. }
  100. // For offline debugger check_node_graph_list is not nullptr.
  101. if (check_node_graph_list != nullptr) {
  102. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  103. }
  104. watchpoint_item.parameter_list = parameter_list;
  105. watchpoint_table_[id] = watchpoint_item;
  106. }
// Removes the watchpoint with the given id from watchpoint_table_ under lock_.
// No-op if the id is not present (erase on a missing key returns 0).
void DebugServices::RemoveWatchpoint(unsigned int id) {
  std::lock_guard<std::mutex> lg(lock_);
  (void)watchpoint_table_.erase(id);
}
  111. /*
  112. * Feature group: Online debugger, Offline debugger.
  113. * Target device group: Ascend, GPU.
  114. * Runtime category: Old runtime, MindRT.
  115. * Description: Returns a tensor summary unique pointer based on the given tensor_dtype, returns nullptr if the type is
  116. * not supported.
  117. */
  118. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  119. const void *const previous_tensor_ptr, uint64_t num_elements,
  120. uint64_t prev_num_elements, int tensor_dtype) {
  121. MS_EXCEPTION_IF_NULL(tensor);
  122. switch (tensor_dtype) {
  123. case DbgDataType::DT_UINT8: {
  124. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  125. prev_num_elements);
  126. }
  127. case DbgDataType::DT_INT8: {
  128. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  129. prev_num_elements);
  130. }
  131. case DbgDataType::DT_UINT16: {
  132. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  133. prev_num_elements);
  134. }
  135. case DbgDataType::DT_INT16: {
  136. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  137. prev_num_elements);
  138. }
  139. case DbgDataType::DT_UINT32: {
  140. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  141. prev_num_elements);
  142. }
  143. case DbgDataType::DT_INT32:
  144. case DbgDataType::DT_BASE_INT: {
  145. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  146. prev_num_elements);
  147. }
  148. case DbgDataType::DT_UINT64: {
  149. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  150. prev_num_elements);
  151. }
  152. case DbgDataType::DT_INT64: {
  153. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  154. prev_num_elements);
  155. }
  156. case DbgDataType::DT_FLOAT16: {
  157. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  158. prev_num_elements);
  159. }
  160. case DbgDataType::DT_FLOAT32:
  161. case DbgDataType::DT_BASE_FLOAT: {
  162. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  163. prev_num_elements);
  164. }
  165. case DbgDataType::DT_FLOAT64: {
  166. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  167. prev_num_elements);
  168. }
  169. case DbgDataType::DT_BOOL: {
  170. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  171. prev_num_elements);
  172. }
  173. default:
  174. MS_LOG(INFO) << "Unsupported tensor type";
  175. // return a null pointer
  176. return std::unique_ptr<TensorSummary<int32_t>>{};
  177. }
  178. }
  179. /*
  180. * Feature group: Online debugger, Offline debugger.
  181. * Target device group: Ascend, GPU.
  182. * Runtime category: Old runtime, MindRT.
  183. * Description: Returns TensorStat for the given tensor based on the base_summary_ptr.
  184. */
  185. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  186. if (tensor == nullptr) {
  187. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  188. TensorStat empty_tensor_stat_data;
  189. return empty_tensor_stat_data;
  190. }
  191. std::unique_ptr<ITensorSummary> base_summary_ptr;
  192. void *previous_tensor_ptr = nullptr;
  193. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  194. if (base_summary_ptr == nullptr) {
  195. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  196. TensorStat empty_tensor_stat_data;
  197. return empty_tensor_stat_data;
  198. }
  199. base_summary_ptr->TensorStatistics(tensor->GetType());
  200. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  201. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  202. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  203. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  204. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  205. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  206. return tensor_stat_data;
  207. }
  208. #ifdef OFFLINE_DBG_MODE
  209. /*
  210. * Feature group: Offline debugger.
  211. * Target device group: Ascend, GPU.
  212. * Runtime category: Old runtime, MindRT.
  213. * Description: Returns previous_tensor_ptr if graph hisotry file is found and the current iteration is not the first
  214. * run iteration for tensor's graph.
  215. */
  216. const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
  217. uint64_t *prev_num_elements, bool *history_not_found) {
  218. MS_EXCEPTION_IF_NULL(tensor);
  219. const void *previous_tensor_ptr = nullptr;
  220. std::shared_ptr<TensorData> tensor_prev;
  221. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
  222. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  223. *history_not_found = 1;
  224. MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
  225. } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
  226. // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
  227. // read data in offline mode
  228. AsyncFilePool file_paths;
  229. if (!is_sync_mode_) {
  230. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  231. std::vector<unsigned int>{tensor->GetDeviceId()},
  232. std::vector<unsigned int>{tensor->GetPrevIteration()},
  233. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  234. }
  235. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  236. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  237. std::vector<unsigned int>{tensor->GetDeviceId()},
  238. std::vector<unsigned int>{tensor->GetPrevIteration()},
  239. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  240. file_paths, &result_list_prev);
  241. tensor_prev = result_list_prev[0];
  242. if (!tensor_prev->GetByteSize()) {
  243. tensor_prev.reset();
  244. } else {
  245. previous_tensor_ptr = tensor_prev->GetDataPtr();
  246. *prev_num_elements = tensor_prev->GetNumElements();
  247. }
  248. }
  249. return previous_tensor_ptr;
  250. }
  251. #endif
  252. /*
  253. * Feature group: Offline debugger, Online debugger.
  254. * Target device group: Ascend, GPU.
  255. * Runtime category: Old runtime, MindRT.
  256. * Description: Goes through all the watchpoints in the watchpoint table. If the current tensor is in the list of
  257. * check_nodes, that watchpoint is added to the vector of watchpoint_to_check (vector of watchpoints that should be
  258. * checked for the current tensor) .
  259. */
  260. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  261. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  262. std::string *const qualified_tensor_name,
  263. std::vector<watchpoint_t> *const watchpoints_to_check) {
  264. if (tensor == nullptr) {
  265. MS_LOG(DEBUG) << "tensor is nullptr.";
  266. return;
  267. }
  268. const auto tensor_name = tensor->GetName();
  269. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  270. const auto tensor_device_id = tensor->GetDeviceId();
  271. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  272. for (auto w_table_item : watchpoint_table_) {
  273. auto wp = std::get<1>(w_table_item);
  274. // check ONLY init conditions on initial suspended state.
  275. // skip other conditions on initial suspended state
  276. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  277. continue;
  278. }
  279. // skip init condition if not init suspend
  280. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  281. continue;
  282. }
  283. // check change conditions only on step end.
  284. if (wp.change_condition() && !step_end) {
  285. continue;
  286. }
  287. // if recheck, ignore the cache results and reanalyze everything.
  288. // if not a recheck, check only unanalyzed tensors
  289. if (!recheck) {
  290. wp_lock_.lock();
  291. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  292. wp_lock_.unlock();
  293. if (wp_cache_hit) {
  294. continue;
  295. }
  296. }
  297. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  298. if (!found.empty()) {
  299. *qualified_tensor_name = found;
  300. watchpoints_to_check->push_back(w_table_item.second);
  301. #ifdef OFFLINE_DBG_MODE
  302. if (wp.change_condition()) {
  303. *previous_iter_tensor_needed = true;
  304. }
  305. #endif
  306. }
  307. }
  308. }
  309. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  310. const std::string &tensor_name) {
  311. // add analyzed tensor to cache
  312. if (!recheck) {
  313. wp_lock_.lock();
  314. (void)wp_id_cache_[tensor_name].insert(id);
  315. wp_lock_.unlock();
  316. }
  317. }
/*
 * Appends one watchpoint-hit record to the per-chunk result partitions.
 * Each chunk_* argument is a vector-of-vectors indexed by chunk_id; one entry
 * is pushed into every partition so the i-th elements across all partitions
 * describe the same hit. device_id/root_graph_id act as presence flags: the
 * corresponding per-chunk values are only recorded when the caller passed
 * non-null output vectors for them.
 */
void DebugServices::SetCheckWatchpointsResult(
  const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  // Device/root-graph ids are optional outputs; record them only if requested.
  if (device_id != nullptr) {
    (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  }
  if (root_graph_id != nullptr) {
    (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  }
  (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
}
  343. #ifdef OFFLINE_DBG_MODE
/*
 * Feature group: Offline debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Sets and checks the OUT_OF_MEMORY error_code (for memory limit feature) and NO_VALUE error_code (for
 * new python API feature). Sets checkwatchpoint results.
 * When either flag is set, every watchpoint that was scheduled for this tensor
 * gets a result record carrying the chosen error code (OUT_OF_MEMORY takes
 * precedence over NO_VALUE).
 * NOTE(review): watchpoints_to_check is taken by value (const vector copy);
 * a const& would avoid the copy but requires the matching header declaration
 * to change as well — TODO confirm and fix together with the header.
 */
void DebugServices::CheckOutofMemoryandNoValue(
  const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
  int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list) {
  bool set_is_needed = no_mem_to_read || error_on_no_value;
  int32_t error_code_to_set = 0;
  if (no_mem_to_read) {
    // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
    error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
  } else if (error_on_no_value) {
    error_code_to_set = ITensorSummary::NO_VALUE;
  }
  if (set_is_needed) {
    // Emit one result record per scheduled watchpoint with the error code.
    for (auto &wp : watchpoints_to_check) {
      SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
                                chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
                                chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
                                qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
                                parameter_list, error_code_to_set);
    }
  }
}
  380. /*
  381. * Feature group: Offline debugger.
  382. * Target device group: Ascend, GPU.
  383. * Runtime category: Old runtime, MindRT.
  384. * Description: After finishing checking watchpoint, set the tensor to not-in-use status (for memory control
  385. * feature) by pushing it to eviction candidate queue. So it can be evicted from memory anytime if the memory is
  386. * required by other nodes' checking. If previous_tensor exists, change their status in a pair.
  387. */
  388. void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
  389. // set the tensor into not-in-use status in tensor_loader.
  390. auto tensor_name = tensor->GetName();
  391. std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
  392. std::to_string(tensor->GetRootGraphId()) + ":" +
  393. std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
  394. AppendToCacheEvictQueue(key_name_in_cache);
  395. if (previous_tensor_ptr != nullptr) {
  396. AppendToCacheEvictQueue(key_name_in_cache + ":prev");
  397. }
  398. }
  399. #endif
  400. #ifdef ONLINE_DBG_MODE
  401. /*
  402. * Feature group: Online debugger.
  403. * Target device group: Ascend, GPU.
  404. * Runtime category: Old runtime, MindRT.
  405. * Description: Compares the current root graph id with the given graph id and returns false if they are not equal
  406. * for GPU mindRT and Ascend. Otherwise, it returns true. The objectives of this function are: 1) Check if tensor's
  407. * root_graph_id is different from current_root_graph_id and skip checkwatchpoint for the tensor if these values are
  408. * different. 2) Set prev_tensor_ptr to nullptr if current_root_graph_id is different from prev_root_graph_id. 3) Skip
  409. * reading tensor if tensor's root_graph_id is different from current_root_graph_id.
  410. */
  411. bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
  412. auto debugger = Debugger::GetInstance();
  413. auto ms_context = MsContext::GetInstance();
  414. MS_EXCEPTION_IF_NULL(ms_context);
  415. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  416. auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
  417. if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  418. device_target == kAscendDevice) {
  419. if (cur_root_graph_id != id) {
  420. return false;
  421. }
  422. }
  423. return true;
  424. }
  425. /*
  426. * Feature group: Online debugger.
  427. * Target device group: Ascend, GPU.
  428. * Runtime category: Old runtime, MindRT.
  429. * Description: Returns the previous tensor pointer if the current root graph id is equal to previous root graph id and
  430. * prev_tensor_data is not nullptr.
  431. */
  432. const void *DebugServices::PreparePrevTensor(uint64_t *prev_num_elements, const std::string &tensor_name) {
  433. std::shared_ptr<TensorData> prev_tensor_data;
  434. if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
  435. // not supporting watchpoints that need prev tensor for multi root graph networks.
  436. MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
  437. prev_tensor_data = nullptr;
  438. } else {
  439. prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
  440. }
  441. if (prev_tensor_data) {
  442. *prev_num_elements = prev_tensor_data->GetNumElements();
  443. return prev_tensor_data->GetDataPtr();
  444. }
  445. return nullptr;
  446. }
  447. #endif
  448. void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
  449. // check history error_code only for offline debugger
  450. if (history_not_found) {
  451. *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
  452. }
  453. }
  454. /*
  455. * Feature group: Offline debugger, Online debugger.
  456. * Target device group: Ascend, GPU.
  457. * Runtime category: Old runtime, MindRT.
  458. * Description: For all the tensors in the given chunk, reads the tensors, checks all the watchpoints and sets the
  459. * watchpoint hit result. Checkwatchpoint process might be affected by memory limit, whether the read tensor was
  460. * successfully and whether we have a multi root graph scenario. All of aforementioned checks are done in this function.
  461. */
  462. void DebugServices::CheckWatchpointsForTensor(
  463. partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  464. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  465. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  466. const std::vector<std::string> &op_overflows, const AsyncFilePool &async_file_pool,
  467. partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  468. int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  469. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  470. std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  471. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  472. int list_size = tensor_list->size();
  473. if (end > list_size) {
  474. end = list_size;
  475. }
  476. for (int i = begin; i < end; i++) {
  477. auto &tensor = (*tensor_list)[i];
  478. const auto tensor_name = tensor->GetName();
  479. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  480. const auto tensor_slot = std::to_string(tensor->GetSlot());
  481. std::vector<watchpoint_t> watchpoints_to_check;
  482. std::string qualified_tensor_name;
  483. bool previous_iter_tensor_needed = false;
  484. AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
  485. &qualified_tensor_name, &watchpoints_to_check);
  486. // no wp set on current tensor
  487. if (watchpoints_to_check.empty()) {
  488. continue;
  489. }
  490. #ifdef OFFLINE_DBG_MODE
  491. // read data in offline mode
  492. bool no_mem_to_read = false;
  493. std::vector<std::shared_ptr<TensorData>> result_list;
  494. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  495. std::vector<unsigned int>{tensor->GetDeviceId()},
  496. std::vector<unsigned int>{tensor->GetIteration()},
  497. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  498. async_file_pool, &result_list, &no_mem_to_read);
  499. tensor = result_list[0];
  500. if (!tensor->GetByteSize()) {
  501. CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_names,
  502. chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
  503. chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id,
  504. chunk_root_graph_id, device_id, root_graph_id, tensor->GetExecutionOrder(),
  505. tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, tensor->GetDeviceId(),
  506. tensor->GetRootGraphId(), std::vector<parameter_t>());
  507. tensor.reset();
  508. continue;
  509. }
  510. #endif
  511. // no elements to analyze
  512. if (tensor->GetByteSize() == 0) {
  513. continue;
  514. }
  515. (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
  516. int tensor_dtype = tensor->GetType();
  517. uint64_t num_elements = tensor->GetNumElements();
  518. uint64_t prev_num_elements = 0;
  519. const void *previous_tensor_ptr = nullptr;
  520. #ifdef OFFLINE_DBG_MODE
  521. bool history_not_found = 0;
  522. previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
  523. #else
  524. if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
  525. MS_LOG(DEBUG)
  526. << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
  527. << tensor->GetName();
  528. continue;
  529. }
  530. previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
  531. #endif
  532. std::unique_ptr<ITensorSummary> base_summary_ptr;
  533. if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
  534. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
  535. if (base_summary_ptr != nullptr) {
  536. base_summary_ptr->SummarizeTensor(watchpoints_to_check);
  537. }
  538. }
  539. for (auto &wp : watchpoints_to_check) {
  540. bool is_hit = false;
  541. int error_code = 0;
  542. std::vector<parameter_t> parameter_list = {};
  543. if (wp.condition.type == IS_OVERFLOW) {
  544. is_hit =
  545. CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
  546. } else if (base_summary_ptr != nullptr) {
  547. auto item = base_summary_ptr->IsWatchpointHit(wp);
  548. is_hit = std::get<ITensorSummary::eHitPos>(item);
  549. error_code = std::get<ITensorSummary::eErrorCodePos>(item);
  550. #ifdef OFFLINE_DBG_MODE
  551. CheckHistoryErrorCode(&error_code, history_not_found);
  552. #endif
  553. parameter_list = std::get<ITensorSummary::eParamListPos>(item);
  554. }
  555. AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
  556. if (is_hit || error_code) {
  557. SetCheckWatchpointsResult(
  558. chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
  559. chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
  560. root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
  561. tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
  562. }
  563. }
  564. #ifdef OFFLINE_DBG_MODE
  565. SetTensorToNotInUse(tensor, previous_tensor_ptr);
  566. // in offline mode remove the need for the data
  567. tensor.reset();
  568. #endif
  569. }
  570. }
  571. /*
  572. * Feature group: Offline debugger, Online debugger.
  573. * Target device group: Ascend, GPU.
  574. * Runtime category: Old runtime, MindRT.
  575. * Description: This function checks the watchpoints for the given tensor list by dividing the tensor list into chunks.
  576. * Each chunk is handled by a separate thread and then the result of check watchpoint for each thread is gathered and
  577. * sorted. In the end, the time for checking the watchpoint in the current step is reported.
  578. */
  579. void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
  580. std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
  581. std::vector<std::vector<parameter_t>> *const parameters,
  582. std::vector<int32_t> *const error_codes,
  583. const std::vector<std::string> &op_overflows, const AsyncFilePool &async_file_pool,
  584. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  585. const bool init_dbg_suspend, const bool step_end, const bool recheck,
  586. std::vector<unsigned int> *const device_id,
  587. std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  588. std::lock_guard<std::mutex> lg(lock_);
  589. auto t1 = std::chrono::high_resolution_clock::now();
  590. if (watchpoint_table_.empty()) {
  591. return;
  592. }
  593. // vector to store execution order of tensors hit
  594. std::vector<int> exec_order;
  595. std::vector<std::string> time_stamps;
  596. int tensor_list_size = tensor_list->size();
  597. uint64_t tensor_list_byte_size = 0;
  598. MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  599. if (tensor_list_size <= 0) {
  600. return;
  601. }
  602. // default value for number of threads
  603. const int default_thread_num = 16;
  604. int max_thread_num = default_thread_num;
  605. if (max_thread_num > tensor_list_size) {
  606. max_thread_num = tensor_list_size;
  607. }
  608. MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  609. int chunk_size = tensor_list_size / max_thread_num;
  610. int remainder = tensor_list_size % max_thread_num;
  611. partitioned_numbers chunk_exec_orders(max_thread_num);
  612. partitioned_names chunk_names(max_thread_num);
  613. partitioned_names chunk_slots(max_thread_num);
  614. partitioned_numbers chunk_conditions(max_thread_num);
  615. partitioned_id chunk_watchpoint_id(max_thread_num);
  616. partitioned_parameters chunk_parameters(max_thread_num);
  617. partitioned_error_code chunk_error_codes(max_thread_num);
  618. partitioned_id chunk_device_id(max_thread_num);
  619. partitioned_id chunk_root_graph_id(max_thread_num);
  620. std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  621. partitioned_names chunk_time_stamp(max_thread_num);
  622. std::vector<std::future<void>> tensor_future_vec;
  623. int begin = 0;
  624. int end = begin;
  625. for (int i = 0; i < max_thread_num; i++) {
  626. end += chunk_size;
  627. if (remainder > 0) {
  628. end++;
  629. remainder--;
  630. }
  631. (void)tensor_future_vec.emplace_back(std::async(
  632. std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
  633. &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
  634. &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
  635. &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id, error_on_no_value));
  636. begin = end;
  637. }
  638. SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
  639. watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
  640. &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
  641. &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
  642. root_graph_id);
  643. auto t2 = std::chrono::high_resolution_clock::now();
  644. std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  645. MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  646. MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
  647. }
  648. /*
  649. * Feature group: Offline debugger, Online debugger.
  650. * Target device group: Ascend, GPU.
  651. * Runtime category: Old runtime, MindRT.
  652. * Description: Sorts the result of watchpoint hit for the online and offline debugger. This sorting for the online
  653. * debugger is based on the execution order and for the offline debugger is based on the time stamp.
  654. */
  655. void DebugServices::SortWatchpointsInfo(
  656. std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  657. std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  658. std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  659. std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  660. std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  661. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  662. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  663. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  664. std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  665. partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  666. std::vector<unsigned int> *const root_graph_id) {
  667. for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
  668. (*tensor_future_vec)[i].wait();
  669. (*tensor_future_vec)[i].get();
  670. for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
  671. #ifdef ONLINE_DBG_MODE
  672. // if the execution order is repeated,inserts the new one before the others with same execution order.
  673. std::vector<int>::iterator iter =
  674. std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
  675. int position = iter - exec_order->begin();
  676. (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
  677. #endif
  678. #ifdef OFFLINE_DBG_MODE
  679. std::vector<std::string>::iterator iter =
  680. std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
  681. int position = iter - time_stamps->begin();
  682. (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
  683. #endif
  684. (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
  685. (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
  686. (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
  687. (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
  688. if (device_id != nullptr) {
  689. (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
  690. }
  691. if (root_graph_id != nullptr) {
  692. (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
  693. }
  694. (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
  695. (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
  696. }
  697. // free the memory for used vectors
  698. std::vector<int>().swap((*chunk_exec_orders)[i]);
  699. std::vector<std::string>().swap((*chunk_time_stamp)[i]);
  700. std::vector<std::string>().swap((*chunk_names)[i]);
  701. std::vector<std::string>().swap((*chunk_slots)[i]);
  702. std::vector<int>().swap((*chunk_conditions)[i]);
  703. std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
  704. std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
  705. std::vector<int32_t>().swap((*chunk_error_codes)[i]);
  706. std::vector<unsigned int>().swap((*chunk_device_id)[i]);
  707. std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
  708. (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  709. }
  710. }
  711. #ifdef OFFLINE_DBG_MODE
  712. /*
  713. * Feature group: Offline debugger.
  714. * Target device group: Ascend, GPU.
  715. * Runtime category: Old runtime, MindRT.
  716. * Description: Read tensor info from the given file. If memory control feature is configured to be enabled, it checks
  717. * if the tensor can fit in memory before reading. There are two situations to return false: 1)tensor size is greater
  718. * than the total preset memory limit. 2) Evicting all NOT-In-USE tensors from tensor_list_map_ cannot make enough room
  719. * for the tensor.
  720. */
void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
                                      std::string *const tensor_type, std::size_t *const size,
                                      std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
                                      bool *no_mem_to_read) {
  // Parses a .npy file: extracts the dtype string, shape and raw data bytes.
  // On any parse/IO failure it returns early, leaving the outputs untouched.
  std::ifstream infile;
  std::string file_path = file_name;
  MS_LOG(INFO) << "Reading in file: " << file_path;
  infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  if (!infile.is_open()) {
    MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
    const int kMaxFilenameLength = 128;
    char err_info[kMaxFilenameLength];
    auto ret = strerror_r(errno, err_info, sizeof(err_info));
    if (ret != kStrErrorNone) {
      MS_LOG(ERROR) << " ErrInfo:" << ret;
    }
    return;
  }
  // npy v1.x layout constants: magic string + version occupy the first 8 bytes,
  // followed by a 2-byte header length, then the ASCII header dict.
  const int substr_len = 2;
  const int header_len_offset = 8;
  const int header_offset = 9;
  const int header_len_buffer_size = 2;
  const int type_offset = 10;
  // get header length
  (void)infile.seekg(0, std::ios::beg);
  auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
    MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
    return;
  }
  // NOTE(review): the uint16 header length is read via reinterpret_cast, which
  // assumes a little-endian host (npy stores it little-endian) — confirm for new platforms.
  uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  header_len_buffer.reset();
  // read in header
  (void)infile.seekg(0, std::ios::beg);
  auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
    MS_LOG(ERROR) << "Failed to read header from " << file_path;
    return;
  }
  std::string header(header_buffer->data() + header_offset, header_len);
  header_buffer.reset();
  // The dtype is the two characters after "descr': '", e.g. "f4" — kind letter plus byte width.
  std::size_t type_i = header.find("descr") + type_offset;
  if (header.length() < type_i + substr_len) {
    MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
    return;
  }
  *tensor_type = header.substr(type_i, substr_len);
  // The shape is the comma-separated tuple between the first '(' and ')'.
  std::size_t shape_i_open = header.find("(");
  std::size_t shape_i_close = header.find(")");
  std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  std::string intermediate;
  std::stringstream check_shape(shape_str);
  MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  while (getline(check_shape, intermediate, ',')) {
    shape->push_back(std::stoi(intermediate));
  }
  // Word size is the numeric part of the dtype string (second character, e.g. '4' in "f4").
  std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  std::size_t data_size = data_len * word_size;
  if (!data_size) {
    return;
  }
  // Check memory available before loading tensor into host.
  bool has_enough_memory = true;
  if (tensor_loader_->EnableMemoryControl()) {
    has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  }
  if (!has_enough_memory) {
    MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
    *no_mem_to_read = true;
  } else {
    // Data starts right after the header: 8-byte preamble + 2-byte length + header_len.
    (void)infile.seekg(header_len + type_offset);
    // Ownership of the buffer is transferred to the caller through *data_buffer.
    *data_buffer = new std::vector<char>(data_size);
    if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
      MS_LOG(ERROR) << "Unable to get tensor data from npy";
    }
    *size = data_size;
  }
}
  800. /*
  801. * Feature group: Offline debugger.
  802. * Target device group: Ascend.
  803. * Runtime category: Old runtime, MindRT.
  804. * Description: This function is to convert files in each directory from device format to host format and append the
  805. * converted npy file name into AsyncFilePool. It's for Ascend async dump only.
  806. */
  807. void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list) {
  808. for (auto const &d : dir_to_files_map) {
  809. std::vector<std::string> files_to_convert_in_dir;
  810. std::vector<std::string> files_after_convert_in_dir;
  811. std::string dump_key = d.first;
  812. for (auto const &pair : d.second) {
  813. std::string file_name = pair.first;
  814. std::string file_name_without_scope = pair.second;
  815. // skip the file that was converted to npy already.
  816. if (std::all_of(result_list->begin(), result_list->end(), [&file_name_without_scope](std::string file_found) {
  817. return file_found.find(file_name_without_scope) == std::string::npos;
  818. })) {
  819. (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
  820. (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
  821. }
  822. }
  823. MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
  824. if (!files_to_convert_in_dir.empty()) {
  825. // Look for the installation path to the convert_async package. If not found, throw exception and terminate the
  826. // later task.
  827. {
  828. pybind11::gil_scoped_acquire acquire;
  829. try {
  830. auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
  831. auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
  832. (void)convert_obj.attr("convert_files")();
  833. } catch (pybind11::error_already_set &e) {
  834. MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
  835. }
  836. }
  837. ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list);
  838. }
  839. }
  840. }
  841. /*
  842. * Feature group: Offline debugger.
  843. * Target device group: Ascend.
  844. * Runtime category: Old runtime, MindRT.
  845. * Description: This function is to iterate through dump directory (dump_key) and search all the converted npy files and
  846. * append into AsyncFilePool. It's for Ascend async dump only.
  847. */
  848. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  849. const std::string &dump_key, AsyncFilePool *const result_list) {
  850. std::string real_dump_iter_dir = RealPath(dump_key);
  851. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  852. if (d_handle == nullptr) {
  853. MS_LOG(INFO) << "Directory does not exist in ConvertToHostFormat.";
  854. return;
  855. }
  856. struct dirent *dir = nullptr;
  857. while ((dir = readdir(d_handle)) != nullptr) {
  858. std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
  859. if (IsRegFile(name)) {
  860. std::string candidate = dir->d_name;
  861. for (const std::string &file_to_find : files_after_convert_in_dir) {
  862. std::string file_n = file_to_find;
  863. auto last_slash_pos = file_to_find.find_last_of("\\/");
  864. if (last_slash_pos != std::string::npos) {
  865. file_n = file_to_find.substr(last_slash_pos + 1);
  866. }
  867. if (candidate.find(file_n + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
  868. // we found a converted file for this op
  869. std::string found_file = dump_key + "/" + candidate;
  870. result_list->insert(found_file);
  871. }
  872. }
  873. }
  874. }
  875. (void)closedir(d_handle);
  876. }
/*
 * Feature group: Offline debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: A node name string is prefixed with its scope, separated by slashes "/", while the npy files in the
 * tensor dump path do not include the scope in their names. The objective of this function is to remove the scope
 * from the node name so that it matches the dumped file name.
 */
  885. std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  886. if (dump_style_name.empty()) {
  887. return "";
  888. }
  889. std::size_t last_scope_marker;
  890. std::string delim = "/";
  891. last_scope_marker = dump_style_name.rfind(delim);
  892. if (last_scope_marker == std::string::npos) {
  893. return dump_style_name;
  894. }
  895. return dump_style_name.substr(last_scope_marker + delim.size());
  896. }
  897. /*
  898. * Feature group: Offline debugger.
  899. * Target device group: Ascend.
  900. * Runtime category: Old runtime, MindRT.
  901. * Description: This function is to search and prepare the target npy file to be read for each node. If the found file
  902. * is already npy format, push it to AsyncFilePool; Otherwise, use conversion tool in convert_async.py to transfer it to
  903. * npy format beforehand.
  904. */
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id, AsyncFilePool *const result_list) {
  // For every requested tensor, locate its dump directory and queue any matching
  // device-format files for conversion; already-converted npy files go straight
  // into result_list. The actual conversion happens once, at the end.
  DirMap dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name ("<node>:<slot>" -> "<node>")
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
    // Dump layout: <dump_dir>/rank_<device>/<net_name>/<root_graph_id>/<iteration>
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // if node name is constant, skip
    if (prefix_dump_file_name.length() > (unsigned)strlen(constant_prefix) &&
        prefix_dump_file_name.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
      continue;
    }
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      // NOTE(review): a missing directory aborts the whole batch (return, not
      // continue), so later tensors are not processed — confirm this is intended.
      MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
      return;
    }
    ProcessConvertList(prefix_dump_file_name, specific_dump_dir, &dir_to_files_map, result_list);
    (void)closedir(d);
  }
  // Convert all collected device-format files in one pass.
  ConvertToHostFormat(dir_to_files_map, result_list);
}
  935. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  936. const std::string &specific_dump_dir, AsyncFilePool *const result_list) {
  937. DirMap dir_to_files_map;
  938. for (const auto &node : proto_dump) {
  939. std::string dump_name = std::get<1>(node);
  940. dump_name = dump_name.substr(0, dump_name.rfind("."));
  941. // search files in dir for the one that meets the filename prefix and read the file into memory
  942. std::string abspath = RealPath(specific_dump_dir);
  943. DIR *d = opendir(abspath.c_str());
  944. if (d == nullptr) {
  945. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
  946. return;
  947. }
  948. ProcessConvertList(dump_name, specific_dump_dir, &dir_to_files_map, result_list);
  949. (void)closedir(d);
  950. }
  951. ConvertToHostFormat(dir_to_files_map, result_list);
  952. }
  953. void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
  954. DirMap *dir_to_files_map, AsyncFilePool *const result_list) {
  955. MS_EXCEPTION_IF_NULL(dir_to_files_map);
  956. DIR *d = opendir(specific_dump_dir.c_str());
  957. struct dirent *dir = nullptr;
  958. while ((dir = readdir(d)) != nullptr) {
  959. std::string file_name = dir->d_name;
  960. std::string file_path = specific_dump_dir + std::string("/") + file_name;
  961. if (!IsRegFile(file_path)) {
  962. continue;
  963. }
  964. std::string file_name_w_o_perfix = file_name;
  965. auto type_pos = file_name.find('.');
  966. // adding dot to avoid problematic matching in the scope.
  967. if (type_pos == std::string::npos ||
  968. file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
  969. continue;
  970. }
  971. if (file_name.rfind(kNpyExt) == std::string::npos) {
  972. std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
  973. file_name_w_o_perfix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
  974. // if file matches prefix and is in device format add to candidate files to convert.
  975. (*dir_to_files_map)[specific_dump_dir].push_back(std::make_pair(file_name, file_name_w_o_perfix));
  976. } else {
  977. // otherwise, if file matches prefix and already has been converted to host format
  978. // add to result of converted files.
  979. result_list->insert(file_path);
  980. }
  981. }
  982. (void)closedir(d);
  983. }
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const AsyncFilePool &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  // For every dump node, find its slot numbers from the converted npy file names in
  // async_file_pool and create placeholder TensorData entries (no bytes loaded yet).
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    for (const std::string &file_name : async_file_pool) {
      // Work on the basename only; pool entries may carry a directory prefix.
      std::string file_name_to_check = file_name;
      auto delim = file_name.rfind("/");
      if (delim != std::string::npos) {
        file_name_to_check = file_name.substr(delim + 1);
      }
      // Locate ".<dump_name>." and the "output"/"input" marker; the slot number is the
      // token between the two dots that follow the marker.
      // NOTE(review): found_out is computed before the found != npos check below; the
      // npos arithmetic happens to yield npos again, but it relies on unsigned wraparound.
      std::size_t found = file_name_to_check.find("." + dump_name + ".");
      std::size_t found_out = file_name_to_check.find(output_str, found + dump_name.length());
      std::size_t found_dot_start = file_name_to_check.find(".", found_out);
      std::size_t found_dot_end = file_name_to_check.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(
          std::stoul(file_name_to_check.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      // Placeholder entry: no data pointer, zero size, empty type/shape until the
      // tensor is actually read from disk.
      tensor_data->SetDataPtr(nullptr);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
      tensor_list->push_back(tensor_data);
    }
  }
}
  1033. /*
  1034. * Feature group: Offline debugger.
  1035. * Target device group: Ascend, GPU.
  1036. * Runtime category: Old runtime, MindRT.
  1037. * Description: For the two possible modes (rank and graph), this function returns the rank_id or graph_id extracted
  1038. * from the given directory name otherwise, it returns UINT32_MAX to identify an invalid rank or graph id.
  1039. */
  1040. uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
  1041. std::regex re;
  1042. if (mode == "rank") {
  1043. re = "^rank_([0-9]+)$";
  1044. } else if (mode == "graph") {
  1045. re = "^([0-9]+)$";
  1046. }
  1047. std::smatch tokens;
  1048. if (regex_match(name, tokens, re)) {
  1049. return std::stoi(tokens[1]);
  1050. } else {
  1051. return UINT32_MAX;
  1052. }
  1053. }
  1054. std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
  1055. std::vector<uint32_t> rank_id_list;
  1056. std::string dump_dir = GetDumpDir();
  1057. DIR *d_handle = opendir(dump_dir.c_str());
  1058. if (d_handle == nullptr) {
  1059. MS_LOG(ERROR) << "Dump directory does not exist.";
  1060. return rank_id_list;
  1061. }
  1062. struct dirent *dir = nullptr;
  1063. while ((dir = readdir(d_handle)) != nullptr) {
  1064. struct stat st;
  1065. std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
  1066. int ret = stat(name.c_str(), &st);
  1067. if (ret != 0) {
  1068. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  1069. (void)closedir(d_handle);
  1070. return rank_id_list;
  1071. }
  1072. if (S_ISDIR(st.st_mode)) {
  1073. std::string rank_dir_name = dir->d_name;
  1074. uint32_t rank_id = GetRankOrGraphId("rank", rank_dir_name);
  1075. if (rank_id != UINT32_MAX) {
  1076. rank_id_list.push_back(rank_id);
  1077. }
  1078. }
  1079. }
  1080. (void)closedir(d_handle);
  1081. return rank_id_list;
  1082. }
  1083. /*
  1084. * Feature group: Offline debugger.
  1085. * Target device group: Ascend, GPU.
  1086. * Runtime category: Old runtime, MindRT.
  1087. * Description: Searches the current dump directory and for each rank_id in rank_id_list extracts the existing
  1088. * graph_ids. Then the history file is read for all the extracted graph_ids.
  1089. */
void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
  // For each rank, list the graph-id sub-directories under
  // <dump_dir>/rank_<id>/<net_name> and load the run history for every graph found.
  std::string net_name = GetNetName();
  std::string dump_dir = GetDumpDir();
  for (uint32_t rank_id : rank_id_list) {
    std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
    std::string abspath = RealPath(path);
    DIR *d_handle_rank = opendir(abspath.c_str());
    if (d_handle_rank == nullptr) {
      MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
      continue;
    }
    struct dirent *direc = nullptr;
    while ((direc = readdir(d_handle_rank)) != nullptr) {
      struct stat st;
      std::string name = abspath + std::string("/") + std::string(direc->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        // NOTE(review): a stat failure returns immediately, aborting the remaining
        // ranks as well (not just this one) — confirm this is intended.
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d_handle_rank);
        return;
      }
      if (S_ISDIR(st.st_mode)) {
        std::string graph_dir = direc->d_name;
        // Skip the current/parent directory entries.
        if (graph_dir == "." || graph_dir == "..") {
          continue;
        }
        // Directory names that are pure digit strings are graph ids.
        uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
        if (graph_id != UINT32_MAX) {
          ReadGraphsHistory(rank_id, graph_id);
        }
      }
    }
    (void)closedir(d_handle_rank);
  }
}
  1125. void DebugServices::SetGraphsHistory() {
  1126. // extract rank_id_list
  1127. std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
  1128. // for each rank_id extract the graph_id list and set the dump version
  1129. // and for each graph read the graph history file
  1130. CheckDumpGraphIdList(rank_id_list);
  1131. }
  1132. /*
  1133. * Feature group: Offline debugger.
  1134. * Target device group: Ascend, GPU.
  1135. * Runtime category: Old runtime, MindRT.
  1136. * Description: Reads the graph history file (containing iteration numbers in which the graph was executed) and stores
  1137. * the data in graphs_run_history_ for the given rank and graph id.
  1138. */
  1139. void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
  1140. std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
  1141. if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
  1142. // graph history was already stored for this rank_id and graph_id
  1143. return;
  1144. }
  1145. std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
  1146. std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
  1147. DIR *d_handle = opendir(exec_order_path.c_str());
  1148. if (d_handle == nullptr) {
  1149. MS_LOG(ERROR) << "Execution order directory does not exist.";
  1150. return;
  1151. }
  1152. // read file and store the info
  1153. std::string full_path = exec_order_path + "/" + file_to_check;
  1154. std::string checked_path = RealPath(full_path);
  1155. if (!checked_path.empty()) {
  1156. ReadGraphRunIter(checked_path, rank_and_graph);
  1157. }
  1158. (void)closedir(d_handle);
  1159. }
  1160. /*
  1161. * Feature group: Offline debugger.
  1162. * Target device group: Ascend, GPU.
  1163. * Runtime category: Old runtime, MindRT.
  1164. * Description: Returns a map with a tuple as the key (rank, graph) and a vector as the value. This vector contains a
  1165. * tuple with two elements, the first element is the node name and the second element is whether the node is output or
  1166. * not.
  1167. */
  1168. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
  1169. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
  1170. for (auto w_table_item : watchpoint_table_) {
  1171. auto wp = std::get<1>(w_table_item);
  1172. unsigned int index = 0;
  1173. for (auto check_node : wp.check_node_list) {
  1174. std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
  1175. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1176. // graph represents root_graph for Ascend and kernel_graph for GPU
  1177. for (auto rank : ranks) {
  1178. for (auto graph : graphs) {
  1179. std::tuple<uint32_t, uint32_t> key(rank, graph);
  1180. (rank_and_graph_to_nodes)[key].push_back(check_node);
  1181. }
  1182. }
  1183. index++;
  1184. }
  1185. }
  1186. return rank_and_graph_to_nodes;
  1187. }
  1188. /*
  1189. * Feature group: Offline debugger.
  1190. * Target device group: Ascend, GPU.
  1191. * Runtime category: Old runtime, MindRT.
  1192. * Description: For the given graph and rank id, reads the graph history file, stores all the run iterations for the
  1193. * graph in a vector and inserts it to graphs_run_history_ map.
  1194. */
  1195. void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
  1196. std::ifstream infile;
  1197. std::string line;
  1198. infile.open(file_path.c_str());
  1199. if (!infile.is_open()) {
  1200. MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
  1201. const int kMaxFilenameLength = NAME_MAX;
  1202. char err_info[kMaxFilenameLength];
  1203. if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
  1204. MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
  1205. }
  1206. return;
  1207. }
  1208. std::vector<uint32_t> run_iters_vec;
  1209. while (std::getline(infile, line)) {
  1210. uint32_t iter;
  1211. std::stringstream ss(line);
  1212. ss >> iter;
  1213. run_iters_vec.push_back(iter);
  1214. }
  1215. (void)graphs_run_history_.emplace(
  1216. std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
  1217. }
  1218. /*
  1219. * Feature group: Offline debugger.
  1220. * Target device group: Ascend, GPU.
  1221. * Runtime category: Old runtime, MindRT.
  1222. * Description: Creates a tensor_data object and sets its variables based on the function arguments and add the tensor
  1223. * to the tensor_list_map_.
  1224. */
  1225. void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
  1226. const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
  1227. const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
  1228. const std::string &type_name, const std::vector<int64_t> &shape,
  1229. std::vector<char> *buffer,
  1230. std::vector<std::shared_ptr<TensorData>> *const result_list) {
  1231. // call LoadNewTensor to store tensor in internal cache
  1232. auto tensor_data = std::make_shared<TensorData>();
  1233. tensor_data->SetName(backend_name);
  1234. tensor_data->SetExecutionOrder(0);
  1235. tensor_data->SetSlot(slot);
  1236. tensor_data->SetIteration(iteration);
  1237. tensor_data->SetDeviceId(device_id);
  1238. tensor_data->SetRootGraphId(root_graph_id);
  1239. tensor_data->SetIsOutput(is_output);
  1240. if (buffer != nullptr) {
  1241. tensor_data->SetDataPtr(buffer->data());
  1242. } else {
  1243. tensor_data->SetDataPtr(nullptr);
  1244. }
  1245. tensor_data->SetByteSize(data_size);
  1246. tensor_data->SetType(type_name);
  1247. tensor_data->SetShape(shape);
  1248. tensor_data->SetTimeStamp(time_stamp);
  1249. tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  1250. if (data_size) {
  1251. (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  1252. }
  1253. // add to result_list
  1254. result_list->push_back(tensor_data);
  1255. }
  1256. /*
  1257. * Feature group: Offline debugger.
  1258. * Target device group: Ascend, GPU.
  1259. * Runtime category: Old runtime, MindRT.
  1260. * Description: Generate a string in format of {no-scope-op-name}.{input-output}.{slot} to check and match files to
  1261. * read.
  1262. */
  1263. void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
  1264. std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
  1265. std::string dump_style_name_part = *dump_style_kernel_name;
  1266. dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
  1267. std::string slot_str;
  1268. if (is_output) {
  1269. slot_str = ".output." + std::to_string(slot);
  1270. } else {
  1271. slot_str = ".input." + std::to_string(slot);
  1272. }
  1273. dump_style_name_part += slot_str;
  1274. *prefix_dump_file_name = dump_style_name_part;
  1275. *slot_string_to_check = slot_str;
  1276. }
  1277. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  1278. // get file with the newest timestamp from the list.
  1279. if (file_list.empty()) {
  1280. return "";
  1281. }
  1282. std::sort(file_list.begin(), file_list.end());
  1283. return file_list.back();
  1284. }
  1285. std::string GetTimeStampStr(std::string file_path) {
  1286. // get the file_name from file_path.
  1287. size_t pos = file_path.rfind("/");
  1288. std::string file_name = file_path.substr(pos + 1);
  1289. size_t first_dot = file_name.rfind(".");
  1290. size_t second_dot = file_name.rfind(".", first_dot - 1);
  1291. size_t third_dot = file_name.rfind(".", second_dot - 1);
  1292. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  1293. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  1294. if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
  1295. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  1296. return time_stamp;
  1297. }
  1298. return "";
  1299. }
/*
 * Feature group: Offline debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Search files in dir (sync mode) or in AsyncFilePool (async mode) for the one that meets the filename
 * prefix and read the file into memory.
 */
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const AsyncFilePool &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *const result_list,
                                     bool *no_mem_to_read) {
  // All argument vectors are parallel: entry i of each describes one requested tensor.
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name ("op:slot" -> "op")
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    std::string specific_dump_dir;
    bool is_cst = false;
    // Builds "{op}.{input|output}.{slot}" and the ".{input|output}.{slot}" suffix.
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    // prefix_dump_to_check is node name used to find corresponding dump file
    std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
    // if node name has prefix of "Default--data-", consider as constant, search in cst folder
    if (prefix_dump_to_check.length() > (unsigned)strlen(constant_prefix) &&
        prefix_dump_to_check.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/constants";
      is_cst = true;
      // Constant dump files carry the name without the "Default--" scope prefix; strip it for matching.
      const std::string prefix = "Default--";
      prefix_dump_file_name = prefix_dump_file_name.substr(prefix.length());
      prefix_dump_to_check = prefix_dump_to_check.substr(prefix.length());
    } else {
      // Regular tensors live under the per-iteration directory.
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    }
    MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
    // Constants always take the sync read path, even in async dump mode.
    if (is_sync_mode_ || is_cst) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
                            no_mem_to_read);
    }
  }
}
  1350. /*
  1351. * Feature group: Offline debugger.
  1352. * Target device group: Ascend, GPU.
  1353. * Runtime category: Old runtime, MindRT.
  1354. * Description: For both sync and async dump, gets the newest matched file path and reads the npy file and add the
  1355. * tenosr_data object to tensor_list_map_. If there is no matched file, an empty tensor_data object is created with
  1356. * data_size = 0, empty shape and nullptr buffer.
  1357. */
  1358. void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
  1359. const std::string &backend_name, const unsigned int device_id,
  1360. const unsigned int root_graph_id, const bool &is_output, size_t slot,
  1361. bool *no_mem_to_read, unsigned int iteration,
  1362. std::vector<std::shared_ptr<TensorData>> *result_list) {
  1363. std::string time_stamp = "";
  1364. std::string type_name = "";
  1365. size_t data_size = 0;
  1366. std::vector<int64_t> shape;
  1367. std::vector<char> *buffer = nullptr;
  1368. if (found) {
  1369. std::string result_path = GetNewestFilePath(matched_paths);
  1370. time_stamp = GetTimeStampStr(result_path);
  1371. std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
  1372. std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
  1373. std::to_string(slot);
  1374. ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
  1375. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
  1376. type_name, shape, buffer, result_list);
  1377. } else {
  1378. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
  1379. buffer, result_list);
  1380. MS_LOG(INFO) << "Target tensor has not been found.";
  1381. }
  1382. }
  1383. /*
  1384. * Feature group: Offline debugger.
  1385. * Target device group: Ascend, GPU.
  1386. * Runtime category: Old runtime, MindRT.
  1387. * Description: Looks for the files that match the node_name (in the dump directory) for sync dump, read the newest file
  1388. * and add the related tensor_data object.
  1389. */
  1390. void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
  1391. const std::string &backend_name, size_t slot, const unsigned int device_id,
  1392. unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
  1393. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1394. std::string abspath = RealPath(specific_dump_dir);
  1395. DIR *d = opendir(abspath.c_str());
  1396. bool found_file = false;
  1397. std::vector<std::string> matched_paths;
  1398. if (d == nullptr) {
  1399. MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  1400. } else {
  1401. struct dirent *dir = nullptr;
  1402. while ((dir = readdir(d)) != nullptr) {
  1403. std::string file_name = dir->d_name;
  1404. std::string file_path = abspath + std::string("/") + file_name;
  1405. if (IsRegFile(file_path)) {
  1406. std::string stripped_file_name = GetStrippedFilename(file_name);
  1407. if (stripped_file_name.empty()) {
  1408. continue;
  1409. }
  1410. std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
  1411. if (found != 0) {
  1412. continue;
  1413. }
  1414. matched_paths.push_back(file_path);
  1415. found_file = true;
  1416. }
  1417. }
  1418. (void)closedir(d);
  1419. }
  1420. ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
  1421. no_mem_to_read, iteration, result_list);
  1422. }
  1423. /*
  1424. * Feature group: Offline debugger.
  1425. * Target device group: Ascend.
  1426. * Runtime category: Old runtime, MindRT.
  1427. * Description: Iterates through all the file paths in the async_file_pool and looks for the files that match the
  1428. * node_name for async dump, read the newest file and add the related tensor_data object.
  1429. */
  1430. void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  1431. const std::string &slot_string_to_check, const std::string &backend_name,
  1432. size_t slot, unsigned int device_id, unsigned int iteration,
  1433. unsigned int root_graph_id, const bool &is_output,
  1434. const AsyncFilePool &async_file_pool,
  1435. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1436. bool found = false;
  1437. std::vector<std::string> matched_paths;
  1438. // if async mode
  1439. for (const std::string &file_path : async_file_pool) {
  1440. std::string file_name_to_check = file_path;
  1441. auto delim = file_path.rfind("/");
  1442. if (delim != std::string::npos) {
  1443. file_name_to_check = file_path.substr(delim + 1);
  1444. }
  1445. if (file_path.find(specific_dump_dir) != std::string::npos &&
  1446. file_name_to_check.find("." + prefix_dump_to_check + ".") != std::string::npos &&
  1447. file_name_to_check.find(slot_string_to_check + ".") != std::string::npos) {
  1448. matched_paths.push_back(file_path);
  1449. found = true;
  1450. }
  1451. }
  1452. ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
  1453. iteration, result_list);
  1454. }
/*
 * Feature group: Offline debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Obtain opname, output_str and slot from the npy file. Make sure its return value is the same as
 * SetPrefixToCheck(). The input/output examples look like:
 * input: {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{output_or_input_string}.{slot}.{format}.npy
 * output: {op_name}.{output_or_input_string}.{slot}
 */
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  // Dots after {op_name} are located from the BACK of the string because the
  // op name itself may contain dots; only the first dot (end of {op_type}) is
  // safe to search from the front. Variable names refer to dot positions in
  // the canonical 8-dot layout above.
  size_t first_dot = file_name.find(".");
  // seventh_dot: the dot before {format}; fifth_dot: the dot before
  // {output_or_input_string} (both reached by stepping back from the end).
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  // Three backward steps from fifth_dot land on the dot that ends {op_name}.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  if (second_dot == std::string::npos || second_dot <= first_dot) {
    return std::string();
  }
  // start_string = {op_name}; end_string = ".{output_or_input_string}.{slot}"
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
  1486. /*
  1487. * Feature group: Offline debugger.
  1488. * Target device group: Ascend, GPU.
  1489. * Runtime category: Old runtime, MindRT.
  1490. * Description: Gets a list of the nodes that should be monitored, creates a vector called proto_to_dump with nodes'
  1491. * original names and dump style names. Then, for each node, it creates an empty tensor_data object with data_byte_size
  1492. * = 0 and data_ptr = nullptr and add it to the tensor_list (for both sync and async dump). This tensor_list is used for
  1493. * checkwatchpoint functions.
  1494. */
  1495. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(unsigned int iteration,
  1496. AsyncFilePool *const async_file_pool,
  1497. bool error_on_no_value) {
  1498. // get a list of nodes and the devices they are on to monitor
  1499. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1500. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
  1501. GetAllWpNodes();
  1502. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  1503. // as they are found
  1504. for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
  1505. std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
  1506. uint32_t rank_id = std::get<0>(rank_and_graph);
  1507. uint32_t root_graph_id = std::get<1>(rank_and_graph);
  1508. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
  1509. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  1510. std::string real_dump_dir = RealPath(specific_dump_dir);
  1511. if (real_dump_dir.empty()) {
  1512. MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skit it.";
  1513. continue;
  1514. }
  1515. std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
  1516. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  1517. // convert node names to dump style
  1518. for (auto node : wp_nodes) {
  1519. std::string orig_name = std::get<0>(node);
  1520. // Remove the scope from the fully qualified name to compare for both sync and async case.
  1521. std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
  1522. bool node_is_out = std::get<1>(node);
  1523. if (node_is_out) {
  1524. dump_style_name += ".output";
  1525. } else {
  1526. dump_style_name += ".input";
  1527. }
  1528. if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
  1529. std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
  1530. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  1531. }
  1532. }
  1533. if (is_sync_mode_) {
  1534. // search files in dir for the one that meets the filename prefix and read the file into memory
  1535. ProcessTensorDataSync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
  1536. error_on_no_value);
  1537. } else {
  1538. // convert all files in proto_to_dump to npy and add to pool of async file names
  1539. ConvertWatchPointNodes(proto_to_dump, real_dump_dir, async_file_pool);
  1540. GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
  1541. &tensor_list);
  1542. }
  1543. }
  1544. return tensor_list;
  1545. }
  1546. /*
  1547. * Feature group: Offline debugger.
  1548. * Target device group: Ascend, GPU.
  1549. * Runtime category: Old runtime, MindRT.
  1550. * Description: Iterates through the dump directory and for each file it looks for a match in the file name with node
  1551. * names in proto_to_dump vector.
  1552. */
  1553. void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
  1554. const std::string &specific_dump_dir, unsigned int iteration,
  1555. unsigned int device_id, unsigned int root_graph_id,
  1556. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  1557. bool error_on_no_value) {
  1558. DIR *d = opendir(specific_dump_dir.c_str());
  1559. if (d == nullptr) {
  1560. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ProcessTensorDataSync.";
  1561. } else {
  1562. struct dirent *dir = nullptr;
  1563. while ((dir = readdir(d)) != nullptr) {
  1564. std::string file_name = dir->d_name;
  1565. std::string file_path = specific_dump_dir + std::string("/") + file_name;
  1566. if (IsRegFile(file_path)) {
  1567. for (auto &node : proto_to_dump) {
  1568. std::string dump_name = std::get<1>(node);
  1569. std::string stripped_file_name = GetStrippedFilename(file_name);
  1570. if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
  1571. continue;
  1572. }
  1573. std::size_t found = stripped_file_name.rfind(dump_name + ".", 0);
  1574. if (found == 0) {
  1575. size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
  1576. std::vector<int64_t> shape;
  1577. std::string orig_name = std::get<0>(node);
  1578. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  1579. bool output_flag = (output_str == "output");
  1580. AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
  1581. nullptr, tensor_list);
  1582. break;
  1583. }
  1584. }
  1585. }
  1586. }
  1587. (void)closedir(d);
  1588. }
  1589. }
  1590. std::string DebugServices::IterationString(unsigned int iteration) {
  1591. std::string iteration_string;
  1592. bool init_dbg_suspend = (iteration == UINT_MAX);
  1593. if (init_dbg_suspend) {
  1594. iteration_string = "init";
  1595. } else {
  1596. iteration_string = std::to_string(iteration);
  1597. }
  1598. return iteration_string;
  1599. }
  1600. #endif
  1601. /*
  1602. * Feature group: Online debugger.
  1603. * Target device group: Ascend, GPU.
  1604. * Runtime category: Old runtime, MindRT.
  1605. * Description: Searches for tensor in the loaded tensors, if the tensor is found and tensor's root_graph_id is equal to
  1606. * current root_graph_id, it updates the given vectors.
  1607. */
  1608. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1609. std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1610. std::vector<unsigned int> *const dtype,
  1611. std::vector<std::vector<int64_t>> *const shape) {
  1612. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1613. tensor_loader_->SearchTensors(name, &result_list);
  1614. for (auto result : result_list) {
  1615. if (std::get<1>(result) == nullptr) {
  1616. continue;
  1617. }
  1618. #ifdef ONLINE_DBG_MODE
  1619. if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
  1620. MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
  1621. << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId()
  1622. << ".";
  1623. MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
  1624. }
  1625. #endif
  1626. (void)ret_name->emplace_back(std::get<0>(result));
  1627. (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
  1628. (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
  1629. (void)dtype->emplace_back(std::get<1>(result)->GetType());
  1630. (void)shape->emplace_back(std::get<1>(result)->GetShape());
  1631. }
  1632. }
// Thin guard around TensorLoader::SearchTensors: a null output vector is
// logged and ignored so callers don't have to pre-check.
void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
                                       std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  if (result_list == nullptr) {
    MS_LOG(DEBUG) << "result_list is nullptr.";
    return;
  }
  tensor_loader_->SearchTensors(name, result_list);
}
  1641. #ifdef ONLINE_DBG_MODE
  1642. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1643. bool ret = false;
  1644. for (auto w_table_item : watchpoint_table_) {
  1645. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1646. for (auto check_node : check_node_list) {
  1647. std::string w_name = std::get<0>(check_node);
  1648. bool w_type = std::get<1>(check_node);
  1649. if ((w_type == true &&
  1650. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1651. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1652. ret = true;
  1653. return ret;
  1654. }
  1655. }
  1656. }
  1657. return ret;
  1658. }
  1659. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1660. if (kernel != nullptr && w_name.length() > 0) {
  1661. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  1662. for (size_t j = 0; j < input_size; ++j) {
  1663. auto input_kernel = kernel->input(j + 1);
  1664. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1665. auto found = w_name.find_last_of('/');
  1666. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1667. return true;
  1668. }
  1669. return false;
  1670. } else {
  1671. return false;
  1672. }
  1673. }
  1674. #endif
// Returns all tensors currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Returns the cached tensor registered under tensor_name.
// NOTE(review): presumably nullptr when absent — confirm TensorLoader's contract.
std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
  return tensor_loader_->GetTensor(tensor_name);
}
// Drops all tensors from the loader's current-iteration cache.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1680. #ifdef ONLINE_DBG_MODE
// Forwards to TensorLoader: writes the named tensor (given slot) to filepath.
// host_fmt/host_shape/host_type/device_type/addr_format describe the desired
// host layout vs. the on-device layout; trans_flag requests format transformation.
// NOTE(review): return value presumably indicates success — confirm in tensor loader.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  1688. #endif
// Forwards to TensorLoader: registers `tensor` in the internal cache.
// NOTE(review): keep_prev presumably retains the previous iteration's copy of
// the same tensor — confirm against TensorLoader::LoadNewTensor.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
  1692. /*
  1693. * Feature group: Offline debugger.
  1694. * Target device group: Ascend, GPU.
  1695. * Runtime category: Old runtime, MindRT.
  1696. * Description: Returns the previous iteration in which tensor's graph was executed, if the current step is the first
  1697. * run iteration for the graph or graph history file is not available it returns UINT32_MAX to identify invalid
  1698. * prev_iteration.
  1699. */
  1700. uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  1701. uint32_t prev_iter;
  1702. uint32_t rank_id = tensor->GetDeviceId();
  1703. uint32_t root_graph_id = tensor->GetRootGraphId();
  1704. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  1705. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  1706. return UINT32_MAX;
  1707. }
  1708. auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
  1709. tensor->GetIteration());
  1710. if (it == graphs_run_history_[rank_and_graph].end()) {
  1711. // The graph is not executed in that iteration
  1712. return UINT32_MAX;
  1713. } else if (it == graphs_run_history_[rank_and_graph].begin()) {
  1714. // current iteration is the first iteration that the graph was run
  1715. // no prev iter is available
  1716. MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
  1717. << " is the first run iteration for tensor: " << tensor->GetName();
  1718. return UINT32_MAX;
  1719. }
  1720. it--;
  1721. prev_iter = *it;
  1722. tensor->SetPrevIteration(prev_iter);
  1723. return prev_iter;
  1724. }
// Resets per-iteration debugger caches between steps: clears the watchpoint-id
// cache, preserves parameters by moving them current->prev before emptying the
// current tensor map, then swaps prev back into current. Cached overflow-op
// results are also discarded so the next step re-scans the overflow directory.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1734. #ifdef ONLINE_DBG_MODE
  1735. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1736. MS_EXCEPTION_IF_NULL(kernel);
  1737. std::vector<std::shared_ptr<TensorData>> result;
  1738. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1739. auto kernel_name = GetKernelNodeName(kernel);
  1740. for (size_t j = 0; j < output_size; ++j) {
  1741. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1742. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1743. if (tensor != nullptr) {
  1744. result.push_back(tensor);
  1745. }
  1746. }
  1747. return result;
  1748. }
  1749. #endif
  1750. std::string GetOnlineOpOverflowDir() {
  1751. // only called for online debugger mode
  1752. // get operator overflow directory for current iteration
  1753. std::string overflow_bin_path = "";
  1754. #ifdef ONLINE_DBG_MODE
  1755. if (DumpJsonParser::GetInstance().path().empty()) {
  1756. MS_LOG(INFO) << "Dump config is not set.";
  1757. return "";
  1758. }
  1759. auto debugger = Debugger::GetInstance();
  1760. MS_EXCEPTION_IF_NULL(debugger);
  1761. auto cur_graph = debugger->GetGraphPtr();
  1762. if (cur_graph == nullptr) {
  1763. return "";
  1764. }
  1765. overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(cur_graph->root_graph_id());
  1766. auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
  1767. if (!realpath.has_value()) {
  1768. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1769. return "";
  1770. }
  1771. overflow_bin_path = realpath.value() + '/';
  1772. #endif
  1773. return overflow_bin_path;
  1774. }
  1775. void DebugServices::AddOpOverflowOpNames(const std::string &overflow_bin_path, std::vector<std::string> *op_names) {
  1776. MS_EXCEPTION_IF_NULL(op_names);
  1777. std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  1778. std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  1779. const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  1780. MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  1781. DIR *d = opendir(overflow_bin_path.c_str());
  1782. if (d == nullptr) {
  1783. MS_LOG(INFO) << "OverFlow bin directory does not exist!";
  1784. } else {
  1785. struct dirent *dir = nullptr;
  1786. while ((dir = readdir(d)) != nullptr) {
  1787. std::string file_name = dir->d_name;
  1788. std::string file_path = overflow_bin_path + std::string("/") + file_name;
  1789. if (IsRegFile(file_path)) {
  1790. // attempt to read the file
  1791. std::ifstream infile;
  1792. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  1793. if (!infile.is_open()) {
  1794. MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
  1795. continue;
  1796. }
  1797. std::string node_name;
  1798. uint64_t task_id = 0;
  1799. uint64_t stream_id = 0;
  1800. // detect overflow bin file
  1801. if (file_name.rfind(overflow_file_prefix, 0) == 0) {
  1802. if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
  1803. continue;
  1804. }
  1805. MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
  1806. << ".";
  1807. task_stream_hit.push_back(std::make_pair(task_id, stream_id));
  1808. } else {
  1809. // regular bin file or npy file
  1810. bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
  1811. if (success_parse) {
  1812. task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
  1813. }
  1814. }
  1815. infile.close();
  1816. }
  1817. }
  1818. (void)closedir(d);
  1819. }
  1820. // find the op_names with an overflow hit
  1821. for (auto &task_stream : task_stream_hit) {
  1822. auto op_name = task_stream_to_opname[task_stream];
  1823. if (!op_name.empty()) {
  1824. MS_LOG(INFO) << "Operation overflow detected in " << op_name;
  1825. op_names->push_back(op_name);
  1826. }
  1827. }
  1828. }
  1829. /*
  1830. * Feature group: Online debugger, Offline debugger.
  1831. * Target device group: Ascend.
  1832. * Runtime category: Old runtime, MindRT.
  1833. * Description: Checks whether for the given node the operator overflow happened or not by checking the overflow
  1834. * directory. This function is for async mode only.
  1835. */
  1836. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1837. unsigned int iteration) {
  1838. if (is_sync_mode_) {
  1839. return false;
  1840. }
  1841. std::string overflow_bin_path = "";
  1842. #ifdef ONLINE_DBG_MODE
  1843. overflow_bin_path = GetOnlineOpOverflowDir();
  1844. #else
  1845. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1846. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1847. overflow_bin_path = RealPath(overflow_bin_path);
  1848. #endif
  1849. if (overflow_bin_path.empty()) {
  1850. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1851. return false;
  1852. }
  1853. // remove kernel_graph_#
  1854. std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
  1855. std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
  1856. // remove path
  1857. size_t last_slash = node_name_to_find.rfind("/");
  1858. std::string op_name_find = "";
  1859. if (last_slash != std::string::npos) {
  1860. op_name_find = node_name_to_find.substr(last_slash + 1);
  1861. }
  1862. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1863. std::vector<std::string> op_names;
  1864. overflow_wp_lock_.lock();
  1865. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1866. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1867. if (found_overflows != overflow_ops_.end()) {
  1868. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1869. op_names = overflow_ops_[overflow_bin_path];
  1870. } else {
  1871. AddOpOverflowOpNames(overflow_bin_path, &op_names);
  1872. overflow_ops_[overflow_bin_path] = op_names;
  1873. }
  1874. overflow_wp_lock_.unlock();
  1875. // determine if overflow wp has been triggered for the op name with path (from bin file)
  1876. if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
  1877. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1878. return true;
  1879. }
  1880. // determine if overflow wp has been triggered for the op name (from npy file)
  1881. if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
  1882. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1883. return true;
  1884. }
  1885. return false;
  1886. }
  1887. std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) {
  1888. std::string op_name_to_find = node_name_to_find;
  1889. const std::string kernel_prefix = "kernel_graph_";
  1890. if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
  1891. auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
  1892. if (start_of_op_name != std::string::npos) {
  1893. op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
  1894. }
  1895. }
  1896. return op_name_to_find;
  1897. }
  1898. bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *task_id,
  1899. uint64_t *stream_id) {
  1900. size_t task_pos_start = overflow_file_prefix.length();
  1901. size_t task_pos_end = file_name.find(".", task_pos_start);
  1902. if (task_pos_end == std::string::npos) {
  1903. MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
  1904. return false;
  1905. }
  1906. size_t stream_pos_start = task_pos_end + 1;
  1907. size_t stream_pos_end = file_name.find(".", stream_pos_start);
  1908. if (stream_pos_end == std::string::npos) {
  1909. MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
  1910. return false;
  1911. }
  1912. std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
  1913. std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
  1914. *task_id = std::stoull(task_id_str);
  1915. *stream_id = std::stoull(stream_id_str);
  1916. return true;
  1917. }
bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
                                         uint64_t *stream_id) {
  // get the node_name, task_id, and stream_id from dump filename in the following two formats:
  // 1. bin file: node_type.node_name.task_id.stream_id.timestamp
  // 2. npy file: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
  // Please note that node_name might contain dot (i.e. Parameter). So to search for the location of second dot, we need
  // to search the file name from right to left.
  size_t first_dot = file_name.find(".");
  size_t fourth_dot;
  if (file_name.rfind(kNpyExt) != std::string::npos) {
    // npy format file (converted file or A+M dump file)
    // step back over ".output_input.slot.format.npy" (four dots from the end)
    // to land on the dot that precedes the timestamp
    size_t pos = file_name.rfind(".");
    const int kFourthFromRight = 4;
    for (int cnt = 0; cnt < kFourthFromRight; cnt++) {
      pos = file_name.rfind(".", pos - 1);
    }
    fourth_dot = pos;
  } else {
    // bin format file: the final dot precedes the timestamp
    fourth_dot = file_name.rfind(".");
  }
  // walk two more dots leftwards to bracket the task_id and stream_id fields
  size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  size_t second_dot = file_name.rfind(".", third_dot - 1);
  // check if dots were found
  // NOTE(review): when an rfind above returns npos, `pos - 1` wraps around, but
  // rfind clamps an out-of-range start position to the string length, so a
  // malformed name still funnels into the npos / ordering checks below.
  if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
      fourth_dot == std::string::npos) {
    return false;
  }
  // get node_name (between the first and second dot; may itself contain dots)
  if (first_dot < second_dot) {
    *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  } else {
    MS_LOG(ERROR) << "filename parse error to get node_name.";
    return false;
  }
  // get task id (between the second and third dot)
  if (second_dot < third_dot) {
    std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
    try {
      *task_id = std::stoull(extracted_task_id);
    } catch (std::invalid_argument &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
      return false;
    } catch (std::out_of_range &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get task_id.";
    return false;
  }
  // get stream id (between the third and fourth dot)
  if (third_dot < fourth_dot) {
    std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
    try {
      *stream_id = std::stoull(extracted_stream_id);
    } catch (std::invalid_argument &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
      return false;
    } catch (std::out_of_range &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get stream_id.";
    return false;
  }
  return true;
}
  1987. std::string DebugServices::RealPath(const std::string &input_path) {
  1988. if (input_path.length() >= PATH_MAX) {
  1989. MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  1990. }
  1991. size_t path_split_pos = input_path.find_last_of('/');
  1992. // get real path
  1993. char real_path[PATH_MAX] = {0};
  1994. // input_path is dir + file_name
  1995. if (path_split_pos != std::string::npos) {
  1996. std::string prefix_path = input_path.substr(0, path_split_pos);
  1997. std::string file_name = input_path.substr(path_split_pos);
  1998. if (file_name.length() > NAME_MAX) {
  1999. MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
  2000. }
  2001. if (realpath(prefix_path.c_str(), real_path) == nullptr) {
  2002. MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
  2003. return "";
  2004. }
  2005. return std::string(real_path) + file_name;
  2006. }
  2007. // input_path is only file_name
  2008. if (input_path.length() > NAME_MAX) {
  2009. MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  2010. }
  2011. if (realpath(input_path.c_str(), real_path) == nullptr) {
  2012. MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  2013. }
  2014. return std::string(real_path);
  2015. }
  2016. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  2017. #if defined(__APPLE__)
  2018. return *reinterpret_cast<const uint64_t *>(buffer.data());
  2019. #else
  2020. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  2021. #endif
  2022. }
// Thin delegates to the underlying tensor loader.
// True if tensor_name is present in the loader's current-iteration tensor map.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Moves the named tensor from the current map to the previous map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Queues the named tensor for cache eviction, but only when the loader's
// memory-control (cache size limit) feature is enabled.
void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  if (tensor_loader_->EnableMemoryControl()) {
    tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  }
}
// Simple accessors for offline-debugger configuration state
// (network name, dump directory root, sync/async dump mode, memory limit).
void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
std::string DebugServices::GetNetName() { return net_name_; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir_; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode_; }
// Forwards the cache memory budget to the tensor loader.
void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  2041. #ifdef ONLINE_DBG_MODE
  2042. } // namespace mindspore
  2043. #endif