You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 88 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include <regex>
  29. #include "pybind11/embed.h"
  30. #include "pybind11/stl.h"
  31. #ifdef ONLINE_DBG_MODE
  32. #include "debug/common.h"
  33. #include "debug/debugger/debugger.h"
  34. #include "debug/anf_ir_utils.h"
  35. #include "backend/session/anf_runtime_algorithm.h"
  36. #endif
  37. #include "nlohmann/json.hpp"
  38. #include "debug/debugger/tensor_summary.h"
  39. #include "utils/file_utils.h"
  40. #include "climits"
  41. #ifdef ONLINE_DBG_MODE
  42. namespace mindspore {
  43. #endif
// Prefix identifying constant-value dump tensors.
static constexpr const char *constant_prefix = "Default--data-";
namespace {
#ifdef __APPLE__
// On macOS the XSI strerror_r returns an int, 0 meaning success.
constexpr int kStrErrorNone = 0;
#else
// Elsewhere the GNU strerror_r returns a char*; nullptr is the "no error" sentinel.
// NOTE(review): presumably compared against a strerror_r result later in this file — confirm.
constexpr char *kStrErrorNone = nullptr;
#endif
}  // namespace
  52. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
// Copy constructor: duplicates watchpoint bookkeeping and dump settings.
// Note that tensor_loader_ is a shared_ptr copy — both instances share the
// same underlying TensorLoader, not a deep copy.
DebugServices::DebugServices(const DebugServices &other) {
  wp_id_cache_ = other.wp_id_cache_;
  net_name_ = other.net_name_;
  dump_dir_ = other.dump_dir_;
  is_sync_mode_ = other.is_sync_mode_;
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table_ = other.watchpoint_table_;
}
// Copy assignment: copies only the shared tensor loader and the watchpoint
// table. NOTE(review): unlike the copy constructor this does not copy
// wp_id_cache_, net_name_, dump_dir_ or is_sync_mode_ — the asymmetry looks
// intentional but is worth confirming.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {  // self-assignment guard
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
  68. void DebugServices::AddWatchpoint(
  69. unsigned int id, unsigned int watch_condition, float parameter,
  70. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  71. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  72. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  73. std::lock_guard<std::mutex> lg(lock_);
  74. watchpoint_t watchpoint_item;
  75. watchpoint_item.id = id;
  76. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  77. watchpoint_item.condition.parameter = parameter;
  78. watchpoint_item.check_node_list = check_node_list;
  79. if (check_node_device_list != nullptr) {
  80. watchpoint_item.check_node_device_list = *check_node_device_list;
  81. }
  82. if (check_node_graph_list != nullptr) {
  83. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  84. }
  85. watchpoint_item.parameter_list = parameter_list;
  86. watchpoint_table_[id] = watchpoint_item;
  87. }
  88. void DebugServices::RemoveWatchpoint(unsigned int id) {
  89. std::lock_guard<std::mutex> lg(lock_);
  90. (void)watchpoint_table_.erase(id);
  91. }
  92. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  93. const void *const previous_tensor_ptr, uint32_t num_elements,
  94. uint32_t prev_num_elements, int tensor_dtype) {
  95. MS_EXCEPTION_IF_NULL(tensor);
  96. switch (tensor_dtype) {
  97. case DbgDataType::DT_UINT8: {
  98. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  99. prev_num_elements);
  100. }
  101. case DbgDataType::DT_INT8: {
  102. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  103. prev_num_elements);
  104. }
  105. case DbgDataType::DT_UINT16: {
  106. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  107. prev_num_elements);
  108. }
  109. case DbgDataType::DT_INT16: {
  110. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  111. prev_num_elements);
  112. }
  113. case DbgDataType::DT_UINT32: {
  114. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  115. prev_num_elements);
  116. }
  117. case DbgDataType::DT_INT32:
  118. case DbgDataType::DT_BASE_INT: {
  119. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  120. prev_num_elements);
  121. }
  122. case DbgDataType::DT_UINT64: {
  123. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  124. prev_num_elements);
  125. }
  126. case DbgDataType::DT_INT64: {
  127. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  128. prev_num_elements);
  129. }
  130. case DbgDataType::DT_FLOAT16: {
  131. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  132. prev_num_elements);
  133. }
  134. case DbgDataType::DT_FLOAT32:
  135. case DbgDataType::DT_BASE_FLOAT: {
  136. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  137. prev_num_elements);
  138. }
  139. case DbgDataType::DT_FLOAT64: {
  140. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  141. prev_num_elements);
  142. }
  143. case DbgDataType::DT_BOOL: {
  144. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  145. prev_num_elements);
  146. }
  147. default:
  148. MS_LOG(INFO) << "Unsupported tensor type";
  149. // return a null pointer
  150. return std::unique_ptr<TensorSummary<int32_t>>{};
  151. }
  152. }
  153. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  154. if (tensor == nullptr) {
  155. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  156. TensorStat empty_tensor_stat_data;
  157. return empty_tensor_stat_data;
  158. }
  159. std::unique_ptr<ITensorSummary> base_summary_ptr;
  160. void *previous_tensor_ptr = nullptr;
  161. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  162. if (base_summary_ptr == nullptr) {
  163. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  164. TensorStat empty_tensor_stat_data;
  165. return empty_tensor_stat_data;
  166. }
  167. base_summary_ptr->TensorStatistics(tensor->GetType());
  168. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  169. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  170. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  171. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  172. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  173. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  174. return tensor_stat_data;
  175. }
  176. #ifdef OFFLINE_DBG_MODE
// Offline mode: fetch the previous-iteration copy of `tensor` from dump files.
// Returns the raw data pointer of the previous tensor (nullptr when history is
// missing or no previous iteration exists); on success *prev_num_elements is
// set. *history_not_found is set when the (rank, root_graph) pair has no
// recorded run history.
const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
                                         uint32_t *prev_num_elements, bool *history_not_found) {
  MS_EXCEPTION_IF_NULL(tensor);
  const void *previous_tensor_ptr = nullptr;
  std::shared_ptr<TensorData> tensor_prev;
  // Key the history lookup on (device/rank id, root graph id).
  std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
  if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
    *history_not_found = 1;
    MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
  } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
    // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
    // read data in offline mode
    std::vector<std::string> file_paths;
    if (!is_sync_mode_) {
      // Async dumps must first be converted before they can be read.
      ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                         std::vector<unsigned int>{tensor->GetDeviceId()},
                         std::vector<unsigned int>{tensor->GetPrevIteration()},
                         std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
    }
    std::vector<std::shared_ptr<TensorData>> result_list_prev;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetPrevIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     file_paths, &result_list_prev);
    // NOTE(review): assumes ReadDumpedTensor always appends one entry per
    // requested tensor (possibly empty), so [0] is safe — confirm.
    tensor_prev = result_list_prev[0];
    if (!tensor_prev->GetByteSize()) {
      // Empty read: previous iteration data unavailable; drop the placeholder.
      tensor_prev.reset();
    } else {
      previous_tensor_ptr = tensor_prev->GetDataPtr();
      *prev_num_elements = tensor_prev->GetNumElements();
    }
  }
  return previous_tensor_ptr;
}
  212. #endif
  213. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  214. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  215. std::string *const qualified_tensor_name,
  216. std::vector<watchpoint_t> *const watchpoints_to_check) {
  217. if (tensor == nullptr) {
  218. MS_LOG(DEBUG) << "tensor is nullptr.";
  219. return;
  220. }
  221. const auto tensor_name = tensor->GetName();
  222. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  223. const auto tensor_device_id = tensor->GetDeviceId();
  224. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  225. for (auto w_table_item : watchpoint_table_) {
  226. auto wp = std::get<1>(w_table_item);
  227. // check ONLY init conditions on initial suspended state.
  228. // skip other conditions on initial suspended state
  229. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  230. continue;
  231. }
  232. // skip init condition if not init suspend
  233. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  234. continue;
  235. }
  236. // check change conditions only on step end.
  237. if (wp.change_condition() && !step_end) {
  238. continue;
  239. }
  240. // if recheck, ignore the cache results and reanalyze everything.
  241. // if not a recheck, check only unanalyzed tensors
  242. if (!recheck) {
  243. wp_lock_.lock();
  244. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  245. wp_lock_.unlock();
  246. if (wp_cache_hit) {
  247. continue;
  248. }
  249. }
  250. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  251. if (!found.empty()) {
  252. *qualified_tensor_name = found;
  253. watchpoints_to_check->push_back(w_table_item.second);
  254. #ifdef OFFLINE_DBG_MODE
  255. if (wp.change_condition()) {
  256. *previous_iter_tensor_needed = true;
  257. }
  258. #endif
  259. }
  260. }
  261. }
  262. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  263. const std::string &tensor_name) {
  264. // add analyzed tensor to cache
  265. if (!recheck) {
  266. wp_lock_.lock();
  267. (void)wp_id_cache_[tensor_name].insert(id);
  268. wp_lock_.unlock();
  269. }
  270. }
// Append one watchpoint result record (hit or error) for a tensor/watchpoint
// pair into the per-chunk output vectors at index chunk_id; the caller merges
// chunks after the parallel check. The device_id / root_graph_id parameters
// act as feature flags: the per-chunk id vectors are filled only when the
// corresponding aggregate output vector was requested (non-null).
void DebugServices::SetCheckWatchpointsResult(
  const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  if (device_id != nullptr) {
    (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  }
  if (root_graph_id != nullptr) {
    (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  }
  (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
}
  296. #ifdef OFFLINE_DBG_MODE
// Offline mode: when a tensor could not be read (out of memory, or it has no
// dumped value and error_on_no_value is set), emit one result record per
// pending watchpoint carrying the corresponding error code, so the failure is
// surfaced instead of silently dropped.
void DebugServices::CheckOutofMemoryandNoValue(
  const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
  int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list) {
  bool set_is_needed = no_mem_to_read || error_on_no_value;
  int32_t error_code_to_set = 0;
  if (no_mem_to_read) {
    // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
    error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
  } else if (error_on_no_value) {
    error_code_to_set = ITensorSummary::NO_VALUE;
  }
  if (set_is_needed) {
    // One error record per watchpoint that was waiting on this tensor.
    for (auto &wp : watchpoints_to_check) {
      SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
                                chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
                                chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
                                qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
                                parameter_list, error_code_to_set);
    }
  }
}
  326. void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
  327. // set the tensor into not-in-use status in tensor_loader.
  328. auto tensor_name = tensor->GetName();
  329. std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
  330. std::to_string(tensor->GetRootGraphId()) + ":" +
  331. std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
  332. AppendToCacheEvictQueue(key_name_in_cache);
  333. if (previous_tensor_ptr != nullptr) {
  334. AppendToCacheEvictQueue(key_name_in_cache + ":prev");
  335. }
  336. }
  337. #endif
  338. #ifdef ONLINE_DBG_MODE
  339. bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
  340. auto debugger = Debugger::GetInstance();
  341. auto ms_context = MsContext::GetInstance();
  342. MS_EXCEPTION_IF_NULL(ms_context);
  343. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  344. auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
  345. if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  346. device_target == kAscendDevice) {
  347. if (cur_root_graph_id != id) {
  348. return false;
  349. }
  350. }
  351. return true;
  352. }
  353. const void *DebugServices::PreparePrevTensor(uint32_t *prev_num_elements, const std::string &tensor_name) {
  354. std::shared_ptr<TensorData> prev_tensor_data;
  355. if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
  356. // not supporting watchpoints that need prev tensor for multi root graph networks.
  357. MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
  358. prev_tensor_data = nullptr;
  359. } else {
  360. prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
  361. }
  362. if (prev_tensor_data) {
  363. *prev_num_elements = prev_tensor_data->GetNumElements();
  364. return prev_tensor_data->GetDataPtr();
  365. }
  366. return nullptr;
  367. }
  368. #endif
  369. void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
  370. // check history error_code only for offline debugger
  371. if (history_not_found) {
  372. *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
  373. }
  374. }
// Worker body for parallel watchpoint checking: processes tensors
// [begin, end) of *tensor_list and writes any hits/errors into the per-chunk
// result vectors at index chunk_id. In offline mode tensor data is read from
// dump files on demand and released again after the check.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  const std::vector<std::string> &op_overflows, const std::vector<std::string> &async_file_pool,
  partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // Clamp the chunk's end to the list size (the last chunk may be short).
  int list_size = tensor_list->size();
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // Strip the ":slot" suffix for node-name matching.
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read);
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // Read failed (no value or out of memory): record the error for each
      // pending watchpoint, then drop the tensor.
      CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_names,
                                 chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
                                 chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id,
                                 chunk_root_graph_id, device_id, root_graph_id, tensor->GetExecutionOrder(),
                                 tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, tensor->GetDeviceId(),
                                 tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    // Track the bytes this chunk has processed (reported by the caller).
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // Offline: previous-iteration data comes from dump files.
    bool history_not_found = 0;
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
#else
    // Online: skip tensors belonging to a different root graph than the
    // currently executing one.
    if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
      MS_LOG(DEBUG)
        << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
        << tensor->GetName();
      continue;
    }
    previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // Overflow-only checks don't need a numeric summary of the tensor.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
#ifdef OFFLINE_DBG_MODE
        // Missing run history overrides the summary's error code.
        CheckHistoryErrorCode(&error_code, history_not_found);
#endif
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      // Cache the (tensor, watchpoint) pair so non-recheck passes skip it.
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    SetTensorToNotInUse(tensor, previous_tensor_ptr);
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Evaluates every registered watchpoint against the tensors in *tensor_list.
// The tensor list is split into up to 16 chunks that are checked concurrently
// (one std::async task per chunk running CheckWatchpointsForTensor); the
// per-chunk hit records are then merged, in sorted order, into the
// caller-supplied output vectors by SortWatchpointsInfo.
// Outputs (appended per hit): name, slot, condition, watchpoint_id, parameters,
// error_codes, and optionally device_id / root_graph_id when non-null.
// Thread-safe: the whole check runs under lock_.
void DebugServices::CheckWatchpoints(
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  // Nothing to do when no watchpoints are registered.
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size <= 0) {
    return;
  }
  // default value for number of threads
  const int default_thread_num = 16;
  int max_thread_num = default_thread_num;
  // Never spawn more tasks than there are tensors to check.
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Distribute tensors as evenly as possible: the first `remainder` chunks get one extra.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // Per-chunk result buffers: each async task writes only to its own index, so no
  // extra synchronization is needed while the tasks run.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    // Launch one asynchronous check for tensors [begin, end) writing into chunk i.
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id, error_on_no_value));
    begin = end;
  }
  // Wait for all chunks and merge their results into the output vectors in sorted order.
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
// Waits for all per-chunk watchpoint-check tasks launched by CheckWatchpoints and
// merges their hit records into the caller-facing output vectors. Hits are inserted
// in sorted order: by tensor execution order in online mode, by dump-file timestamp
// in offline mode (the insertion index `position` comes from whichever #ifdef branch
// is compiled in). After each chunk is merged, its buffers are released and its
// tensor byte count is accumulated into *tensor_list_byte_size.
void DebugServices::SortWatchpointsInfo(
  std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    // Block until chunk i finishes; get() also propagates any exception from the task.
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
#ifdef ONLINE_DBG_MODE
      // if the execution order is repeated,inserts the new one before the others with same execution order.
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // Offline mode sorts by dump-file timestamp instead of execution order.
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
#endif
      // Insert every parallel output vector at the same position to keep them aligned.
      (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
      (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
      (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
      (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
      (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  }
}
  607. #ifdef OFFLINE_DBG_MODE
// Reads a tensor from a .npy dump file.
// Parses the npy header to extract the dtype code (*tensor_type, two characters such
// as "f4"), the shape, and the element size, then loads the raw bytes into a newly
// allocated buffer. On success *data_buffer owns the bytes (caller is responsible for
// deleting the vector) and *size holds the byte count. If host memory control is on
// and there is not enough memory, *no_mem_to_read is set and nothing is loaded.
// Parse/IO errors are logged and leave the outputs untouched.
void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
                                      std::string *const tensor_type, std::size_t *const size,
                                      std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
                                      bool *no_mem_to_read) {
  std::ifstream infile;
  std::string file_path = file_name;
  MS_LOG(INFO) << "Reading in file: " << file_path;
  infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  if (!infile.is_open()) {
    MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
    const int kMaxFilenameLength = 128;
    char err_info[kMaxFilenameLength];
    auto ret = strerror_r(errno, err_info, sizeof(err_info));
    if (ret != kStrErrorNone) {
      MS_LOG(ERROR) << " ErrInfo:" << ret;
    }
    return;
  }
  // npy v1 layout: 6-byte magic + 2-byte version, then a 2-byte little-endian header
  // length at offset 8, followed by the header dict text itself.
  const int substr_len = 2;
  const int header_len_offset = 8;
  const int header_offset = 9;
  const int header_len_buffer_size = 2;
  const int type_offset = 10;
  // get header length
  (void)infile.seekg(0, std::ios::beg);
  auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
    MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
    return;
  }
  uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  header_len_buffer.reset();
  // read in header
  (void)infile.seekg(0, std::ios::beg);
  auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
    MS_LOG(ERROR) << "Failed to read header from " << file_path;
    return;
  }
  std::string header(header_buffer->data() + header_offset, header_len);
  header_buffer.reset();
  // The two-character dtype code sits type_offset characters past the "descr" key
  // (skipping the "descr': '<" text in the header dict).
  std::size_t type_i = header.find("descr") + type_offset;
  if (header.length() < type_i + substr_len) {
    MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
    return;
  }
  *tensor_type = header.substr(type_i, substr_len);
  // Shape is the parenthesized, comma-separated list in the header dict.
  std::size_t shape_i_open = header.find("(");
  std::size_t shape_i_close = header.find(")");
  std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  std::string intermediate;
  std::stringstream check_shape(shape_str);
  MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  while (getline(check_shape, intermediate, ',')) {
    shape->push_back(std::stoi(intermediate));
  }
  // Second character of the dtype code is the element size in bytes (e.g. '4' in "f4").
  std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  std::size_t data_size = data_len * word_size;
  if (!data_size) {
    return;
  }
  // Check memory available before loading tensor into host.
  bool has_enough_memory = true;
  if (tensor_loader_->EnableMemoryControl()) {
    has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  }
  if (!has_enough_memory) {
    MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
    *no_mem_to_read = true;
  } else {
    // Tensor data starts right after the header (magic + version + len field + header
    // text), i.e. at offset header_len + type_offset.
    (void)infile.seekg(header_len + type_offset);
    *data_buffer = new std::vector<char>(data_size);
    if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
      MS_LOG(ERROR) << "Unable to get tensor data from npy";
    }
    *size = data_size;
  }
}
// Converts the device-format dump files queued in dir_to_files_map to host format
// (npy) by invoking the mindspore.offline_debug.convert_async Python tool, one dump
// directory at a time. Files whose converted name already appears in *result_list are
// skipped; the paths of newly converted files are appended to *result_list by
// ProcessConvertToHostFormat. Raises MS_LOG(EXCEPTION) if the converter package
// cannot be imported or fails.
void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                                        std::vector<std::string> *const result_list) {
  std::string file_format = "npy";
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::vector<std::string> files_after_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &file_name : d.second) {
      bool already_converted = false;
      // Remove scope from the file_name for matching files converted by mindinsight tool.
      std::size_t found_first_dot = file_name.find(".");
      std::size_t found_last_underscore = file_name.find_last_of("_");
      std::string file_name_without_scope = file_name;
      if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
        file_name_without_scope =
          file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
      }
      // Skip files for which a converted counterpart was already collected.
      for (std::string &file_found : *result_list) {
        if (file_found.find(file_name_without_scope) != std::string::npos) {
          already_converted = true;
          break;
        }
      }
      if (!already_converted) {
        (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
        (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
      }
    }
    MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
    if (!files_to_convert_in_dir.empty()) {
      // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
      // later task.
      try {
        auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
        auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
        (void)convert_obj.attr("convert_files")();
      } catch (pybind11::error_already_set &e) {
        MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
      }
      // Pick up the converted files from disk and record them in result_list.
      ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
    }
  }
}
  730. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  731. const std::string &dump_key, std::vector<std::string> *const result_list,
  732. const std::string &file_format) {
  733. std::string real_dump_iter_dir = RealPath(dump_key);
  734. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  735. if (d_handle == nullptr) {
  736. MS_LOG(INFO) << "Directory does not exist in ConvertToHostFormat.";
  737. return;
  738. }
  739. struct dirent *dir = nullptr;
  740. while ((dir = readdir(d_handle)) != nullptr) {
  741. struct stat st;
  742. std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
  743. int ret = stat(name.c_str(), &st);
  744. if (ret != 0) {
  745. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  746. (void)closedir(d_handle);
  747. return;
  748. }
  749. if (S_ISREG(st.st_mode)) {
  750. std::string candidate = dir->d_name;
  751. for (const std::string &file_to_find : files_after_convert_in_dir) {
  752. std::string file_n = file_to_find;
  753. auto last_slash_pos = file_to_find.find_last_of("\\/");
  754. if (last_slash_pos != std::string::npos) {
  755. file_n = file_to_find.substr(last_slash_pos + 1);
  756. }
  757. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  758. // we found a converted file for this op
  759. std::string found_file = dump_key + "/" + candidate;
  760. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  761. result_list->push_back(found_file);
  762. }
  763. }
  764. }
  765. }
  766. }
  767. (void)closedir(d_handle);
  768. }
  769. std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  770. if (dump_style_name.empty()) {
  771. return "";
  772. }
  773. std::size_t last_scope_marker;
  774. std::string delim = "/";
  775. last_scope_marker = dump_style_name.rfind(delim);
  776. if (last_scope_marker == std::string::npos) {
  777. return dump_style_name;
  778. }
  779. return dump_style_name.substr(last_scope_marker + delim.size());
  780. }
  781. void ReplaceSrcFileName(std::string *dump_style_name) {
  782. if (dump_style_name == nullptr) {
  783. return;
  784. }
  785. const std::string strsrc = "/";
  786. std::string strdst = "_";
  787. std::string::size_type pos = 0;
  788. std::string::size_type srclen = strsrc.size();
  789. std::string::size_type dstlen = strdst.size();
  790. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  791. (void)dump_style_name->replace(pos, srclen, strdst);
  792. pos += dstlen;
  793. }
  794. }
  795. void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
  796. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  797. std::vector<unsigned int> root_graph_id,
  798. std::vector<std::string> *const result_list) {
  799. std::string file_format = "npy";
  800. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  801. for (unsigned int i = 0; i < backend_name.size(); i++) {
  802. // form prefix of the tensor file to read from graph pb node name
  803. std::string dump_style_kernel_name = backend_name[i];
  804. // remove slot from name
  805. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  806. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  807. std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
  808. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
  809. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  810. // if node name is constant, skip
  811. if (prefix_dump_file_name.length() > (unsigned)strlen(constant_prefix) &&
  812. prefix_dump_file_name.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
  813. continue;
  814. }
  815. // search files in dir for the one that meets the filename prefix and read the file into memory
  816. std::string abspath = RealPath(specific_dump_dir);
  817. DIR *d = opendir(abspath.c_str());
  818. if (d == nullptr) {
  819. MS_LOG(INFO) << "Directory does not exist in ConvertReadTensors.";
  820. return;
  821. }
  822. ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
  823. (void)closedir(d);
  824. }
  825. ConvertToHostFormat(dir_to_files_map, result_list);
  826. }
  827. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  828. const std::string &specific_dump_dir,
  829. std::vector<std::string> *const result_list) {
  830. std::string file_format = "npy";
  831. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  832. for (const auto &node : proto_dump) {
  833. std::string dump_name = std::get<1>(node);
  834. dump_name = dump_name.substr(0, dump_name.rfind("."));
  835. // search files in dir for the one that meets the filename prefix and read the file into memory
  836. std::string abspath = RealPath(specific_dump_dir);
  837. DIR *d = opendir(abspath.c_str());
  838. if (d == nullptr) {
  839. MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
  840. return;
  841. }
  842. ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
  843. (void)closedir(d);
  844. }
  845. ConvertToHostFormat(dir_to_files_map, result_list);
  846. }
  847. void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
  848. const std::string &specific_dump_dir,
  849. std::map<std::string, std::vector<std::string>> *dir_to_files_map,
  850. std::vector<std::string> *const result_list) {
  851. MS_EXCEPTION_IF_NULL(dir_to_files_map);
  852. DIR *d = opendir(specific_dump_dir.c_str());
  853. struct dirent *dir = nullptr;
  854. while ((dir = readdir(d)) != nullptr) {
  855. struct stat st;
  856. std::string name = specific_dump_dir + std::string("/") + std::string(dir->d_name);
  857. int ret = stat(name.c_str(), &st);
  858. if (ret != 0) {
  859. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  860. (void)closedir(d);
  861. return;
  862. }
  863. if (!(S_ISREG(st.st_mode))) {
  864. continue;
  865. }
  866. std::string file_name = dir->d_name;
  867. std::string file_name_w_o_perfix = file_name;
  868. auto type_pos = file_name.find('.');
  869. if (type_pos == std::string::npos || file_name.find(prefix_dump_file_name, type_pos + 1) == std::string::npos) {
  870. continue;
  871. }
  872. if (file_name.rfind(file_format) == std::string::npos) {
  873. // if file matches prefix and is in device format add to candidate files to convert.
  874. (*dir_to_files_map)[specific_dump_dir].push_back(file_name);
  875. } else {
  876. // otherwise, if file matches prefix and already has been converted to host format
  877. // add to result of converted files.
  878. std::string found_file = specific_dump_dir + "/" + file_name;
  879. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  880. result_list->push_back(found_file);
  881. }
  882. }
  883. }
  884. (void)closedir(d);
  885. }
// Creates placeholder TensorData entries (no tensor bytes loaded yet) for every dump
// file in async_file_pool that belongs to a node from proto_dump within
// specific_dump_dir. For each node, the slot numbers are parsed out of matching file
// names and one entry per slot — carrying the given iteration/device/root-graph ids —
// is appended to *tensor_list. Data is read later on demand.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    for (const std::string &file_name : async_file_pool) {
      // Compare against the basename of the pooled file path.
      std::string file_name_to_check = file_name;
      auto delim = file_name.rfind("/");
      if (delim != std::string::npos) {
        file_name_to_check = file_name.substr(delim + 1);
      }
      std::size_t found = file_name_to_check.find(dump_name);
      std::size_t found_out = file_name_to_check.find(output_str);
      std::size_t found_dot_start = file_name_to_check.find(".", found_out);
      std::size_t found_dot_end = file_name_to_check.find(".", found_dot_start);
      // A file matches when it lives in specific_dump_dir and contains both the node
      // name and the input/output marker; the slot number is parsed from the field
      // following that marker (stoul stops at the first non-digit).
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(
          std::stoul(file_name_to_check.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(nullptr);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
      tensor_list->push_back(tensor_data);
    }
  }
}
  935. uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
  936. std::regex re;
  937. if (mode == "rank") {
  938. re = "^rank_([0-9]+)$";
  939. } else if (mode == "graph") {
  940. re = "^([0-9]+)$";
  941. }
  942. std::smatch tokens;
  943. if (regex_match(name, tokens, re)) {
  944. return std::stoi(tokens[1]);
  945. } else {
  946. return UINT32_MAX;
  947. }
  948. }
  949. std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
  950. std::vector<uint32_t> rank_id_list;
  951. std::string dump_dir = GetDumpDir();
  952. DIR *d_handle = opendir(dump_dir.c_str());
  953. if (d_handle == nullptr) {
  954. MS_LOG(ERROR) << "Dump directory does not exist.";
  955. return rank_id_list;
  956. }
  957. struct dirent *dir = nullptr;
  958. while ((dir = readdir(d_handle)) != nullptr) {
  959. struct stat st;
  960. std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
  961. int ret = stat(name.c_str(), &st);
  962. if (ret != 0) {
  963. MS_LOG(ERROR) << "stat error, ret is: " << ret;
  964. (void)closedir(d_handle);
  965. return rank_id_list;
  966. }
  967. if (S_ISDIR(st.st_mode)) {
  968. std::string rank_dir_name = dir->d_name;
  969. if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
  970. rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
  971. }
  972. }
  973. }
  974. (void)closedir(d_handle);
  975. return rank_id_list;
  976. }
// For every rank in rank_id_list, walks <dump_dir>/rank_<id>/<net_name> and, for each
// purely-numeric graph sub-directory found, loads that graph's execution history via
// ReadGraphsHistory. Missing rank directories are skipped; a stat failure aborts the
// whole walk.
void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
  std::string net_name = GetNetName();
  std::string dump_dir = GetDumpDir();
  for (uint32_t rank_id : rank_id_list) {
    std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
    std::string abspath = RealPath(path);
    DIR *d_handle_rank = opendir(abspath.c_str());
    if (d_handle_rank == nullptr) {
      MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
      continue;
    }
    struct dirent *direc = nullptr;
    while ((direc = readdir(d_handle_rank)) != nullptr) {
      struct stat st;
      std::string name = abspath + std::string("/") + std::string(direc->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d_handle_rank);
        return;
      }
      if (S_ISDIR(st.st_mode)) {
        std::string graph_dir = direc->d_name;
        // Skip the current/parent directory entries.
        if (graph_dir == "." || graph_dir == "..") {
          continue;
        }
        // Only purely numeric directory names are graph ids.
        if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
          uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
          ReadGraphsHistory(rank_id, graph_id);
        }
      }
    }
    (void)closedir(d_handle_rank);
  }
}
  1012. void DebugServices::SetGraphsHistory() {
  1013. // extract rank_id_list
  1014. std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
  1015. // for each rank_id extract the graph_id list and set the dump version
  1016. // and for each graph read the graph history file
  1017. CheckDumpGraphIdList(rank_id_list);
  1018. }
  1019. void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
  1020. std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
  1021. if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
  1022. // graph history was already stored for this rank_id and graph_id
  1023. return;
  1024. }
  1025. std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
  1026. std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
  1027. DIR *d_handle = opendir(exec_order_path.c_str());
  1028. if (d_handle == nullptr) {
  1029. MS_LOG(ERROR) << "Execution order directory does not exist.";
  1030. return;
  1031. }
  1032. // read file and store the info
  1033. std::string full_path = exec_order_path + "/" + file_to_check;
  1034. std::string checked_path = RealPath(full_path);
  1035. if (!checked_path.empty()) {
  1036. ReadGraphRunIter(checked_path, rank_and_graph);
  1037. }
  1038. (void)closedir(d_handle);
  1039. }
  1040. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
  1041. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
  1042. for (auto w_table_item : watchpoint_table_) {
  1043. auto wp = std::get<1>(w_table_item);
  1044. unsigned int index = 0;
  1045. for (auto check_node : wp.check_node_list) {
  1046. std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
  1047. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1048. // graph represents root_graph for Ascend and kernel_graph for GPU
  1049. for (auto rank : ranks) {
  1050. for (auto graph : graphs) {
  1051. std::tuple<uint32_t, uint32_t> key(rank, graph);
  1052. (rank_and_graph_to_nodes)[key].push_back(check_node);
  1053. }
  1054. }
  1055. index++;
  1056. }
  1057. }
  1058. return rank_and_graph_to_nodes;
  1059. }
  1060. void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
  1061. std::ifstream infile;
  1062. std::string line;
  1063. infile.open(file_path.c_str());
  1064. if (!infile.is_open()) {
  1065. MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
  1066. const int kMaxFilenameLength = NAME_MAX;
  1067. char err_info[kMaxFilenameLength];
  1068. if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
  1069. MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
  1070. }
  1071. return;
  1072. }
  1073. std::vector<uint32_t> run_iters_vec;
  1074. while (std::getline(infile, line)) {
  1075. uint32_t iter;
  1076. std::stringstream ss(line);
  1077. ss >> iter;
  1078. run_iters_vec.push_back(iter);
  1079. }
  1080. (void)graphs_run_history_.emplace(
  1081. std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
  1082. }
// Builds a TensorData entry from a dumped tensor's metadata and (optional) raw bytes,
// registers it with the tensor-loader cache when it carries data, and appends it to
// *result_list. `buffer` may be nullptr for metadata-only entries. NOTE(review): the
// TensorData stores a pointer into `buffer`'s storage, so the buffer must outlive the
// entry — ownership is not transferred here.
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                    const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer,
                                    std::vector<std::shared_ptr<TensorData>> *const result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  tensor_data->SetIsOutput(is_output);
  if (buffer != nullptr) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(nullptr);
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  tensor_data->SetTimeStamp(time_stamp);
  tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  // Only tensors that actually carry bytes go into the loader cache.
  if (data_size) {
    (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  }
  // add to result_list
  result_list->push_back(tensor_data);
}
  1114. void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
  1115. std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
  1116. std::string dump_style_name_part = *dump_style_kernel_name;
  1117. dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
  1118. std::string slot_str;
  1119. if (is_output) {
  1120. slot_str = ".output." + std::to_string(slot);
  1121. } else {
  1122. slot_str = ".input." + std::to_string(slot);
  1123. }
  1124. dump_style_name_part += slot_str;
  1125. *prefix_dump_file_name = dump_style_name_part;
  1126. *slot_string_to_check = slot_str;
  1127. }
  1128. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  1129. // get file with the newest timestamp from the list.
  1130. if (file_list.empty()) {
  1131. return "";
  1132. }
  1133. std::sort(file_list.begin(), file_list.end());
  1134. return file_list.back();
  1135. }
  1136. std::string GetTimeStampStr(std::string file_path) {
  1137. // get the file_name from file_path.
  1138. size_t pos = file_path.rfind("/");
  1139. std::string file_name = file_path.substr(pos + 1);
  1140. size_t first_dot = file_name.rfind(".");
  1141. size_t second_dot = file_name.rfind(".", first_dot - 1);
  1142. size_t third_dot = file_name.rfind(".", second_dot - 1);
  1143. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  1144. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  1145. if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
  1146. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  1147. return time_stamp;
  1148. }
  1149. return "";
  1150. }
// Reads one dumped tensor per entry of the parallel input vectors
// (backend_name[i], slot[i], device_id[i], iteration[i], root_graph_id[i],
// is_output[i]) and appends the result to result_list. Constants and
// sync-mode tensors are read by scanning the dump directory; async-mode
// tensors are matched against the pre-built async_file_pool.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *const result_list,
                                     bool *no_mem_to_read) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name (backend names carry a trailing ":<slot>")
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    std::string specific_dump_dir;
    bool is_cst = false;
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    // prefix_dump_to_check is node name used to find corresponding dump file
    std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
    // if node name has prefix of "Default--data-", consider as constant, search in cst folder
    if (prefix_dump_to_check.length() > (unsigned)strlen(constant_prefix) &&
        prefix_dump_to_check.substr(0, (unsigned)strlen(constant_prefix)).compare(constant_prefix) == 0) {
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/constants";
      is_cst = true;
      // Constant dump files drop the "Default--" scope marker from the name.
      const std::string prefix = "Default--";
      prefix_dump_file_name = prefix_dump_file_name.substr(prefix.length());
      prefix_dump_to_check = prefix_dump_to_check.substr(prefix.length());
    } else {
      specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                          std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    }
    MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
    // search files in dir for the one that meets the filename prefix and read the file into memory
    if (is_sync_mode_ || is_cst) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
                            no_mem_to_read);
    }
  }
}
  1195. void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
  1196. const std::string &backend_name, const unsigned int device_id,
  1197. const unsigned int root_graph_id, const bool &is_output, size_t slot,
  1198. bool *no_mem_to_read, unsigned int iteration,
  1199. std::vector<std::shared_ptr<TensorData>> *result_list) {
  1200. std::string time_stamp = "";
  1201. std::string type_name = "";
  1202. size_t data_size = 0;
  1203. std::vector<int64_t> shape;
  1204. std::vector<char> *buffer = nullptr;
  1205. if (found) {
  1206. std::string result_path = GetNewestFilePath(matched_paths);
  1207. time_stamp = GetTimeStampStr(result_path);
  1208. std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
  1209. std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
  1210. std::to_string(slot);
  1211. ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
  1212. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
  1213. type_name, shape, buffer, result_list);
  1214. } else {
  1215. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
  1216. buffer, result_list);
  1217. MS_LOG(INFO) << "Target tensor has not been found.";
  1218. }
  1219. }
// Sync-mode read: scans specific_dump_dir for regular files whose stripped
// name (task/stream/timestamp removed) starts with prefix_dump_file_name,
// then loads the newest match (or a placeholder if none) via
// ReadFileAndAddToTensor.
void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                                         const std::string &backend_name, size_t slot, const unsigned int device_id,
                                         unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                                         std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  std::string abspath = RealPath(specific_dump_dir);
  DIR *d = opendir(abspath.c_str());
  bool found_file = false;
  std::vector<std::string> matched_paths;
  if (d == nullptr) {
    MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      struct stat st;
      std::string name = abspath + std::string("/") + std::string(dir->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        // A failed stat aborts the whole scan; nothing is added to result_list.
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d);
        return;
      }
      // Only regular files are candidates; directories/symlinks are skipped.
      if (S_ISREG(st.st_mode)) {
        std::string file_name = dir->d_name;
        std::string stripped_file_name = GetStrippedFilename(file_name);
        if (stripped_file_name.empty()) {
          continue;
        }
        // rfind(prefix, 0) == 0 <=> stripped name starts with the prefix.
        std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
        if (found != 0) {
          continue;
        }
        std::string full_path = specific_dump_dir + "/" + file_name;
        matched_paths.push_back(full_path);
        found_file = true;
      }
    }
    (void)closedir(d);
  }
  ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
                         no_mem_to_read, iteration, result_list);
}
  1261. void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  1262. const std::string &slot_string_to_check, const std::string &backend_name,
  1263. size_t slot, unsigned int device_id, unsigned int iteration,
  1264. unsigned int root_graph_id, const bool &is_output,
  1265. const std::vector<std::string> &async_file_pool,
  1266. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1267. bool found = false;
  1268. std::vector<std::string> matched_paths;
  1269. // if async mode
  1270. for (const std::string &file_path : async_file_pool) {
  1271. std::string file_name_to_check = file_path;
  1272. auto delim = file_path.rfind("/");
  1273. if (delim != std::string::npos) {
  1274. file_name_to_check = file_path.substr(delim + 1);
  1275. }
  1276. if (file_path.find(specific_dump_dir) != std::string::npos &&
  1277. file_name_to_check.find(prefix_dump_to_check) != std::string::npos &&
  1278. file_name_to_check.find(slot_string_to_check) != std::string::npos) {
  1279. matched_paths.push_back(file_path);
  1280. found = true;
  1281. }
  1282. }
  1283. ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
  1284. iteration, result_list);
  1285. }
// Strips the task_id, stream_id, and timestamp fields out of a dump file
// name so it can be compared against a "<node>.<io>.<slot>" prefix.
// Assumes names of the form
//   <type>.<node_name>.<task>.<stream>.<timestamp>.<io>.<slot>.<format>.npy
// (the node name may itself contain dots, hence the backward counting) and
// returns e.g. "<node_name>.output.0"; returns "" when the name is too short.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // Counted from the front of a standard 9-field name, these are the 7th and
  // 5th dots; they are located by walking backwards from the end.
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  if (second_dot == std::string::npos || second_dot <= first_dot) {
    return std::string();
  }
  // node name between the 1st and 2nd dots + ".<io>.<slot>" between 5th/7th.
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
  1308. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  1309. unsigned int iteration, std::vector<std::string> *const async_file_pool, bool error_on_no_value) {
  1310. // get a list of nodes and the devices they are on to monitor
  1311. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1312. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
  1313. GetAllWpNodes();
  1314. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  1315. // as they are found
  1316. for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
  1317. std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
  1318. uint32_t rank_id = std::get<0>(rank_and_graph);
  1319. uint32_t root_graph_id = std::get<1>(rank_and_graph);
  1320. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
  1321. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  1322. std::string real_dump_dir = RealPath(specific_dump_dir);
  1323. if (real_dump_dir.empty()) {
  1324. MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skit it.";
  1325. continue;
  1326. }
  1327. std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
  1328. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  1329. // convert node names to dump style
  1330. for (auto node : wp_nodes) {
  1331. std::string orig_name = std::get<0>(node);
  1332. // Remove the scope from the fully qualified name to compare for both sync and async case.
  1333. std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
  1334. bool node_is_out = std::get<1>(node);
  1335. if (node_is_out) {
  1336. dump_style_name += ".output";
  1337. } else {
  1338. dump_style_name += ".input";
  1339. }
  1340. if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
  1341. std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
  1342. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  1343. }
  1344. }
  1345. if (is_sync_mode_) {
  1346. // search files in dir for the one that meets the filename prefix and read the file into memory
  1347. ProcessTensorDataSync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
  1348. error_on_no_value);
  1349. } else {
  1350. // convert all files in proto_to_dump to npy and add to pool of async file names
  1351. ConvertWatchPointNodes(proto_to_dump, real_dump_dir, async_file_pool);
  1352. GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
  1353. &tensor_list);
  1354. }
  1355. }
  1356. return tensor_list;
  1357. }
// Sync-mode scan for watchpoint tensors: walks specific_dump_dir and, for
// every regular file whose stripped name starts with one of the watched
// "<node>.<io>" prefixes, registers a shape-less placeholder TensorData
// (the actual payload is read later on demand).
void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
                                          const std::string &specific_dump_dir, unsigned int iteration,
                                          unsigned int device_id, unsigned int root_graph_id,
                                          std::vector<std::shared_ptr<TensorData>> *const tensor_list,
                                          bool error_on_no_value) {
  DIR *d = opendir(specific_dump_dir.c_str());
  if (d == nullptr) {
    MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ProcessTensorDataSync.";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      struct stat st;
      std::string name = specific_dump_dir + std::string("/") + std::string(dir->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        // A failed stat aborts the whole scan.
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d);
        return;
      }
      if (S_ISREG(st.st_mode)) {
        std::string file_name = dir->d_name;
        for (auto &node : proto_to_dump) {
          std::string dump_name = std::get<1>(node);
          std::string stripped_file_name = GetStrippedFilename(file_name);
          if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
            continue;
          }
          std::size_t found = stripped_file_name.rfind(dump_name, 0);
          if (found == 0) {
            // Slot number follows "<dump_name>." in the stripped name.
            // NOTE(review): stoul throws if the suffix is not numeric —
            // presumed guaranteed by the dump naming scheme; confirm.
            size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
            std::vector<int64_t> shape;
            std::string orig_name = std::get<0>(node);
            // dump_name ends in ".output" or ".input"; recover the direction.
            std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
            bool output_flag = (output_str == "output");
            AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
                            nullptr, tensor_list);
            break;
          }
        }
      }
    }
    (void)closedir(d);
  }
}
  1402. std::string DebugServices::IterationString(unsigned int iteration) {
  1403. std::string iteration_string;
  1404. bool init_dbg_suspend = (iteration == UINT_MAX);
  1405. if (init_dbg_suspend) {
  1406. iteration_string = "init";
  1407. } else {
  1408. iteration_string = std::to_string(iteration);
  1409. }
  1410. return iteration_string;
  1411. }
  1412. #endif
  1413. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1414. std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1415. std::vector<unsigned int> *const dtype,
  1416. std::vector<std::vector<int64_t>> *const shape) {
  1417. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1418. tensor_loader_->SearchTensors(name, &result_list);
  1419. for (auto result : result_list) {
  1420. if (std::get<1>(result) == nullptr) {
  1421. continue;
  1422. }
  1423. #ifdef ONLINE_DBG_MODE
  1424. if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
  1425. MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
  1426. << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId()
  1427. << ".";
  1428. MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
  1429. }
  1430. #endif
  1431. (void)ret_name->emplace_back(std::get<0>(result));
  1432. (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
  1433. (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
  1434. (void)dtype->emplace_back(std::get<1>(result)->GetType());
  1435. (void)shape->emplace_back(std::get<1>(result)->GetShape());
  1436. }
  1437. }
// Thin guard around the tensor loader lookup; a null output vector is a
// caller-side programming error, so it is logged and the call is a no-op.
void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
                                       std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  if (result_list == nullptr) {
    MS_LOG(DEBUG) << "result_list is nullptr.";
    return;
  }
  tensor_loader_->SearchTensors(name, result_list);
}
  1446. #ifdef ONLINE_DBG_MODE
  1447. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1448. bool ret = false;
  1449. for (auto w_table_item : watchpoint_table_) {
  1450. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1451. for (auto check_node : check_node_list) {
  1452. std::string w_name = std::get<0>(check_node);
  1453. bool w_type = std::get<1>(check_node);
  1454. if ((w_type == true &&
  1455. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1456. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1457. ret = true;
  1458. return ret;
  1459. }
  1460. }
  1461. }
  1462. return ret;
  1463. }
  1464. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1465. if (kernel != nullptr && w_name.length() > 0) {
  1466. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  1467. for (size_t j = 0; j < input_size; ++j) {
  1468. auto input_kernel = kernel->input(j + 1);
  1469. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1470. auto found = w_name.find_last_of('/');
  1471. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1472. return true;
  1473. }
  1474. return false;
  1475. } else {
  1476. return false;
  1477. }
  1478. }
  1479. #endif
// Returns all tensors currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Fetches the tensor registered under tensor_name from the tensor loader.
std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
  return tensor_loader_->GetTensor(tensor_name);
}
// Clears the tensor loader's "current" tensor map.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1485. #ifdef ONLINE_DBG_MODE
// Delegates dumping of a cached tensor to the tensor loader, forwarding all
// format/shape/type information unchanged. Returns the loader's result.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  1493. #endif
// Hands a freshly read tensor to the loader; keep_prev controls whether the
// loader retains the previous iteration's copy.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
  1497. uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  1498. uint32_t prev_iter;
  1499. uint32_t rank_id = tensor->GetDeviceId();
  1500. uint32_t root_graph_id = tensor->GetRootGraphId();
  1501. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  1502. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  1503. return UINT32_MAX;
  1504. }
  1505. auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
  1506. tensor->GetIteration());
  1507. if (it == graphs_run_history_[rank_and_graph].end()) {
  1508. // The graph is not executed in that iteration
  1509. return UINT32_MAX;
  1510. } else if (it == graphs_run_history_[rank_and_graph].begin()) {
  1511. // current iteration is the first iteration that the graph was run
  1512. // no prev iter is available
  1513. MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
  1514. << " is the first run iteration for tensor: " << tensor->GetName();
  1515. return UINT32_MAX;
  1516. }
  1517. it--;
  1518. prev_iter = *it;
  1519. tensor->SetPrevIteration(prev_iter);
  1520. return prev_iter;
  1521. }
// Clears per-iteration watchpoint/overflow caches and rotates the tensor
// loader maps so that parameters survive into the next iteration.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1531. #ifdef ONLINE_DBG_MODE
  1532. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1533. MS_EXCEPTION_IF_NULL(kernel);
  1534. std::vector<std::shared_ptr<TensorData>> result;
  1535. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1536. auto kernel_name = GetKernelNodeName(kernel);
  1537. for (size_t j = 0; j < output_size; ++j) {
  1538. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1539. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1540. if (tensor != nullptr) {
  1541. result.push_back(tensor);
  1542. }
  1543. }
  1544. return result;
  1545. }
  1546. #endif
// Resolves the operator-overflow bin directory for the current iteration.
// Only meaningful in online debugger builds; in offline builds (no
// ONLINE_DBG_MODE) this always returns "". Returns "" as well when dump is
// not configured, no graph is active, or the path cannot be resolved.
std::string GetOnlineOpOverflowDir() {
  // only called for online debugger mode
  // get operator overflow directory for current iteration
  std::string overflow_bin_path = "";
#ifdef ONLINE_DBG_MODE
  if (DumpJsonParser::GetInstance().path().empty()) {
    MS_LOG(INFO) << "Dump config is not set.";
    return "";
  }
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  auto cur_graph = debugger->GetGraphPtr();
  if (cur_graph == nullptr) {
    return "";
  }
  overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(cur_graph->root_graph_id());
  auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
  if (!realpath.has_value()) {
    MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
    return "";
  }
  // Trailing '/' so callers can append file names directly.
  overflow_bin_path = realpath.value() + '/';
#endif
  return overflow_bin_path;
}
// Scans overflow_bin_path and collects the names of operators that hit an
// overflow. Two kinds of files participate: "Opdebug.Node_OpDebug." files
// record (task_id, stream_id) pairs that overflowed, while regular dump bin
// files map (task_id, stream_id) back to an operator name. The intersection
// of the two yields op_names.
void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, std::vector<std::string> *op_names) {
  MS_EXCEPTION_IF_NULL(op_names);
  std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  DIR *d = opendir(overflow_bin_path.c_str());
  if (d == nullptr) {
    MS_LOG(INFO) << "OverFlow bin directory does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      struct stat st;
      std::string name = overflow_bin_path + std::string("/") + std::string(dir->d_name);
      int ret = stat(name.c_str(), &st);
      if (ret != 0) {
        MS_LOG(ERROR) << "stat error, ret is: " << ret;
        (void)closedir(d);
        return;
      }
      if (S_ISREG(st.st_mode)) {
        // form fully qualified filename
        std::string file_path = name;
        std::string file_name = dir->d_name;
        // attempt to read the file
        // NOTE(review): the stream's contents are never read here — the open
        // only verifies the file is accessible before its name is parsed.
        std::ifstream infile;
        infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
        if (!infile.is_open()) {
          MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
          continue;
        }
        std::string node_name;
        uint64_t task_id = 0;
        uint64_t stream_id = 0;
        // detect overflow bin file
        if (file_name.rfind(overflow_file_prefix, 0) == 0) {
          if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
            continue;
          }
          MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
                       << ".";
          task_stream_hit.push_back(std::make_pair(task_id, stream_id));
        } else {
          // regular bin file
          bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
          if (success_parse) {
            task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
          }
        }
        infile.close();
      }
    }
    (void)closedir(d);
  }
  // find the op_names with an overflow hit
  for (auto &task_stream : task_stream_hit) {
    auto op_name = task_stream_to_opname[task_stream];
    if (!op_name.empty()) {
      MS_LOG(INFO) << "Operation overflow detected in " << op_name;
      op_names->push_back(op_name);
    }
  }
}
  1635. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1636. unsigned int iteration) {
  1637. std::string overflow_bin_path = "";
  1638. #ifdef ONLINE_DBG_MODE
  1639. overflow_bin_path = GetOnlineOpOverflowDir();
  1640. #else
  1641. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1642. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1643. overflow_bin_path = RealPath(overflow_bin_path);
  1644. #endif
  1645. if (overflow_bin_path.empty()) {
  1646. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1647. return false;
  1648. }
  1649. // remove kernel_graph_#
  1650. std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
  1651. std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
  1652. // remove path
  1653. size_t last_slash = node_name_to_find.rfind("/");
  1654. std::string op_name_find = "";
  1655. if (last_slash != std::string::npos) {
  1656. op_name_find = node_name_to_find.substr(last_slash + 1);
  1657. }
  1658. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1659. std::vector<std::string> op_names;
  1660. overflow_wp_lock_.lock();
  1661. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1662. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1663. if (found_overflows != overflow_ops_.end()) {
  1664. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1665. op_names = overflow_ops_[overflow_bin_path];
  1666. } else {
  1667. AddOpOverflowOpNames(overflow_bin_path, &op_names);
  1668. overflow_ops_[overflow_bin_path] = op_names;
  1669. }
  1670. overflow_wp_lock_.unlock();
  1671. // determine if overflow wp has been triggered for the op name with path (from bin file)
  1672. if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
  1673. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1674. return true;
  1675. }
  1676. // determine if overflow wp has been triggered for the op name (from npy file)
  1677. if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
  1678. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1679. return true;
  1680. }
  1681. return false;
  1682. }
  1683. std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) {
  1684. std::string op_name_to_find = node_name_to_find;
  1685. const std::string kernel_prefix = "kernel_graph_";
  1686. if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
  1687. auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
  1688. if (start_of_op_name != std::string::npos) {
  1689. op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
  1690. }
  1691. }
  1692. return op_name_to_find;
  1693. }
  1694. bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *task_id,
  1695. uint64_t *stream_id) {
  1696. size_t task_pos_start = overflow_file_prefix.length();
  1697. size_t task_pos_end = file_name.find(".", task_pos_start);
  1698. if (task_pos_end == std::string::npos) {
  1699. MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
  1700. return false;
  1701. }
  1702. size_t stream_pos_start = task_pos_end + 1;
  1703. size_t stream_pos_end = file_name.find(".", stream_pos_start);
  1704. if (stream_pos_end == std::string::npos) {
  1705. MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
  1706. return false;
  1707. }
  1708. std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
  1709. std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
  1710. *task_id = std::stoull(task_id_str);
  1711. *stream_id = std::stoull(stream_id_str);
  1712. return true;
  1713. }
// Extracts node_name, task_id, and stream_id from a dump file name of the
// form "node_type.node_name.task_id.stream_id.{etcetera}". Returns false
// (with an error log) on any structural or numeric parse failure.
// NOTE(review): a node_name containing '.' would truncate at its first dot —
// presumably regular bin names do not embed dots here; confirm.
bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
                                         uint64_t *stream_id) {
  // get the node_name, task_id, and stream_id from dump filename
  // node_type.node_name.task_id.stream_id.{etcetera}
  size_t first_dot = file_name.find(".");
  size_t second_dot = file_name.find(".", first_dot + 1);
  size_t third_dot = file_name.find(".", second_dot + 1);
  size_t fourth_dot = file_name.find(".", third_dot + 1);
  // check if dots were found
  if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
      fourth_dot == std::string::npos) {
    return false;
  }
  // get node_name
  if (first_dot < second_dot) {
    *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  } else {
    MS_LOG(ERROR) << "filename parse error to get node_name.";
    return false;
  }
  // get task id
  if (second_dot < third_dot) {
    std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
    try {
      *task_id = std::stoull(extracted_task_id);
    } catch (std::invalid_argument &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
      return false;
    } catch (std::out_of_range &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "filename parse error to get task_id.";
    return false;
  }
  // get stream id
  if (third_dot < fourth_dot) {
    std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
    try {
      *stream_id = std::stoull(extracted_stream_id);
    } catch (std::invalid_argument &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
      return false;
    } catch (std::out_of_range &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "filename parse error to get stream_id.";
    return false;
  }
  return true;
}
// Resolves input_path via realpath(3). When the path contains a directory
// component, only the directory must exist (its resolved form plus the file
// name is returned, so the file itself may not exist yet); a bare file name
// is resolved directly. Returns "" when the directory is missing; raises on
// over-long paths/names.
std::string DebugServices::RealPath(const std::string &input_path) {
  if (input_path.length() >= PATH_MAX) {
    MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  }
  size_t path_split_pos = input_path.find_last_of('/');
  // get real path
  char real_path[PATH_MAX] = {0};
  // input_path is dir + file_name
  if (path_split_pos != std::string::npos) {
    std::string prefix_path = input_path.substr(0, path_split_pos);
    std::string file_name = input_path.substr(path_split_pos);
    if (file_name.length() > NAME_MAX) {
      MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
    }
    if (realpath(prefix_path.c_str(), real_path) == nullptr) {
      MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
      return "";
    }
    // file_name still starts with '/', so this forms "<resolved dir>/<name>".
    return std::string(real_path) + file_name;
  }
  // input_path is only file_name
  if (input_path.length() > NAME_MAX) {
    MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  }
  if (realpath(input_path.c_str(), real_path) == nullptr) {
    MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  }
  // On realpath failure the zero-initialized buffer yields an empty string.
  return std::string(real_path);
}
  1797. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  1798. #if defined(__APPLE__)
  1799. return *reinterpret_cast<const uint64_t *>(buffer.data());
  1800. #else
  1801. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  1802. #endif
  1803. }
  1804. bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  1805. return tensor_loader_->TensorExistsInCurrent(tensor_name);
  1806. }
  1807. void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  1808. tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
  1809. }
  1810. void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  1811. if (tensor_loader_->EnableMemoryControl()) {
  1812. tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  1813. }
  1814. }
  1815. void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
  1816. std::string DebugServices::GetNetName() { return net_name_; }
  1817. void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
  1818. std::string DebugServices::GetDumpDir() { return dump_dir_; }
  1819. void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
  1820. bool DebugServices::GetSyncMode() { return is_sync_mode_; }
  1821. void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  1822. #ifdef ONLINE_DBG_MODE
  1823. } // namespace mindspore
  1824. #endif