You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 73 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include "pybind11/embed.h"
  29. #include "pybind11/stl.h"
  30. #ifdef ONLINE_DBG_MODE
  31. #include "debug/common.h"
  32. #include "debug/debugger/debugger.h"
  33. #include "debug/anf_ir_utils.h"
  34. #include "backend/session/anf_runtime_algorithm.h"
  35. #endif
  36. #include "debug/debugger/tensor_summary.h"
  37. #include "utils/file_utils.h"
  38. #ifdef ONLINE_DBG_MODE
  39. namespace mindspore {
  40. #endif
  41. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  42. DebugServices::DebugServices(const DebugServices &other) {
  43. wp_id_cache_ = other.wp_id_cache_;
  44. net_name_ = other.net_name_;
  45. dump_dir_ = other.dump_dir_;
  46. is_sync_mode_ = other.is_sync_mode_;
  47. tensor_loader_ = other.tensor_loader_;
  48. watchpoint_table_ = other.watchpoint_table_;
  49. }
  50. DebugServices &DebugServices::operator=(const DebugServices &other) {
  51. if (this != &other) {
  52. tensor_loader_ = other.tensor_loader_;
  53. watchpoint_table_ = other.watchpoint_table_;
  54. }
  55. return *this;
  56. }
  57. void DebugServices::AddWatchpoint(
  58. unsigned int id, unsigned int watch_condition, float parameter,
  59. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  60. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  61. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  62. std::lock_guard<std::mutex> lg(lock_);
  63. watchpoint_t watchpoint_item;
  64. watchpoint_item.id = id;
  65. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  66. watchpoint_item.condition.parameter = parameter;
  67. watchpoint_item.check_node_list = check_node_list;
  68. if (check_node_device_list != nullptr) {
  69. watchpoint_item.check_node_device_list = *check_node_device_list;
  70. }
  71. if (check_node_graph_list != nullptr) {
  72. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  73. }
  74. watchpoint_item.parameter_list = parameter_list;
  75. watchpoint_table_[id] = watchpoint_item;
  76. }
  77. void DebugServices::RemoveWatchpoint(unsigned int id) {
  78. std::lock_guard<std::mutex> lg(lock_);
  79. (void)watchpoint_table_.erase(id);
  80. }
// Factory: build the TensorSummary specialization matching `tensor_dtype` over the
// tensor's raw data buffer (and optionally the previous iteration's buffer), so the
// caller can evaluate watchpoints / statistics type-generically through the
// ITensorSummary interface.
//
// tensor              - source of the current data pointer; must be non-null.
// previous_tensor_ptr - previous-iteration data, or nullptr when not needed.
// num_elements / prev_num_elements - element counts for the two buffers.
// tensor_dtype        - DbgDataType value selecting the specialization.
// Returns an empty unique_ptr for dtypes with no specialization; callers must
// check for null before use.
std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
                                              const void *const previous_tensor_ptr, uint32_t num_elements,
                                              uint32_t prev_num_elements, int tensor_dtype) {
  MS_EXCEPTION_IF_NULL(tensor);
  switch (tensor_dtype) {
    case DbgDataType::DT_UINT8: {
      return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                      prev_num_elements);
    }
    case DbgDataType::DT_INT8: {
      return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                     prev_num_elements);
    }
    case DbgDataType::DT_UINT16: {
      return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                       prev_num_elements);
    }
    case DbgDataType::DT_INT16: {
      return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                      prev_num_elements);
    }
    case DbgDataType::DT_UINT32: {
      return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                       prev_num_elements);
    }
    // DT_BASE_INT shares the int32 specialization
    case DbgDataType::DT_INT32:
    case DbgDataType::DT_BASE_INT: {
      return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                      prev_num_elements);
    }
    case DbgDataType::DT_UINT64: {
      return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                       prev_num_elements);
    }
    case DbgDataType::DT_INT64: {
      return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                      prev_num_elements);
    }
    case DbgDataType::DT_FLOAT16: {
      return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                      prev_num_elements);
    }
    // DT_BASE_FLOAT shares the float32 specialization
    case DbgDataType::DT_FLOAT32:
    case DbgDataType::DT_BASE_FLOAT: {
      return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                    prev_num_elements);
    }
    case DbgDataType::DT_FLOAT64: {
      return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                     prev_num_elements);
    }
    case DbgDataType::DT_BOOL: {
      return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
                                                   prev_num_elements);
    }
    default:
      MS_LOG(INFO) << "Unsupported tensor type";
      // return a null pointer
      return std::unique_ptr<TensorSummary<int32_t>>{};
  }
}
  142. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  143. if (tensor == nullptr) {
  144. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  145. TensorStat empty_tensor_stat_data;
  146. return empty_tensor_stat_data;
  147. }
  148. std::unique_ptr<ITensorSummary> base_summary_ptr;
  149. void *previous_tensor_ptr = nullptr;
  150. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  151. if (base_summary_ptr == nullptr) {
  152. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  153. TensorStat empty_tensor_stat_data;
  154. return empty_tensor_stat_data;
  155. }
  156. base_summary_ptr->TensorStatistics(tensor->GetType());
  157. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  158. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  159. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  160. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  161. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  162. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  163. return tensor_stat_data;
  164. }
  165. #ifdef OFFLINE_DBG_MODE
// Return a raw pointer to the previous-iteration data of `tensor`, loading it from
// the dump files (offline mode). On success *prev_num_elements is set to the
// previous tensor's element count; otherwise nullptr is returned and
// *prev_num_elements is left untouched.
// NOTE(review): the returned pointer aliases the buffer owned by the TensorData
// produced by ReadDumpedTensor; its validity past this call presumably relies on
// the tensor cache retaining that entry (see the ":prev" eviction key used by the
// caller) — confirm.
const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
                                         uint32_t *prev_num_elements) {
  MS_EXCEPTION_IF_NULL(tensor);
  const void *previous_tensor_ptr = nullptr;
  std::shared_ptr<TensorData> tensor_prev;
  // a previous copy can only exist from iteration 1 onwards
  if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
    // read data in offline mode
    std::vector<std::string> file_paths;
    // async dumps must first be converted into readable files
    if (!is_sync_mode_) {
      ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                         std::vector<unsigned int>{tensor->GetDeviceId()},
                         std::vector<unsigned int>{tensor->GetIteration() - 1},
                         std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
    }
    std::vector<std::shared_ptr<TensorData>> result_list_prev;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration() - 1},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     file_paths, &result_list_prev);
    // NOTE(review): assumes ReadDumpedTensor always appends exactly one entry per
    // requested tensor (possibly with zero byte size) — confirm before relying on [0]
    tensor_prev = result_list_prev[0];
    if (!tensor_prev->GetByteSize()) {
      // empty read: treat as "no previous tensor available"
      tensor_prev.reset();
    } else {
      previous_tensor_ptr = tensor_prev->GetDataPtr();
      *prev_num_elements = tensor_prev->GetNumElements();
    }
  }
  return previous_tensor_ptr;
}
  196. #endif
  197. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  198. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  199. std::string *const qualified_tensor_name,
  200. std::vector<watchpoint_t> *const watchpoints_to_check) {
  201. if (tensor == nullptr) {
  202. MS_LOG(DEBUG) << "tensor is nullptr.";
  203. return;
  204. }
  205. const auto tensor_name = tensor->GetName();
  206. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  207. const auto tensor_device_id = tensor->GetDeviceId();
  208. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  209. for (auto w_table_item : watchpoint_table_) {
  210. auto wp = std::get<1>(w_table_item);
  211. // check ONLY init conditions on initial suspended state.
  212. // skip other conditions on initial suspended state
  213. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  214. continue;
  215. }
  216. // skip init condition if not init suspend
  217. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  218. continue;
  219. }
  220. // check change conditions only on step end.
  221. if (wp.change_condition() && !step_end) {
  222. continue;
  223. }
  224. // if recheck, ignore the cache results and reanalyze everything.
  225. // if not a recheck, check only unanalyzed tensors
  226. if (!recheck) {
  227. wp_lock_.lock();
  228. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  229. wp_lock_.unlock();
  230. if (wp_cache_hit) {
  231. continue;
  232. }
  233. }
  234. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  235. if (!found.empty()) {
  236. *qualified_tensor_name = found;
  237. watchpoints_to_check->push_back(w_table_item.second);
  238. #ifdef OFFLINE_DBG_MODE
  239. if (wp.change_condition()) {
  240. *previous_iter_tensor_needed = true;
  241. }
  242. #endif
  243. }
  244. }
  245. }
  246. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  247. const std::string &tensor_name) {
  248. // add analyzed tensor to cache
  249. if (!recheck) {
  250. wp_lock_.lock();
  251. (void)wp_id_cache_[tensor_name].insert(id);
  252. wp_lock_.unlock();
  253. }
  254. }
  255. void DebugServices::SetCheckWatchpointsResult(
  256. const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  257. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  258. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  259. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  260. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  261. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  262. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  263. const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  264. const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  265. (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  266. (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  267. (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  268. (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  269. (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  270. if (device_id != nullptr) {
  271. (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  272. }
  273. if (root_graph_id != nullptr) {
  274. (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  275. }
  276. (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  277. (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  278. (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
  279. }
  280. #ifdef OFFLINE_DBG_MODE
  281. void DebugServices::ProcessCheckpointsOutofMemory(
  282. const bool no_mem_to_read, const std::vector<watchpoint_t> watchpoints_to_check, int chunk_id,
  283. partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  284. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  285. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  286. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  287. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  288. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  289. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  290. const unsigned int device_id_val, const unsigned int root_graph_id_val,
  291. const std::vector<parameter_t> &parameter_list) {
  292. if (no_mem_to_read) {
  293. // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
  294. int32_t oversize_error_code = 8;
  295. for (auto &wp : watchpoints_to_check) {
  296. SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
  297. chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
  298. chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
  299. qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
  300. parameter_list, oversize_error_code);
  301. }
  302. }
  303. }
  304. #endif
// Worker body for one chunk [begin, end) of tensor_list: evaluates every applicable
// watchpoint against each tensor in the chunk and appends hits into slot `chunk_id`
// of the chunk_* output containers. Runs concurrently with other chunks (launched
// from CheckWatchpoints), and only ever writes to its own chunk_id slot, so no
// locking of the partitioned containers is needed.
// NOTE(review): op_overflows is not read in this body — overflow checks go through
// CheckOpOverflow instead; presumably kept for signature compatibility. Confirm.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  const std::vector<std::string> &op_overflows, const std::vector<std::string> &async_file_pool,
  partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id) {
  int list_size = tensor_list->size();
  // clamp the chunk upper bound: the last chunk may extend past the list
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // strip the ":slot" suffix to get the node name
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read);
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // tensor data could not be loaded; if it was an out-of-memory failure, report
      // the oversize error code for every candidate watchpoint, then move on
      ProcessCheckpointsOutofMemory(
        no_mem_to_read, watchpoints_to_check, chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
        chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id,
        device_id, root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name,
        tensor_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // offline: read the previous iteration's dump when a change condition needs it
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
#else
    // online: the tensor loader keeps the previous copy in memory
    std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
    if (prev_tensor_data) {
      previous_tensor_ptr = prev_tensor_data->GetDataPtr();
      prev_num_elements = prev_tensor_data->GetNumElements();
    }
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // a full summary pass is only needed when some condition other than IS_OVERFLOW
    // is being checked
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      // record both genuine hits and evaluation errors so the caller can surface them
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    // set the tensor into not-in-use status in tensor_loader.
    std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
                                    std::to_string(tensor->GetRootGraphId()) + ":" +
                                    std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
    AppendToCacheEvictQueue(key_name_in_cache);
    if (previous_tensor_ptr != nullptr) {
      AppendToCacheEvictQueue(key_name_in_cache + ":prev");
    }
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Entry point for watchpoint evaluation over a whole tensor list. Splits the list
// into up to 16 chunks, evaluates each chunk on its own std::async thread
// (CheckWatchpointsForTensor), then waits on and merges the per-chunk results into
// the flat output vectors via SortWatchpointsInfo. Holds lock_ for the whole call.
void DebugServices::CheckWatchpoints(
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  // nothing to do when no watchpoints are registered
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size <= 0) {
    return;
  }
  // default value for number of threads
  const int default_thread_num = 16;
  int max_thread_num = default_thread_num;
  // never spawn more workers than there are tensors
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // one slot per worker; each worker writes only to its own index, so these
  // containers need no locking
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    // spread the division remainder one extra tensor at a time over the first chunks
    if (remainder > 0) {
      end++;
      remainder--;
    }
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id));
    begin = end;
  }
  // waits on every future, then merges the chunk results (sorted) into the outputs
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
// Waits on the per-chunk watchpoint-check futures and merges each chunk's hit
// results into single, globally ordered output vectors.
// - tensor_future_vec: one future per worker chunk; this call blocks until all
//   of them have completed.
// - Online mode keeps the merged hits sorted by kernel execution order;
//   offline mode sorts by the dump-file time stamp instead (exactly one of the
//   two #ifdef branches is compiled, and it defines `position`).
// - name/slot/condition/watchpoint_id/parameters/error_codes (and optionally
//   device_id/root_graph_id, which may be nullptr) are parallel vectors: every
//   field of hit j in chunk i is inserted at the same `position`.
// - Each consumed chunk vector is swapped with an empty temporary to release
//   its capacity immediately; *tensor_list_byte_size accumulates the per-chunk
//   byte counts.
void DebugServices::SortWatchpointsInfo(
    std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
    std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
    std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
    std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
    std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
    partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
    partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
    partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
    std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
    partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
    std::vector<unsigned int> *const root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
#ifdef ONLINE_DBG_MODE
      // if the execution order is repeated, inserts the new one before the others with same execution order.
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // offline mode has no execution order; keep the merged hits sorted by dump time stamp instead.
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
#endif
      // insert every remaining field of this hit at the same index so all output vectors stay parallel.
      (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
      (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
      (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
      (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
      (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  }
}
  537. #ifdef OFFLINE_DBG_MODE
  538. void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
  539. std::string *const tensor_type, std::size_t *const size,
  540. std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
  541. bool *no_mem_to_read) {
  542. std::ifstream infile;
  543. std::string file_path = file_name;
  544. MS_LOG(INFO) << "Reading in file: " << file_path;
  545. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  546. if (!infile.is_open()) {
  547. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
  548. const int kMaxFilenameLength = 128;
  549. char err_info[kMaxFilenameLength];
  550. auto ret = strerror_r(errno, err_info, sizeof(err_info));
  551. if (ret != nullptr) {
  552. MS_LOG(ERROR) << " ErrInfo:" << ret;
  553. }
  554. return;
  555. }
  556. const int substr_len = 2;
  557. const int header_len_offset = 8;
  558. const int header_offset = 9;
  559. const int header_len_buffer_size = 2;
  560. const int type_offset = 10;
  561. // get header length
  562. (void)infile.seekg(0, std::ios::beg);
  563. auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  564. if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
  565. MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
  566. return;
  567. }
  568. uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  569. header_len_buffer.reset();
  570. // read in header
  571. (void)infile.seekg(0, std::ios::beg);
  572. auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  573. if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
  574. MS_LOG(ERROR) << "Failed to read header from " << file_path;
  575. return;
  576. }
  577. std::string header(header_buffer->data() + header_offset, header_len);
  578. header_buffer.reset();
  579. std::size_t type_i = header.find("descr") + type_offset;
  580. if (header.length() < type_i + substr_len) {
  581. MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
  582. return;
  583. }
  584. *tensor_type = header.substr(type_i, substr_len);
  585. std::size_t shape_i_open = header.find("(");
  586. std::size_t shape_i_close = header.find(")");
  587. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  588. std::string intermediate;
  589. std::stringstream check_shape(shape_str);
  590. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  591. while (getline(check_shape, intermediate, ',')) {
  592. shape->push_back(std::stoi(intermediate));
  593. }
  594. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  595. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  596. std::size_t data_size = data_len * word_size;
  597. if (!data_size) {
  598. return;
  599. }
  600. // Check memory available before loading tensor into host.
  601. bool has_enough_memory = true;
  602. if (tensor_loader_->EnableMemoryControl()) {
  603. has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  604. }
  605. if (!has_enough_memory) {
  606. MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
  607. *no_mem_to_read = true;
  608. } else {
  609. (void)infile.seekg(header_len + type_offset);
  610. *data_buffer = new std::vector<char>(data_size);
  611. if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  612. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  613. }
  614. *size = data_size;
  615. }
  616. }
// Converts device-format dump files to host (.npy) format.
// dir_to_files_map maps a dump directory to the device-format files found in
// it. Files that already have a converted counterpart in *result_list are
// skipped; the rest are handed in one batch to the python AsyncDumpConverter,
// after which ProcessConvertToHostFormat records the produced .npy paths in
// *result_list.
void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                                        std::vector<std::string> *const result_list) {
  std::string file_format = "npy";
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::vector<std::string> files_after_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &file_name : d.second) {
      bool already_converted = false;
      // Remove scope from the file_name for matching files converted by mindinsight tool.
      std::size_t found_first_dot = file_name.find(".");
      std::size_t found_last_underscore = file_name.find_last_of("_");
      std::string file_name_without_scope = file_name;
      if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
        // drop everything between the first '.' and the last '_' (the scope portion of the node name)
        file_name_without_scope =
          file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
      }
      // a file whose scope-less name already appears in the result list has been converted before
      for (std::string &file_found : *result_list) {
        if (file_found.find(file_name_without_scope) != std::string::npos) {
          already_converted = true;
          break;
        }
      }
      if (!already_converted) {
        (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
        (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
      }
    }
    MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
    if (!files_to_convert_in_dir.empty()) {
      // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
      // later task.
      try {
        auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
        auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
        (void)convert_obj.attr("convert_files")();
      } catch (pybind11::error_already_set &e) {
        MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
      }
      // scan the dump dir and record which of the expected converted files now exist
      ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
    }
  }
}
  660. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  661. const std::string &dump_key, std::vector<std::string> *const result_list,
  662. const std::string &file_format) {
  663. std::string real_dump_iter_dir = RealPath(dump_key);
  664. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  665. if (d_handle == nullptr) {
  666. MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
  667. return;
  668. }
  669. struct dirent *dir = nullptr;
  670. while ((dir = readdir(d_handle)) != nullptr) {
  671. if (dir->d_type == DT_REG) {
  672. std::string candidate = dir->d_name;
  673. for (const std::string &file_to_find : files_after_convert_in_dir) {
  674. std::string file_n = file_to_find;
  675. auto last_slash_pos = file_to_find.find_last_of("\\/");
  676. if (last_slash_pos != std::string::npos) {
  677. file_n = file_to_find.substr(last_slash_pos + 1);
  678. }
  679. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  680. // we found a converted file for this op
  681. std::string found_file = dump_key + "/" + candidate;
  682. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  683. result_list->push_back(found_file);
  684. }
  685. }
  686. }
  687. }
  688. }
  689. (void)closedir(d_handle);
  690. }
  691. std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  692. if (dump_style_name.empty()) {
  693. return "";
  694. }
  695. std::size_t last_scope_marker;
  696. std::string delim = "/";
  697. last_scope_marker = dump_style_name.rfind(delim);
  698. if (last_scope_marker == std::string::npos) {
  699. return dump_style_name;
  700. }
  701. return dump_style_name.substr(last_scope_marker + delim.size());
  702. }
  703. void ReplaceSrcFileName(std::string *dump_style_name) {
  704. if (dump_style_name == nullptr) {
  705. return;
  706. }
  707. const std::string strsrc = "/";
  708. std::string strdst = "_";
  709. std::string::size_type pos = 0;
  710. std::string::size_type srclen = strsrc.size();
  711. std::string::size_type dstlen = strdst.size();
  712. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  713. (void)dump_style_name->replace(pos, srclen, strdst);
  714. pos += dstlen;
  715. }
  716. }
// Offline-debug helper: for each requested tensor (backend_name[i] on
// device_id[i] / root_graph_id[i] at iteration[i]) locates its dump directory,
// queues matching device-format files for conversion via ProcessConvertList,
// then converts the whole batch with ConvertToHostFormat; converted .npy
// paths are appended to *result_list.
// NOTE(review): the `slot` parameter is unused in this function — the slot
// suffix is stripped from backend_name instead; confirm with callers.
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id,
                                       std::vector<std::string> *const result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name (backend names end in ":<slot>")
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    // (this opendir only validates existence; ProcessConvertList re-opens the dir for the actual scan)
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors.";
      return;
    }
    ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
    (void)closedir(d);
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
  744. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  745. const std::string &specific_dump_dir,
  746. std::vector<std::string> *const result_list) {
  747. std::string file_format = "npy";
  748. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  749. for (const auto &node : proto_dump) {
  750. std::string dump_name = std::get<1>(node);
  751. dump_name = dump_name.substr(0, dump_name.rfind("."));
  752. // search files in dir for the one that meets the filename prefix and read the file into memory
  753. std::string abspath = RealPath(specific_dump_dir);
  754. DIR *d = opendir(abspath.c_str());
  755. if (d == nullptr) {
  756. MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
  757. return;
  758. }
  759. ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
  760. (void)closedir(d);
  761. }
  762. ConvertToHostFormat(dir_to_files_map, result_list);
  763. }
  764. void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
  765. const std::string &specific_dump_dir,
  766. std::map<std::string, std::vector<std::string>> *dir_to_files_map,
  767. std::vector<std::string> *const result_list) {
  768. MS_EXCEPTION_IF_NULL(dir_to_files_map);
  769. DIR *d = opendir(specific_dump_dir.c_str());
  770. struct dirent *dir = nullptr;
  771. while ((dir = readdir(d)) != nullptr) {
  772. if (dir->d_type != DT_REG) {
  773. continue;
  774. }
  775. std::string file_name = dir->d_name;
  776. std::string file_name_w_o_perfix = file_name;
  777. auto type_pos = file_name.find('.');
  778. if (type_pos == std::string::npos || file_name.find(prefix_dump_file_name, type_pos + 1) == std::string::npos) {
  779. continue;
  780. }
  781. if (file_name.rfind(file_format) == std::string::npos) {
  782. // if file matches prefix and is in device format add to candidate files to convert.
  783. (*dir_to_files_map)[specific_dump_dir].push_back(file_name);
  784. } else {
  785. // otherwise, if file matches prefix and already has been converted to host format
  786. // add to result of converted files.
  787. std::string found_file = specific_dump_dir + "/" + file_name;
  788. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  789. result_list->push_back(found_file);
  790. }
  791. }
  792. }
  793. (void)closedir(d);
  794. }
// For async (offline) mode: creates placeholder TensorData entries (no data
// read yet) for every slot of every watched node whose converted file appears
// in async_file_pool under specific_dump_dir.
// proto_dump entries are (original node name, "dump_name.output/input")
// tuples; the slot index is parsed from the file name between the two dots
// that follow the output/input marker.
// NOTE(review): found_dot_start / found_dot_end are not checked against npos
// before the substr/stoul — this relies on the converted-file naming scheme
// always providing ".<slot>." after the marker; confirm the format guarantee.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    // collect every slot index for which a converted file of this node exists
    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      std::size_t found_dot_start = file_name.find(".", found_out);
      std::size_t found_dot_end = file_name.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(nullptr);  // placeholder: no bytes loaded yet
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_list->push_back(tensor_data);
    }
  }
}
// Builds a TensorData record from the given metadata (and optional raw
// buffer), registers it with the tensor loader when it actually carries data,
// and appends it to *result_list.
// NOTE(review): only the raw data pointer of `buffer` is stored; ownership of
// the heap vector appears to pass to the tensor cache via LoadNewTensor —
// confirm who frees it when data_size is 0 and LoadNewTensor is skipped.
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                    const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer,
                                    std::vector<std::shared_ptr<TensorData>> *const result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  tensor_data->SetIsOutput(is_output);
  if (buffer != nullptr) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(nullptr);  // metadata-only entry (tensor not found / not read)
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  tensor_data->SetTimeStamp(time_stamp);
  if (data_size) {
    (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  }
  // add to result_list
  result_list->push_back(tensor_data);
}
  867. void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
  868. std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
  869. std::string dump_style_name_part = *dump_style_kernel_name;
  870. dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
  871. std::string slot_str;
  872. if (is_output) {
  873. slot_str = ".output." + std::to_string(slot);
  874. } else {
  875. slot_str = ".input." + std::to_string(slot);
  876. }
  877. dump_style_name_part += slot_str;
  878. *prefix_dump_file_name = dump_style_name_part;
  879. *slot_string_to_check = slot_str;
  880. }
  881. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  882. // get file with the newest timestamp from the list.
  883. if (file_list.empty()) {
  884. return "";
  885. }
  886. std::sort(file_list.begin(), file_list.end());
  887. return file_list.back();
  888. }
  889. std::string GetTimeStampStr(std::string file_path) {
  890. // get the file_name from file_path.
  891. size_t pos = file_path.rfind("/");
  892. std::string file_name = file_path.substr(pos + 1);
  893. size_t first_dot = file_name.rfind(".");
  894. size_t second_dot = file_name.rfind(".", first_dot - 1);
  895. size_t third_dot = file_name.rfind(".", second_dot - 1);
  896. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  897. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  898. if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
  899. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  900. return time_stamp;
  901. }
  902. return "";
  903. }
// Reads the dumped data for each requested tensor. backend_name[i] carries a
// ":<slot>" suffix that is stripped before building the file-name prefix; the
// dump directory is derived from device_id[i], root_graph_id[i] and
// iteration[i]. Sync mode scans the directory directly; async mode matches
// against the pre-converted files in async_file_pool. Results are appended to
// *result_list; *no_mem_to_read is set if the host-side tensor cache runs out
// of room.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *const result_list,
                                     bool *no_mem_to_read) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    if (is_sync_mode_) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
                            no_mem_to_read);
    }
  }
}
// Common tail for the sync/async read paths. When `found`, picks the newest
// of the matched dump files, parses its npy contents and registers the tensor
// (keyed in the cache by name:device:graph:is_output:slot); otherwise adds a
// metadata-only placeholder entry so the caller still gets one result per
// request.
void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
                                           const std::string &backend_name, const unsigned int device_id,
                                           const unsigned int root_graph_id, const bool &is_output, size_t slot,
                                           bool *no_mem_to_read, unsigned int iteration,
                                           std::vector<std::shared_ptr<TensorData>> *result_list) {
  std::string time_stamp = "";
  std::string type_name = "";
  uint64_t data_size = 0;
  std::vector<int64_t> shape;
  std::vector<char> *buffer = nullptr;
  if (found) {
    // the lexicographically greatest path carries the newest timestamp
    std::string result_path = GetNewestFilePath(matched_paths);
    time_stamp = GetTimeStampStr(result_path);
    // cache key; note std::to_string(is_output) yields "1"/"0"
    std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
                                    std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
                                    std::to_string(slot);
    ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
                    type_name, shape, buffer, result_list);
  } else {
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
                    buffer, result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
  }
}
// Sync-mode read: scans specific_dump_dir for regular files whose stripped
// name (task/stream/timestamp removed) starts with prefix_dump_file_name,
// then loads the newest match via ReadFileAndAddToTensor. A missing directory
// or no match still produces a placeholder entry in *result_list.
void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                                         const std::string &backend_name, size_t slot, const unsigned int device_id,
                                         unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                                         std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  std::string abspath = RealPath(specific_dump_dir);
  DIR *d = opendir(abspath.c_str());
  bool found_file = false;
  std::vector<std::string> matched_paths;
  if (d == nullptr) {
    MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      if (dir->d_type == DT_REG) {
        std::string file_name = dir->d_name;
        std::string stripped_file_name = GetStrippedFilename(file_name);
        if (stripped_file_name.empty()) {
          continue;
        }
        // rfind(prefix, 0) == 0 <=> the stripped name starts with the prefix
        std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
        if (found != 0) {
          continue;
        }
        std::string full_path = specific_dump_dir + "/" + file_name;
        matched_paths.push_back(full_path);
        found_file = true;
      }
    }
    (void)closedir(d);
  }
  ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
                         no_mem_to_read, iteration, result_list);
}
  991. void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  992. const std::string &slot_string_to_check, const std::string &backend_name,
  993. size_t slot, unsigned int device_id, unsigned int iteration,
  994. unsigned int root_graph_id, const bool &is_output,
  995. const std::vector<std::string> &async_file_pool,
  996. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  997. bool found = false;
  998. std::vector<std::string> matched_paths;
  999. // if async mode
  1000. for (const std::string &file_path : async_file_pool) {
  1001. if (file_path.find(specific_dump_dir) != std::string::npos &&
  1002. file_path.find(prefix_dump_to_check) != std::string::npos &&
  1003. file_path.find(slot_string_to_check) != std::string::npos) {
  1004. matched_paths.push_back(file_path);
  1005. found = true;
  1006. }
  1007. }
  1008. ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
  1009. iteration, result_list);
  1010. }
// Removes the task_id, stream_id and timestamp fields from a sync dump file
// name so it can be compared against a node-name prefix. The dot positions
// are counted from the back of the name to tolerate dots inside the node name
// itself; the variable names (seventh/fifth/second dot) count separators from
// the front and assume the fixed dump naming scheme — presumably
// "optype.opname.task.stream.timestamp.output.slot.format.npy"; TODO confirm.
// Returns "" when the name does not contain enough separators.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // two dots back from the end
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  // two more dots back: start of the task/stream/timestamp group
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  if (second_dot == std::string::npos || second_dot <= first_dot) {
    return std::string();
  }
  // keep the node name (between first and second dot) and the trailing
  // output/input+slot fields (between fifth and seventh dot)
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
  1033. std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  1034. unsigned int iteration, std::vector<std::string> *const async_file_pool) {
  1035. // get a list of nodes and the devices they are on to monitor
  1036. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1037. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
  1038. for (auto w_table_item : watchpoint_table_) {
  1039. auto wp = std::get<1>(w_table_item);
  1040. unsigned int index = 0;
  1041. for (auto check_node : wp.check_node_list) {
  1042. std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
  1043. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1044. for (auto device : devices) {
  1045. for (auto graph : graphs) {
  1046. std::tuple<uint32_t, uint32_t> key(device, graph);
  1047. device_and_graph_to_nodes[key].push_back(check_node);
  1048. }
  1049. }
  1050. index++;
  1051. }
  1052. }
  1053. // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  1054. // as they are found
  1055. for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
  1056. std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
  1057. uint32_t device_id = std::get<0>(device_and_graph);
  1058. uint32_t root_graph_id = std::get<1>(device_and_graph);
  1059. std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
  1060. std::vector<std::tuple<std::string, std::string>> proto_to_dump;
  1061. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1062. std::to_string(root_graph_id) + "/" + IterationString(iteration);
  1063. // convert node names to dump style
  1064. for (auto node : wp_nodes) {
  1065. std::string orig_name = std::get<0>(node);
  1066. // Remove the scope from the fully qualified name to compare for both sync and async case.
  1067. std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
  1068. bool node_is_out = std::get<1>(node);
  1069. if (node_is_out) {
  1070. dump_style_name += ".output";
  1071. } else {
  1072. dump_style_name += ".input";
  1073. }
  1074. if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
  1075. std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
  1076. proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
  1077. }
  1078. }
  1079. if (is_sync_mode_) {
  1080. // search files in dir for the one that meets the filename prefix and read the file into memory
  1081. std::string abspath = RealPath(specific_dump_dir);
  1082. ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
  1083. &tensor_list);
  1084. } else {
  1085. // convert all files in proto_to_dump to npy and add to pool of async file names
  1086. ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
  1087. GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
  1088. &tensor_list);
  1089. }
  1090. }
  1091. return tensor_list;
  1092. }
  1093. void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
  1094. const std::string &abspath, const std::string &specific_dump_dir,
  1095. unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
  1096. std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  1097. DIR *d = opendir(abspath.c_str());
  1098. if (d == nullptr) {
  1099. MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors.";
  1100. } else {
  1101. struct dirent *dir = nullptr;
  1102. while ((dir = readdir(d)) != nullptr) {
  1103. if (dir->d_type == DT_REG) {
  1104. std::string file_name = dir->d_name;
  1105. for (auto &node : proto_to_dump) {
  1106. std::string dump_name = std::get<1>(node);
  1107. std::string stripped_file_name = GetStrippedFilename(file_name);
  1108. if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
  1109. continue;
  1110. }
  1111. std::size_t found = stripped_file_name.rfind(dump_name, 0);
  1112. if (found == 0) {
  1113. size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
  1114. std::vector<int64_t> shape;
  1115. std::string orig_name = std::get<0>(node);
  1116. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  1117. bool output_flag = (output_str == "output");
  1118. AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
  1119. nullptr, tensor_list);
  1120. break;
  1121. }
  1122. }
  1123. }
  1124. }
  1125. (void)closedir(d);
  1126. }
  1127. }
  1128. std::string DebugServices::IterationString(unsigned int iteration) {
  1129. std::string iteration_string;
  1130. bool init_dbg_suspend = (iteration == UINT_MAX);
  1131. if (init_dbg_suspend) {
  1132. iteration_string = "init";
  1133. } else {
  1134. iteration_string = std::to_string(iteration);
  1135. }
  1136. return iteration_string;
  1137. }
  1138. #endif
  1139. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1140. std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1141. std::vector<unsigned int> *const dtype,
  1142. std::vector<std::vector<int64_t>> *const shape) {
  1143. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1144. tensor_loader_->SearchTensors(name, &result_list);
  1145. for (auto result : result_list) {
  1146. if (std::get<1>(result) == nullptr) {
  1147. continue;
  1148. }
  1149. (void)ret_name->emplace_back(std::get<0>(result));
  1150. (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
  1151. (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
  1152. (void)dtype->emplace_back(std::get<1>(result)->GetType());
  1153. (void)shape->emplace_back(std::get<1>(result)->GetShape());
  1154. }
  1155. }
  1156. void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
  1157. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  1158. if (result_list == nullptr) {
  1159. MS_LOG(DEBUG) << "result_list is nullptr.";
  1160. return;
  1161. }
  1162. tensor_loader_->SearchTensors(name, result_list);
  1163. }
  1164. #ifdef ONLINE_DBG_MODE
  1165. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1166. bool ret = false;
  1167. for (auto w_table_item : watchpoint_table_) {
  1168. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1169. for (auto check_node : check_node_list) {
  1170. std::string w_name = std::get<0>(check_node);
  1171. bool w_type = std::get<1>(check_node);
  1172. if ((w_type == true &&
  1173. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1174. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1175. ret = true;
  1176. return ret;
  1177. }
  1178. }
  1179. }
  1180. return ret;
  1181. }
  1182. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1183. if (kernel != nullptr && w_name.length() > 0) {
  1184. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  1185. for (size_t j = 0; j < input_size; ++j) {
  1186. auto input_kernel = kernel->input(j + 1);
  1187. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1188. auto found = w_name.find_last_of('/');
  1189. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1190. return true;
  1191. }
  1192. return false;
  1193. } else {
  1194. return false;
  1195. }
  1196. }
  1197. #endif
  1198. std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
  1199. void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1200. #ifdef ONLINE_DBG_MODE
  1201. bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
  1202. const std::string &host_fmt, const std::vector<int64_t> &host_shape,
  1203. TypeId host_type, TypeId device_type, const std::string &addr_format,
  1204. size_t slot) const {
  1205. return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
  1206. device_type, addr_format, slot);
  1207. }
  1208. #endif
  1209. bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  1210. return tensor_loader_->LoadNewTensor(tensor, keep_prev);
  1211. }
  1212. void DebugServices::ResetLoadedTensors() {
  1213. wp_id_cache_.clear();
  1214. MS_LOG(INFO) << "Resetting loaded tensors";
  1215. tensor_loader_->MoveParametersCurrentToPrev();
  1216. tensor_loader_->EmptyCurrentTensor();
  1217. // will move parameters from previous to current map
  1218. tensor_loader_->SwapCurrentPrev();
  1219. overflow_ops_.clear();
  1220. }
  1221. #ifdef ONLINE_DBG_MODE
  1222. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1223. MS_EXCEPTION_IF_NULL(kernel);
  1224. std::vector<std::shared_ptr<TensorData>> result;
  1225. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1226. auto kernel_name = GetKernelNodeName(kernel);
  1227. for (size_t j = 0; j < output_size; ++j) {
  1228. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1229. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1230. if (tensor != nullptr) {
  1231. result.push_back(tensor);
  1232. }
  1233. }
  1234. return result;
  1235. }
  1236. #endif
  1237. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1238. unsigned int iteration) {
  1239. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1240. std::vector<std::string> op_names;
  1241. std::string overflow_bin_path;
  1242. #ifdef ONLINE_DBG_MODE
  1243. if (DumpJsonParser::GetInstance().path().empty()) {
  1244. // Dump config is not set.
  1245. return false;
  1246. }
  1247. auto debugger = Debugger::GetInstance();
  1248. overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->root_graph_id());
  1249. auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
  1250. if (!realpath.has_value()) {
  1251. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1252. return false;
  1253. }
  1254. overflow_bin_path = realpath.value() + '/';
  1255. #else
  1256. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1257. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1258. overflow_bin_path = RealPath(overflow_bin_path);
  1259. #endif
  1260. overflow_wp_lock_.lock();
  1261. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1262. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1263. if (found_overflows != overflow_ops_.end()) {
  1264. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1265. op_names = overflow_ops_[overflow_bin_path];
  1266. } else {
  1267. std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  1268. std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  1269. const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  1270. MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  1271. std::string abspath = RealPath(overflow_bin_path);
  1272. DIR *d = opendir(abspath.c_str());
  1273. if (d == nullptr) {
  1274. MS_LOG(ERROR) << "OverFlow bin directory does not exist!";
  1275. } else {
  1276. struct dirent *dir = nullptr;
  1277. while ((dir = readdir(d)) != nullptr) {
  1278. if (dir->d_type == DT_REG) {
  1279. // form fully qualified filename
  1280. std::string file_path = overflow_bin_path;
  1281. std::string file_name = dir->d_name;
  1282. (void)file_path.append(file_name);
  1283. // attempt to read the file
  1284. std::ifstream infile;
  1285. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  1286. if (!infile.is_open()) {
  1287. MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
  1288. continue;
  1289. }
  1290. std::string node_name;
  1291. uint64_t task_id = 0;
  1292. uint64_t stream_id = 0;
  1293. // detect overflow bin file
  1294. if (file_name.rfind(overflow_file_prefix, 0) == 0) {
  1295. // start of op overflow data in bin file
  1296. const uint32_t offset = 321;
  1297. (void)infile.seekg(offset, std::ios::beg);
  1298. std::vector<char> buffer;
  1299. // size of op overflow info section
  1300. const size_t buf_size = 256;
  1301. buffer.resize(buf_size);
  1302. (void)infile.read(buffer.data(), buf_size);
  1303. if (infile.gcount() != buf_size) {
  1304. MS_LOG(ERROR) << "The file: " << file_path << "may be damaged!";
  1305. continue;
  1306. }
  1307. const uint8_t stream_id_offset = 16;
  1308. const uint8_t task_id_offset = 24;
  1309. // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
  1310. // byte values currently.
  1311. stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
  1312. task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
  1313. MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
  1314. << ".";
  1315. task_stream_hit.push_back(std::make_pair(task_id, stream_id));
  1316. } else {
  1317. // regular bin file
  1318. bool success_parse = GetAttrsFromAsyncFilename(file_name, &node_name, &task_id, &stream_id);
  1319. if (success_parse) {
  1320. task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
  1321. }
  1322. }
  1323. infile.close();
  1324. }
  1325. }
  1326. (void)closedir(d);
  1327. }
  1328. // find the op_names with an overflow hit
  1329. for (auto &task_stream : task_stream_hit) {
  1330. auto op_name = task_stream_to_opname[task_stream];
  1331. if (!op_name.empty()) {
  1332. MS_LOG(INFO) << "Operation overflow detected in " << op_name;
  1333. op_names.push_back(op_name);
  1334. }
  1335. }
  1336. overflow_ops_[overflow_bin_path] = op_names;
  1337. }
  1338. overflow_wp_lock_.unlock();
  1339. // determine if overflow wp has been triggered for node_name_to_find
  1340. if (find(op_names.begin(), op_names.end(), node_name_to_find) != op_names.end()) {
  1341. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1342. return true;
  1343. }
  1344. return false;
  1345. }
  1346. bool DebugServices::GetAttrsFromAsyncFilename(const std::string &file_name, std::string *const node_name,
  1347. uint64_t *task_id, uint64_t *stream_id) {
  1348. // get the node_name, task_id, and stream_id from async dump filename
  1349. // node_type.node_name.task_id.stram_id.timestamp
  1350. // WARNING: node_name may have dots in it
  1351. size_t fourth_dot = file_name.rfind(".");
  1352. size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  1353. size_t second_dot = file_name.rfind(".", third_dot - 1);
  1354. size_t first_dot = file_name.find(".");
  1355. // check if dots were found
  1356. if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
  1357. fourth_dot == std::string::npos) {
  1358. return false;
  1359. }
  1360. // check if its not an async bin file
  1361. if (file_name.substr(fourth_dot) == ".npy") {
  1362. return false;
  1363. }
  1364. // get node_name
  1365. if (first_dot < second_dot) {
  1366. *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  1367. } else {
  1368. MS_LOG(ERROR) << "Async filename parse error to get node_name.";
  1369. return false;
  1370. }
  1371. // get task id
  1372. if (second_dot < third_dot) {
  1373. std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
  1374. try {
  1375. *task_id = std::stoull(extracted_task_id);
  1376. } catch (std::invalid_argument &e) {
  1377. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
  1378. return false;
  1379. } catch (std::out_of_range &e) {
  1380. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
  1381. return false;
  1382. }
  1383. } else {
  1384. MS_LOG(ERROR) << "Async filename parse error to get task_id.";
  1385. return false;
  1386. }
  1387. // get stream id
  1388. if (third_dot < fourth_dot) {
  1389. std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
  1390. try {
  1391. *stream_id = std::stoull(extracted_stream_id);
  1392. } catch (std::invalid_argument &e) {
  1393. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
  1394. return false;
  1395. } catch (std::out_of_range &e) {
  1396. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
  1397. return false;
  1398. }
  1399. } else {
  1400. MS_LOG(ERROR) << "Async filename parse error to get stream_id.";
  1401. return false;
  1402. }
  1403. return true;
  1404. }
  1405. std::string DebugServices::RealPath(const std::string &input_path) {
  1406. if (input_path.length() >= PATH_MAX) {
  1407. MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  1408. }
  1409. size_t path_split_pos = input_path.find_last_of('/');
  1410. // get real path
  1411. char real_path[PATH_MAX] = {0};
  1412. // input_path is dir + file_name
  1413. if (path_split_pos != std::string::npos) {
  1414. std::string prefix_path = input_path.substr(0, path_split_pos);
  1415. std::string file_name = input_path.substr(path_split_pos);
  1416. if (file_name.length() > NAME_MAX) {
  1417. MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
  1418. }
  1419. if (realpath(prefix_path.c_str(), real_path) == nullptr) {
  1420. MS_LOG(ERROR) << "The dir " << prefix_path << " does not exist.";
  1421. return "";
  1422. }
  1423. return std::string(real_path) + file_name;
  1424. }
  1425. // input_path is only file_name
  1426. if (input_path.length() > NAME_MAX) {
  1427. MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  1428. }
  1429. if (realpath(input_path.c_str(), real_path) == nullptr) {
  1430. MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  1431. }
  1432. return std::string(real_path);
  1433. }
  1434. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  1435. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  1436. }
  1437. bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  1438. return tensor_loader_->TensorExistsInCurrent(tensor_name);
  1439. }
  1440. void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  1441. tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
  1442. }
  1443. void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  1444. if (tensor_loader_->EnableMemoryControl()) {
  1445. tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  1446. }
  1447. }
  1448. void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
  1449. std::string DebugServices::GetNetName() { return net_name_; }
  1450. void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
  1451. std::string DebugServices::GetDumpDir() { return dump_dir_; }
  1452. void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
  1453. bool DebugServices::GetSyncMode() { return is_sync_mode_; }
  1454. void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  1455. #ifdef ONLINE_DBG_MODE
  1456. } // namespace mindspore
  1457. #endif