You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 70 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include "pybind11/embed.h"
  29. #include "pybind11/stl.h"
  30. #ifdef ONLINE_DBG_MODE
  31. #include "debug/common.h"
  32. #include "debug/debugger/debugger.h"
  33. #include "debug/anf_ir_utils.h"
  34. #include "backend/session/anf_runtime_algorithm.h"
  35. #endif
  36. #include "debug/debugger/tensor_summary.h"
  37. #ifdef ONLINE_DBG_MODE
  38. namespace mindspore {
  39. #endif
  40. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
  41. DebugServices::DebugServices(const DebugServices &other) {
  42. wp_id_cache_ = other.wp_id_cache_;
  43. net_name_ = other.net_name_;
  44. dump_dir_ = other.dump_dir_;
  45. is_sync_mode_ = other.is_sync_mode_;
  46. tensor_loader_ = other.tensor_loader_;
  47. watchpoint_table_ = other.watchpoint_table_;
  48. }
  49. DebugServices &DebugServices::operator=(const DebugServices &other) {
  50. if (this != &other) {
  51. tensor_loader_ = other.tensor_loader_;
  52. watchpoint_table_ = other.watchpoint_table_;
  53. }
  54. return *this;
  55. }
  56. void DebugServices::AddWatchpoint(
  57. unsigned int id, unsigned int watch_condition, float parameter,
  58. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  59. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  60. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  61. std::lock_guard<std::mutex> lg(lock_);
  62. watchpoint_t watchpoint_item;
  63. watchpoint_item.id = id;
  64. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  65. watchpoint_item.condition.parameter = parameter;
  66. watchpoint_item.check_node_list = check_node_list;
  67. if (check_node_device_list != nullptr) {
  68. watchpoint_item.check_node_device_list = *check_node_device_list;
  69. }
  70. if (check_node_graph_list != nullptr) {
  71. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  72. }
  73. watchpoint_item.parameter_list = parameter_list;
  74. watchpoint_table_[id] = watchpoint_item;
  75. }
  76. void DebugServices::RemoveWatchpoint(unsigned int id) {
  77. std::lock_guard<std::mutex> lg(lock_);
  78. watchpoint_table_.erase(id);
  79. }
  80. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  81. void *const previous_tensor_ptr, uint32_t num_elements,
  82. uint32_t prev_num_elements, int tensor_dtype) {
  83. switch (tensor_dtype) {
  84. case DbgDataType::DT_UINT8: {
  85. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  86. prev_num_elements);
  87. }
  88. case DbgDataType::DT_INT8: {
  89. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  90. prev_num_elements);
  91. }
  92. case DbgDataType::DT_UINT16: {
  93. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  94. prev_num_elements);
  95. }
  96. case DbgDataType::DT_INT16: {
  97. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  98. prev_num_elements);
  99. }
  100. case DbgDataType::DT_UINT32: {
  101. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  102. prev_num_elements);
  103. }
  104. case DbgDataType::DT_INT32:
  105. case DbgDataType::DT_BASE_INT: {
  106. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  107. prev_num_elements);
  108. }
  109. case DbgDataType::DT_UINT64: {
  110. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  111. prev_num_elements);
  112. }
  113. case DbgDataType::DT_INT64: {
  114. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  115. prev_num_elements);
  116. }
  117. case DbgDataType::DT_FLOAT16: {
  118. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  119. prev_num_elements);
  120. }
  121. case DbgDataType::DT_FLOAT32:
  122. case DbgDataType::DT_BASE_FLOAT: {
  123. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  124. prev_num_elements);
  125. }
  126. case DbgDataType::DT_FLOAT64: {
  127. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  128. prev_num_elements);
  129. }
  130. case DbgDataType::DT_BOOL: {
  131. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  132. prev_num_elements);
  133. }
  134. default:
  135. MS_LOG(INFO) << "Unsupported tensor type";
  136. // return a null pointer
  137. return std::unique_ptr<TensorSummary<int32_t>>{};
  138. }
  139. }
  140. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  141. if (tensor == nullptr) {
  142. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  143. TensorStat empty_tensor_stat_data;
  144. return empty_tensor_stat_data;
  145. }
  146. std::unique_ptr<ITensorSummary> base_summary_ptr;
  147. void *previous_tensor_ptr = nullptr;
  148. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  149. if (base_summary_ptr == nullptr) {
  150. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  151. TensorStat empty_tensor_stat_data;
  152. return empty_tensor_stat_data;
  153. }
  154. base_summary_ptr->TensorStatistics(tensor->GetType());
  155. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  156. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  157. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  158. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  159. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  160. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  161. return tensor_stat_data;
  162. }
  163. #ifdef OFFLINE_DBG_MODE
  164. void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
  165. uint32_t *prev_num_elements) {
  166. void *previous_tensor_ptr = nullptr;
  167. std::shared_ptr<TensorData> tensor_prev;
  168. if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
  169. // read data in offline mode
  170. std::vector<std::string> file_paths;
  171. if (!is_sync_mode_) {
  172. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  173. std::vector<unsigned int>{tensor->GetDeviceId()},
  174. std::vector<unsigned int>{tensor->GetIteration() - 1},
  175. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  176. }
  177. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  178. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  179. std::vector<unsigned int>{tensor->GetDeviceId()},
  180. std::vector<unsigned int>{tensor->GetIteration() - 1},
  181. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  182. file_paths, &result_list_prev);
  183. tensor_prev = result_list_prev[0];
  184. if (!tensor_prev->GetByteSize()) {
  185. tensor_prev.reset();
  186. } else {
  187. previous_tensor_ptr = tensor_prev->GetDataPtr();
  188. *prev_num_elements = tensor_prev->GetNumElements();
  189. }
  190. }
  191. return previous_tensor_ptr;
  192. }
  193. #endif
  194. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  195. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  196. std::string *const qualified_tensor_name,
  197. std::vector<watchpoint_t> *const watchpoints_to_check) {
  198. if (tensor == nullptr) {
  199. MS_LOG(DEBUG) << "tensor is nullptr.";
  200. return;
  201. }
  202. const auto tensor_name = tensor->GetName();
  203. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  204. const auto tensor_device_id = tensor->GetDeviceId();
  205. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  206. for (auto w_table_item : watchpoint_table_) {
  207. auto wp = std::get<1>(w_table_item);
  208. // check ONLY init conditions on initial suspended state.
  209. // skip other conditions on initial suspended state
  210. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  211. continue;
  212. }
  213. // skip init condition if not init suspend
  214. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  215. continue;
  216. }
  217. // check change conditions only on step end.
  218. if (wp.change_condition() && !step_end) {
  219. continue;
  220. }
  221. // if recheck, ignore the cache results and reanalyze everything.
  222. // if not a recheck, check only unanalyzed tensors
  223. if (!recheck) {
  224. wp_lock_.lock();
  225. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  226. wp_lock_.unlock();
  227. if (wp_cache_hit) {
  228. continue;
  229. }
  230. }
  231. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  232. if (!found.empty()) {
  233. *qualified_tensor_name = found;
  234. watchpoints_to_check->push_back(w_table_item.second);
  235. #ifdef OFFLINE_DBG_MODE
  236. if (wp.change_condition()) {
  237. *previous_iter_tensor_needed = true;
  238. }
  239. #endif
  240. }
  241. }
  242. }
  243. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  244. const std::string &tensor_name) {
  245. // add analyzed tensor to cache
  246. if (!recheck) {
  247. wp_lock_.lock();
  248. wp_id_cache_[tensor_name].insert(id);
  249. wp_lock_.unlock();
  250. }
  251. }
  252. void DebugServices::SetCheckWatchpointsResult(
  253. const int chunk_id, partitioned_names *chunk_names, partitioned_names *chunk_slots,
  254. partitioned_numbers *chunk_conditions, partitioned_id *chunk_watchpoint_id, partitioned_parameters *chunk_parameters,
  255. partitioned_error_code *chunk_error_codes, partitioned_numbers *chunk_exec_orders,
  256. partitioned_names *chunk_time_stamp, partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
  257. std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id, const int exec_order,
  258. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  259. const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  260. const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  261. (*chunk_exec_orders)[chunk_id].push_back(exec_order);
  262. (*chunk_names)[chunk_id].push_back(qualified_tensor_name);
  263. (*chunk_slots)[chunk_id].push_back(tensor_slot);
  264. (*chunk_conditions)[chunk_id].push_back(wp.condition.type);
  265. (*chunk_watchpoint_id)[chunk_id].push_back(wp.id);
  266. if (device_id != nullptr) {
  267. (*chunk_device_id)[chunk_id].push_back(device_id_val);
  268. }
  269. if (root_graph_id != nullptr) {
  270. (*chunk_root_graph_id)[chunk_id].push_back(root_graph_id_val);
  271. }
  272. (*chunk_parameters)[chunk_id].push_back(parameter_list);
  273. (*chunk_error_codes)[chunk_id].push_back(error_code);
  274. (*chunk_time_stamp)[chunk_id].push_back(time_stamp);
  275. }
// Worker body for CheckWatchpoints: scans tensor_list entries [begin, end) and
// records every watchpoint hit into the chunk_* vectors at index chunk_id.
// Each std::async task owns a distinct chunk_id, so the per-chunk vectors are
// written without locking.
// NOTE(review): op_overflows is accepted but never referenced in this body;
// overflow checks go through CheckOpOverflow() instead — confirm the parameter
// is still needed.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *chunk_names, partitioned_names *chunk_slots, partitioned_numbers *chunk_conditions,
  partitioned_id *const chunk_watchpoint_id, partitioned_parameters *chunk_parameters,
  partitioned_error_code *chunk_error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, partitioned_numbers *chunk_exec_orders,
  std::vector<std::shared_ptr<TensorData>> *tensor_list, int begin, int end, int chunk_id, const bool init_dbg_suspend,
  const bool step_end, const bool recheck, partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
  std::vector<uint64_t> *chunk_tensor_byte_size, partitioned_names *chunk_time_stamp,
  std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id) {
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // Tensor names carry a ":<slot>" suffix; watchpoints match the node part.
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    // Add do nothing line in case offline debug is off, prevent unused var warning
    (void)previous_iter_tensor_needed;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // No watchpoint is set on the current tensor: nothing to analyze.
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // Offline mode: the tensor data lives in dump files; read it in now.
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read);
    // NOTE(review): assumes ReadDumpedTensor always pushes at least one entry
    // (possibly with zero byte size) — confirm, otherwise this indexing is UB.
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      if (no_mem_to_read) {
        // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
        int32_t oversize_error_code = 8;
        // Report the out-of-memory failure for every watchpoint on this tensor.
        for (auto &wp : watchpoints_to_check) {
          SetCheckWatchpointsResult(
            chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
            chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
            root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
            tensor->GetDeviceId(), tensor->GetRootGraphId(), std::vector<parameter_t>(), oversize_error_code);
        }
      }
      tensor.reset();
      continue;
    }
#endif
    // No elements to analyze.
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // Previous-iteration data is fetched only when a change condition needs it.
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
#else
    // Online mode: the previous copy, if any, is already in the tensor loader.
    std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
    if (prev_tensor_data) {
      previous_tensor_ptr = prev_tensor_data->GetDataPtr();
      prev_num_elements = prev_tensor_data->GetNumElements();
    }
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // The data-scanning summary pass is skipped when the only watchpoint is an
    // overflow check, which is answered by CheckOpOverflow instead.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      // Mark the (tensor, watchpoint) pair analyzed so a plain re-run skips it.
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    // set the tensor into not-in-use status in tensor_loader.
    std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
                                    std::to_string(tensor->GetRootGraphId()) + ":" +
                                    std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
    AppendToCacheEvictQueue(key_name_in_cache);
    if (previous_tensor_ptr != nullptr) {
      AppendToCacheEvictQueue(key_name_in_cache + ":prev");
    }
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Checks every tensor in tensor_list against the registered watchpoints and
// returns the hits through the parallel output vectors (name, slot, condition,
// watchpoint_id, parameters, error_codes, and optionally device_id /
// root_graph_id). The list is split into up to 16 contiguous chunks, each
// processed by its own std::async task writing into a private result slot;
// SortWatchpointsInfo then waits for the tasks and merges the chunks.
void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
                                     std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
                                     std::vector<std::vector<parameter_t>> *const parameters,
                                     std::vector<int32_t> *const error_codes,
                                     const std::vector<std::string> &op_overflows,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *tensor_list, const bool init_dbg_suspend,
                                     const bool step_end, const bool recheck, std::vector<unsigned int> *device_id,
                                     std::vector<unsigned int> *root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  // Nothing to check without watchpoints.
  if (watchpoint_table_.empty()) return;
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size == 0) return;
  // default value for number of threads
  const int default_thread_num = 16;
  int max_thread_num = default_thread_num;
  // Never spawn more tasks than there are tensors to check.
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Even split; the first `remainder` chunks receive one extra element.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // One result slot per chunk, so workers never contend on shared vectors.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    // Launch one asynchronous worker per chunk ([begin, end), chunk id i).
    tensor_future_vec.push_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id));
    begin = end;
  }
  // Waits for all workers and merges the chunk results into the outputs.
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  // pow(10.0, 6.0) == 1e6: byte count reported in MB.
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
// Merges the per-chunk results produced by the CheckWatchpointsForTensor tasks
// into the caller's flat output vectors, ordered by execution order (online
// mode) or by dump-file time stamp (offline mode). Consumed per-chunk vectors
// are swapped with empties to release their memory early.
void DebugServices::SortWatchpointsInfo(
  std::vector<std::future<void>> *tensor_future_vec, std::vector<int> *exec_order,
  std::vector<std::string> *time_stamps, uint64_t *tensor_list_byte_size, std::vector<std::string> *name,
  std::vector<std::string> *slot, std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
  std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_codes, partitioned_names *chunk_names,
  partitioned_names *chunk_slots, partitioned_numbers *chunk_conditions, partitioned_id *chunk_watchpoint_id,
  partitioned_parameters *chunk_parameters, partitioned_error_code *chunk_error_codes,
  partitioned_numbers *chunk_exec_orders, partitioned_names *chunk_time_stamp,
  std::vector<uint64_t> *chunk_tensor_byte_size, partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
  std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    // Block until worker i has finished filling its chunk; get() rethrows any
    // exception raised inside the task.
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
#ifdef ONLINE_DBG_MODE
      // If the execution order is repeated, insert the new one before the
      // others with the same execution order (lower_bound).
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      exec_order->insert(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // Offline mode orders hits by time stamp instead of execution order.
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      time_stamps->insert(iter, (*chunk_time_stamp)[i][j]);
#endif
      // `position` is declared by exactly one of the two #ifdef blocks above
      // (one of the modes is always defined for this file); every output
      // vector is kept aligned by inserting at the same index.
      name->insert(name->begin() + position, (*chunk_names)[i][j]);
      slot->insert(slot->begin() + position, (*chunk_slots)[i][j]);
      condition->insert(condition->begin() + position, (*chunk_conditions)[i][j]);
      watchpoint_id->insert(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        device_id->insert(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        root_graph_id->insert(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      parameters->insert(parameters->begin() + position, (*chunk_parameters)[i][j]);
      error_codes->insert(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  }
}
  507. #ifdef OFFLINE_DBG_MODE
  508. void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
  509. std::string *tensor_type, std::size_t *size, std::vector<int64_t> *shape,
  510. std::vector<char> **data_buffer, bool *no_mem_to_read) {
  511. std::ifstream infile;
  512. std::string file_path = file_name;
  513. MS_LOG(INFO) << "Reading in file: " << file_path;
  514. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  515. if (!infile.is_open()) {
  516. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno
  517. << " ErrInfo:" << strerror(errno);
  518. return;
  519. }
  520. const int substr_len = 2;
  521. const int header_len_offset = 8;
  522. const int header_offset = 9;
  523. const int type_offset = 10;
  524. // get header length
  525. infile.seekg(0, std::ios::beg);
  526. auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + 2);
  527. if (!infile.read(header_len_buffer->data(), header_len_offset + 2)) {
  528. MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
  529. return;
  530. }
  531. uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  532. header_len_buffer.reset();
  533. // read in header
  534. infile.seekg(0, std::ios::beg);
  535. auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  536. if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
  537. MS_LOG(ERROR) << "Failed to read header from " << file_path;
  538. return;
  539. }
  540. std::string header(header_buffer->data() + header_offset, header_len);
  541. header_buffer.reset();
  542. std::size_t type_i = header.find("descr") + type_offset;
  543. if (header.length() < type_i + substr_len) {
  544. MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
  545. return;
  546. }
  547. *tensor_type = header.substr(type_i, substr_len);
  548. std::size_t shape_i_open = header.find("(");
  549. std::size_t shape_i_close = header.find(")");
  550. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  551. std::string intermediate;
  552. std::stringstream check_shape(shape_str);
  553. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  554. while (getline(check_shape, intermediate, ',')) {
  555. shape->push_back(std::stoi(intermediate));
  556. }
  557. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  558. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  559. std::size_t data_size = data_len * word_size;
  560. // Check memory available before loading tensor into host.
  561. bool has_enough_memory = true;
  562. if (tensor_loader_->EnableMemoryControl()) {
  563. has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  564. }
  565. if (!has_enough_memory) {
  566. MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
  567. *no_mem_to_read = true;
  568. } else {
  569. infile.seekg(header_len + type_offset);
  570. *data_buffer = new std::vector<char>(data_size);
  571. if (data_buffer == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  572. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  573. }
  574. *size = data_size;
  575. }
  576. }
// For every dump directory in dir_to_files_map, convert the listed device-format dump
// files into host-format .npy files via the Python AsyncDumpConverter, then rescan the
// directory and append each produced .npy path to *result_list (skipping duplicates).
void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                                        std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::vector<std::string> files_after_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &file_name : d.second) {
      bool already_converted = false;
      // Remove scope from the file_name for matching files converted by mindinsight tool.
      std::size_t found_first_dot = file_name.find(".");
      std::size_t found_last_underscore = file_name.find_last_of("_");
      std::string file_name_without_scope = file_name;
      if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
        // drop the characters between the first '.' and the last '_' (the scope portion)
        file_name_without_scope =
          file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
      }
      // skip files whose converted counterpart is already recorded in *result_list
      for (std::string &file_found : *result_list) {
        if (file_found.find(file_name_without_scope) != std::string::npos) {
          already_converted = true;
        }
      }
      if (!already_converted) {
        files_to_convert_in_dir.push_back(dump_key + "/" + file_name);
        files_after_convert_in_dir.push_back(dump_key + "/" + file_name_without_scope);
      }
    }
    MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
    if (!files_to_convert_in_dir.empty()) {
      // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
      // later task.
      try {
        auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
        auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
        (void)convert_obj.attr("convert_files")();
      } catch (pybind11::error_already_set &e) {
        MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
      }
      // Rescan the dump directory to pick up the files the converter just wrote.
      std::string abspath = RealPath(dump_key);
      DIR *d_handle = opendir(abspath.c_str());
      if (d_handle == nullptr) {
        MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
        return;
      }
      struct dirent *dir = nullptr;
      while ((dir = readdir(d_handle)) != nullptr) {
        if (dir->d_type == DT_REG) {
          std::string candidate = dir->d_name;
          for (const std::string &file_to_find : files_after_convert_in_dir) {
            // compare only the basename of the expected converted file
            std::string file_n = file_to_find.substr(file_to_find.find_last_of("\\/") + 1);
            if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
              // we found a converted file for this op
              std::string found_file = dump_key + "/" + candidate;
              if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
                result_list->push_back(found_file);
              }
            }
          }
        }
      }
      (void)closedir(d_handle);
    }
  }
}
  641. void GetNodeNameWithoutScope(std::string *dump_style_name) {
  642. if (dump_style_name == nullptr) {
  643. return;
  644. }
  645. std::string node_name_without_scope = *dump_style_name;
  646. std::size_t last_scope_marker;
  647. std::string delim = "/";
  648. last_scope_marker = node_name_without_scope.rfind(delim);
  649. if (last_scope_marker != std::string::npos) {
  650. node_name_without_scope = node_name_without_scope.substr(last_scope_marker + delim.size());
  651. }
  652. *dump_style_name = node_name_without_scope;
  653. }
  654. void ReplaceSrcFileName(std::string *dump_style_name) {
  655. if (dump_style_name == nullptr) {
  656. return;
  657. }
  658. const std::string strsrc = "/";
  659. std::string strdst = "_";
  660. std::string::size_type pos = 0;
  661. std::string::size_type srclen = strsrc.size();
  662. std::string::size_type dstlen = strdst.size();
  663. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  664. dump_style_name->replace(pos, srclen, strdst);
  665. pos += dstlen;
  666. }
  667. }
// For each requested tensor, locate its dump directory
// (<dump_dir>/rank_<device>/<net>/<graph>/<iteration>) and classify files matching the
// node-name prefix: device-format files are queued for conversion in dir_to_files_map,
// files already in .npy format are appended to *result_list directly. The queued files
// are converted once at the end via ConvertToHostFormat.
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id, std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name (text after the final ':')
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string prefix_dump_file_name = dump_style_kernel_name;
    GetNodeNameWithoutScope(&prefix_dump_file_name);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors.";
      return;
    }
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      if (dir->d_type == DT_REG) {
        std::string file_name = dir->d_name;
        // drop the leading field (up to the first '.') before prefix matching
        std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
        if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos &&
            file_name.rfind(file_format) == std::string::npos) {
          // if file matches prefix and is in device format add to candidate files to convert.
          dir_to_files_map[specific_dump_dir].push_back(file_name);
        } else if (file_name_w_o_perfix.rfind(prefix_dump_file_name) != std::string::npos &&
                   file_name.rfind(file_format) != std::string::npos) {
          // otherwise, if file matches prefix and already has been converted to host format
          // add to result of converted files.
          std::string found_file = specific_dump_dir + "/" + file_name;
          if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
            result_list->push_back(found_file);
          }
        }
      }
    }
    (void)closedir(d);
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
// For every watched node in proto_dump, scan specific_dump_dir and split matching files
// into device-format candidates (queued in dir_to_files_map for conversion) and files
// already in .npy format (appended to *result_list). Conversion happens once at the end.
void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir,
                                           std::vector<std::string> *result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (const auto &node : proto_dump) {
    std::string dump_name = std::get<1>(node);
    // drop the trailing ".input"/".output" qualifier from the dump-style name
    dump_name = dump_name.substr(0, dump_name.rfind("."));
    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
      return;
    }
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      if (dir->d_type == DT_REG) {
        std::string file_name = dir->d_name;
        // drop the leading field (up to the first '.') before prefix matching
        std::string file_name_w_o_perfix = file_name.substr(file_name.find('.') + 1);
        if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos &&
            file_name.rfind(file_format) == std::string::npos) {
          // if file matches prefix and is in device format add to candidate files to convert.
          dir_to_files_map[specific_dump_dir].push_back(file_name);
        } else if (file_name_w_o_perfix.rfind(dump_name) != std::string::npos &&
                   file_name.rfind(file_format) != std::string::npos) {
          // otherwise, if file matches prefix and already has been converted to host format
          // add to result of converted files.
          std::string found_file = specific_dump_dir + "/" + file_name;
          if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
            result_list->push_back(found_file);
          }
        }
      }
    }
    (void)closedir(d);
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
// For every watched node, scan the pool of converted .npy files to discover which slots
// were dumped, and append one metadata-only TensorData entry per slot (data pointer left
// null; the tensor bytes are read later on demand).
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      std::size_t found_dot_start = file_name.find(".", found_out);
      // NOTE(review): find(".", found_dot_start) starts on the dot it just found, so
      // found_dot_end == found_dot_start and the length below wraps to npos; substr then
      // runs to end of string and stoul stops at the next non-digit. It works, but the
      // intent reads off — confirm before changing.
      std::size_t found_dot_end = file_name.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(nullptr);  // no payload yet
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_list->push_back(tensor_data);
    }
  }
}
// Build a TensorData record from the supplied metadata and (optionally) raw bytes,
// register it with the tensor loader when it carries data, and append it to
// *result_list. A zero data_size produces a metadata-only placeholder entry.
// NOTE(review): the record keeps a raw pointer into *buffer — the buffer must outlive
// the TensorData entry; confirm ownership with the caller / tensor loader.
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                    const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer, std::vector<std::shared_ptr<TensorData>> *result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  tensor_data->SetIsOutput(is_output);
  if (data_size) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(nullptr);
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  tensor_data->SetTimeStamp(time_stamp);
  if (data_size) {
    // only tensors with an actual payload are cached in the loader
    tensor_loader_->LoadNewTensor(tensor_data, false);
  }
  // add to result_list
  result_list->push_back(tensor_data);
}
  824. void DebugServices::SetPrefixToCheck(std::string *prefix_dump_file_name, std::string *slot_string_to_check,
  825. std::string *dump_style_kernel_name, size_t slot, bool is_output) {
  826. std::string dump_style_name_part = *dump_style_kernel_name;
  827. GetNodeNameWithoutScope(&dump_style_name_part);
  828. std::string slot_str;
  829. if (is_output) {
  830. slot_str = ".output." + std::to_string(slot);
  831. } else {
  832. slot_str = ".input." + std::to_string(slot);
  833. }
  834. dump_style_name_part += slot_str;
  835. *prefix_dump_file_name = dump_style_name_part;
  836. *slot_string_to_check = slot_str;
  837. }
  838. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  839. // get file with the newest timestamp from the list.
  840. std::string newest_file;
  841. if (file_list.empty()) {
  842. return newest_file;
  843. }
  844. std::sort(file_list.begin(), file_list.end());
  845. return file_list.back();
  846. }
  847. std::string GetTimeStampStr(std::string file_path) {
  848. // get the file_name from file_path.
  849. size_t pos = file_path.rfind("/");
  850. std::string file_name = file_path.substr(pos + 1);
  851. size_t first_dot = file_name.rfind(".");
  852. size_t second_dot = file_name.rfind(".", first_dot - 1);
  853. size_t third_dot = file_name.rfind(".", second_dot - 1);
  854. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  855. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  856. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  857. return time_stamp;
  858. }
// Entry point for reading a batch of previously dumped tensors, one per entry in the
// parallel input vectors. For each request, derive the dump directory and file-name
// prefixes, then dispatch to the sync or async reader. Results are appended to
// *result_list; *no_mem_to_read is set when host memory runs out.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];
    // remove slot from name (text after the final ':')
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    // async dump files are matched on the scope-less node name followed by '.'
    std::string prefix_dump_to_check = dump_style_kernel_name + '.';
    GetNodeNameWithoutScope(&prefix_dump_to_check);
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
    // search files in dir for the one that meets the filename prefix and read the file into memory
    if (is_sync_mode_) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
                            no_mem_to_read);
    }
  }
}
  888. void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
  889. const std::string &backend_name, size_t slot, unsigned int device_id,
  890. unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
  891. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  892. std::vector<char> *buffer = nullptr;
  893. std::string type_name = "";
  894. std::vector<int64_t> shape;
  895. uint64_t data_size = 0;
  896. std::string time_stamp;
  897. std::string abspath = RealPath(specific_dump_dir);
  898. DIR *d = opendir(abspath.c_str());
  899. bool found_file = false;
  900. std::vector<std::string> matched_paths;
  901. if (d == nullptr) {
  902. MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  903. } else {
  904. struct dirent *dir = nullptr;
  905. while ((dir = readdir(d)) != nullptr) {
  906. if (dir->d_type == DT_REG) {
  907. std::string file_name = dir->d_name;
  908. std::string stripped_file_name = GetStrippedFilename(file_name);
  909. if (stripped_file_name.empty()) {
  910. continue;
  911. }
  912. std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
  913. if (found != 0) {
  914. continue;
  915. }
  916. std::string full_path = specific_dump_dir + "/" + file_name;
  917. matched_paths.push_back(full_path);
  918. found_file = true;
  919. }
  920. }
  921. }
  922. if (found_file) {
  923. shape.clear();
  924. std::string result_path = GetNewestFilePath(matched_paths);
  925. time_stamp = GetTimeStampStr(result_path);
  926. std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
  927. std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
  928. std::to_string(slot);
  929. ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
  930. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
  931. type_name, shape, buffer, result_list);
  932. } else {
  933. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
  934. buffer, result_list);
  935. MS_LOG(INFO) << "Target tensor has not been found.";
  936. }
  937. (void)closedir(d);
  938. }
// Async-mode read of one dumped tensor: pick from async_file_pool the converted files
// belonging to this dump dir / node prefix / slot, load the newest one and append a
// TensorData entry. If no file matches, an empty placeholder entry is appended instead.
void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
                                          const std::string &slot_string_to_check, const std::string &backend_name,
                                          size_t slot, unsigned int device_id, unsigned int iteration,
                                          unsigned int root_graph_id, const bool &is_output,
                                          const std::vector<std::string> &async_file_pool,
                                          std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  std::vector<char> *buffer = nullptr;
  std::string type_name = "";
  std::vector<int64_t> shape;
  uint64_t data_size = 0;
  std::string time_stamp;
  bool found = false;
  std::vector<std::string> matched_paths;
  // if async mode: candidate files were already converted and collected in the pool
  for (const std::string &file_path : async_file_pool) {
    if (file_path.find(specific_dump_dir) != std::string::npos &&
        file_path.find(prefix_dump_to_check) != std::string::npos &&
        file_path.find(slot_string_to_check) != std::string::npos) {
      matched_paths.push_back(file_path);
      found = true;
    }
  }
  if (found) {
    shape.clear();
    // several timestamps may exist for the same tensor; take the newest
    std::string result_path = GetNewestFilePath(matched_paths);
    time_stamp = GetTimeStampStr(result_path);
    std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
                                    std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
                                    std::to_string(slot);
    ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
                    type_name, shape, buffer, result_list);
  } else {
    // If no npy file is found, add empty tensor data.
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
                    buffer, result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
  }
}
// Strip the task_id, stream_id and timestamp fields from a dump file name so the result
// can be prefix-compared against "<node>.<input|output>.<slot>". Returns "" when the
// name has too few dots to be a valid dump file name.
// NOTE(review): the *_dot variable names do not match the dot indices they actually hold
// (e.g. "seventh_dot" is computed as the second-to-last dot) — verify against the dump
// file naming scheme before editing this logic.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // second-to-last dot (skips the extension dot)
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  // two dots further toward the front from there
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  // keep the node-name segment and the segment between fifth_dot and seventh_dot,
  // dropping everything in between
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
// Work out which (device, graph) pairs the registered watchpoints monitor, then scan the
// corresponding dump directory for the given iteration and build a metadata-only tensor
// list for every watched node found on disk. In async mode the device-format files are
// first converted to .npy and their paths collected into *async_file_pool.
std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  unsigned int iteration, std::vector<std::string> *async_file_pool) {
  // get a list of nodes and the devices they are on to monitor
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
  for (auto w_table_item : watchpoint_table_) {
    auto wp = std::get<1>(w_table_item);
    unsigned int index = 0;
    for (auto check_node : wp.check_node_list) {
      // per-node device and graph lists are parallel to check_node_list via index
      std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
      for (auto device : devices) {
        for (auto graph : graphs) {
          std::tuple<uint32_t, uint32_t> key(device, graph);
          device_and_graph_to_nodes[key].push_back(check_node);
        }
      }
      index++;
    }
  }
  // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  // as they are found
  for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
    std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
    uint32_t device_id = std::get<0>(device_and_graph);
    uint32_t root_graph_id = std::get<1>(device_and_graph);
    std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
    std::vector<std::tuple<std::string, std::string>> proto_to_dump;
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id) + "/" + IterationString(iteration);
    // convert node names to dump style
    for (auto node : wp_nodes) {
      std::string orig_name = std::get<0>(node);
      std::string dump_style_name = orig_name;
      // Remove the scope from the fully qualified name to compare for both sync and async case.
      GetNodeNameWithoutScope(&dump_style_name);
      bool node_is_out = std::get<1>(node);
      if (node_is_out) {
        dump_style_name += ".output";
      } else {
        dump_style_name += ".input";
      }
      proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
    }
    if (!is_sync_mode_) {
      // convert all files in proto_to_dump to npy and add to pool of async file names
      ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
    }
    if (is_sync_mode_) {
      // search files in dir for the one that meets the filename prefix and read the file into memory
      std::string abspath = RealPath(specific_dump_dir);
      DIR *d = opendir(abspath.c_str());
      if (d == nullptr) {
        MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors.";
      } else {
        struct dirent *dir = nullptr;
        while ((dir = readdir(d)) != nullptr) {
          if (dir->d_type == DT_REG) {
            std::string file_name = dir->d_name;
            for (auto &node : proto_to_dump) {
              std::string dump_name = std::get<1>(node);
              std::string stripped_file_name = GetStrippedFilename(file_name);
              if (stripped_file_name.empty()) {
                continue;
              }
              // prefix match: stripped name must start with "<node>.<input|output>"
              std::size_t found = stripped_file_name.rfind(dump_name, 0);
              if (found == 0) {
                // the slot number immediately follows the matched prefix
                size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
                std::vector<int64_t> shape;
                std::string orig_name = std::get<0>(node);
                std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
                bool output_flag = (output_str == "output");
                // metadata-only entry; the tensor bytes are loaded later on demand
                AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
                                NULL, &tensor_list);
                break;
              }
            }
          }
        }
        (void)closedir(d);
      }
    } else {
      GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
                             &tensor_list);
    }
  }
  return tensor_list;
}
  1085. std::string DebugServices::IterationString(unsigned int iteration) {
  1086. std::string iteration_string;
  1087. bool init_dbg_suspend = (iteration == UINT_MAX);
  1088. if (init_dbg_suspend) {
  1089. iteration_string = "init";
  1090. } else {
  1091. iteration_string = std::to_string(iteration);
  1092. }
  1093. return iteration_string;
  1094. }
  1095. #endif
  1096. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1097. std::vector<char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1098. std::vector<unsigned int> *const dtype,
  1099. std::vector<std::vector<int64_t>> *const shape) {
  1100. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1101. tensor_loader_->SearchTensors(name, &result_list);
  1102. for (auto result : result_list) {
  1103. if (!std::get<1>(result)) {
  1104. continue;
  1105. }
  1106. ret_name->push_back(std::get<0>(result));
  1107. data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetDataPtr()));
  1108. data_size->push_back(std::get<1>(result)->GetByteSize());
  1109. dtype->push_back(std::get<1>(result)->GetType());
  1110. shape->push_back(std::get<1>(result)->GetShape());
  1111. }
  1112. }
// Look up tensors by name in the tensor loader; (name, TensorData) pairs are written
// to *result_list. A null result_list is rejected with a debug log.
void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
                                       std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  if (result_list == nullptr) {
    MS_LOG(DEBUG) << "result_list is nullptr.";
    return;
  }
  tensor_loader_->SearchTensors(name, result_list);
}
  1121. #ifdef ONLINE_DBG_MODE
  1122. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1123. bool ret = false;
  1124. for (auto w_table_item : watchpoint_table_) {
  1125. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1126. for (auto check_node : check_node_list) {
  1127. std::string w_name = std::get<0>(check_node);
  1128. bool w_type = std::get<1>(check_node);
  1129. if ((w_type == true &&
  1130. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1131. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1132. ret = true;
  1133. return ret;
  1134. }
  1135. }
  1136. }
  1137. return ret;
  1138. }
// Return true if the watched node w_name is one of the direct inputs of kernel.
// Only the last path component of w_name (text after the final '/') is compared
// against each input's kernel node name; a null kernel or empty name yields false.
bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  if (kernel && w_name.length() > 0) {
    auto input_size = AnfAlgo::GetInputTensorNum(kernel);
    for (size_t j = 0; j < input_size; ++j) {
      // tensor inputs are fetched with a 1-based index here (index 0 is skipped)
      auto input_kernel = kernel->input(j + 1);
      std::string input_kernel_name = GetKernelNodeName(input_kernel);
      auto found = w_name.find_last_of('/');
      if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
        return true;
    }
    return false;
  } else {
    return false;
  }
}
  1154. #endif
// Thin forwarding wrappers around the underlying TensorLoader.
// Clears all tensors held by the loader.
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
// Returns all tensors currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Current iteration number tracked by the tensor loader.
uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }
// Updates the iteration number tracked by the tensor loader.
void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }
// Clears only the current iteration's tensors, keeping previous ones.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1160. #ifdef ONLINE_DBG_MODE
// Dumps a single tensor to `filepath`, delegating entirely to the tensor loader.
// trans_flag: whether device data should be transformed to host format first.
// Returns whatever TensorLoader::DumpTensorToFile reports (presumably success;
// confirm against its definition).
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  1168. #endif
// Loads a new tensor into the tensor loader.
// keep_prev: when true, the previous iteration's copy of this tensor is kept.
// Returns the loader's success flag.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
// Returns a snapshot of the watchpoint table (watchpoint id -> watchpoint_t).
// NOTE(review): returning by value copies the whole map on every call; if
// callers do not need an independent snapshot, a const-reference return would
// avoid the copy — changing it would alter the public interface, so left as is.
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
  return watchpoint_table_;
}
// Resets per-iteration tensor state: clears the watchpoint-id cache and the
// cached overflow scan results, and rotates the tensor loader's maps so that
// parameter tensors survive into the next iteration.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  // Preserve parameters by moving them to the previous map before wiping current.
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1184. #ifdef ONLINE_DBG_MODE
  1185. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1186. MS_EXCEPTION_IF_NULL(kernel);
  1187. std::vector<std::shared_ptr<TensorData>> result;
  1188. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1189. auto kernel_name = GetKernelNodeName(kernel);
  1190. for (size_t j = 0; j < output_size; ++j) {
  1191. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1192. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1193. if (tensor) result.push_back(tensor);
  1194. }
  1195. return result;
  1196. }
  1197. #endif
  1198. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1199. unsigned int iteration) {
  1200. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1201. std::vector<std::string> op_names;
  1202. std::string overflow_bin_path;
  1203. #ifdef ONLINE_DBG_MODE
  1204. auto debugger = Debugger::GetInstance();
  1205. overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->graph_id());
  1206. std::string check_overflow_bin_path = RealPath(overflow_bin_path);
  1207. if (check_overflow_bin_path.empty()) {
  1208. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1209. return false;
  1210. }
  1211. overflow_bin_path = check_overflow_bin_path;
  1212. #else
  1213. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1214. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1215. overflow_bin_path = RealPath(overflow_bin_path);
  1216. #endif
  1217. overflow_wp_lock_.lock();
  1218. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1219. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1220. if (found_overflows != overflow_ops_.end()) {
  1221. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1222. op_names = overflow_ops_[overflow_bin_path];
  1223. } else {
  1224. std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  1225. std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  1226. const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  1227. MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  1228. std::string abspath = RealPath(overflow_bin_path);
  1229. DIR *d = opendir(abspath.c_str());
  1230. if (d == nullptr) {
  1231. MS_LOG(ERROR) << "OverFlow bin directory does not exist!";
  1232. } else {
  1233. struct dirent *dir = nullptr;
  1234. while ((dir = readdir(d)) != nullptr) {
  1235. if (dir->d_type == DT_REG) {
  1236. // form fully qualified filename
  1237. std::string file_path = overflow_bin_path;
  1238. std::string file_name = dir->d_name;
  1239. file_path.append(file_name);
  1240. // attempt to read the file
  1241. std::ifstream infile;
  1242. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  1243. if (!infile.is_open()) {
  1244. MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno
  1245. << " ErrInfo:" << strerror(errno);
  1246. continue;
  1247. }
  1248. std::string node_name;
  1249. uint64_t task_id = 0;
  1250. uint64_t stream_id = 0;
  1251. // detect overflow bin file
  1252. if (file_name.rfind(overflow_file_prefix, 0) == 0) {
  1253. // start of op overflow data in bin file
  1254. const uint32_t offset = 321;
  1255. (void)infile.seekg(offset, std::ios::beg);
  1256. std::vector<char> buffer;
  1257. // size of op overflow info section
  1258. const size_t buf_size = 256;
  1259. buffer.resize(buf_size);
  1260. (void)infile.read(buffer.data(), buf_size);
  1261. if (infile.gcount() != buf_size) {
  1262. MS_LOG(ERROR) << "The file: " << file_path << "may be damaged!";
  1263. continue;
  1264. }
  1265. const uint8_t stream_id_offset = 16;
  1266. const uint8_t task_id_offset = 24;
  1267. // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
  1268. // byte values currently.
  1269. stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
  1270. task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
  1271. MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
  1272. << ".";
  1273. task_stream_hit.push_back(std::make_pair(task_id, stream_id));
  1274. } else {
  1275. // regular bin file
  1276. bool success_parse = GetAttrsFromAsyncFilename(file_name, &node_name, &task_id, &stream_id);
  1277. if (success_parse) {
  1278. task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
  1279. }
  1280. }
  1281. infile.close();
  1282. }
  1283. }
  1284. (void)closedir(d);
  1285. }
  1286. // find the op_names with an overflow hit
  1287. for (auto &task_stream : task_stream_hit) {
  1288. auto op_name = task_stream_to_opname[task_stream];
  1289. if (!op_name.empty()) {
  1290. MS_LOG(INFO) << "Operation overflow detected in " << op_name;
  1291. op_names.push_back(op_name);
  1292. }
  1293. }
  1294. overflow_ops_[overflow_bin_path] = op_names;
  1295. }
  1296. overflow_wp_lock_.unlock();
  1297. // determine if overflow wp has been triggered for node_name_to_find
  1298. if (find(op_names.begin(), op_names.end(), node_name_to_find) != op_names.end()) {
  1299. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1300. return true;
  1301. }
  1302. return false;
  1303. }
  1304. bool DebugServices::GetAttrsFromAsyncFilename(const std::string &file_name, std::string *node_name, uint64_t *task_id,
  1305. uint64_t *stream_id) {
  1306. // get the node_name, task_id, and stream_id from async dump filename
  1307. // node_type.node_name.task_id.stram_id.timestamp
  1308. // WARNING: node_name may have dots in it
  1309. size_t fourth_dot = file_name.rfind(".");
  1310. size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  1311. size_t second_dot = file_name.rfind(".", third_dot - 1);
  1312. size_t first_dot = file_name.find(".");
  1313. // check if dots were found
  1314. if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
  1315. fourth_dot == std::string::npos) {
  1316. return false;
  1317. }
  1318. // check if its not an async bin file
  1319. if (file_name.substr(fourth_dot) == ".npy") {
  1320. return false;
  1321. }
  1322. // get node_name
  1323. if (first_dot < second_dot) {
  1324. *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  1325. } else {
  1326. MS_LOG(ERROR) << "Async filename parse error to get node_name.";
  1327. return false;
  1328. }
  1329. // get task id
  1330. if (second_dot < third_dot) {
  1331. std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
  1332. try {
  1333. *task_id = std::stoull(extracted_task_id);
  1334. } catch (...) {
  1335. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id.";
  1336. return false;
  1337. }
  1338. } else {
  1339. MS_LOG(ERROR) << "Async filename parse error to get task_id.";
  1340. return false;
  1341. }
  1342. // get stream id
  1343. if (third_dot < fourth_dot) {
  1344. std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
  1345. try {
  1346. *stream_id = std::stoull(extracted_stream_id);
  1347. } catch (...) {
  1348. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id.";
  1349. return false;
  1350. }
  1351. } else {
  1352. MS_LOG(ERROR) << "Async filename parse error to get stream_id.";
  1353. return false;
  1354. }
  1355. return true;
  1356. }
// Resolves input_path to an absolute path.
// If input_path contains a '/', only the directory part is resolved with
// realpath(3) (it must already exist) and the trailing file component is
// appended unresolved; returns "" with a WARNING if the directory is missing.
// If input_path is a bare file name, it is resolved as a whole; when the file
// does not exist, realpath fails and the zero-initialized buffer yields "".
// NOTE(review): in that case callers receive "" rather than input_path even
// though the log says the file "will be created" — confirm this is intended.
// Throws (MS_LOG(EXCEPTION)) when the path or file name exceeds PATH_MAX /
// NAME_MAX.
std::string DebugServices::RealPath(const std::string &input_path) {
  if (input_path.length() >= PATH_MAX) {
    MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  }
  size_t path_split_pos = input_path.find_last_of('/');
  // get real path
  char real_path[PATH_MAX] = {0};
  // input_path is dir + file_name
  if (path_split_pos != std::string::npos) {
    std::string prefix_path = input_path.substr(0, path_split_pos);
    // NOTE(review): file_name keeps the leading '/', so this NAME_MAX check
    // counts one extra character — confirm whether the off-by-one matters.
    std::string file_name = input_path.substr(path_split_pos);
    if (file_name.length() > NAME_MAX) {
      MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
    }
    if (realpath(prefix_path.c_str(), real_path) == nullptr) {
      MS_LOG(WARNING) << "The dir " << prefix_path << " does not exist.";
      return "";
    }
    return std::string(real_path) + file_name;
  }
  // input_path is only file_name
  if (input_path.length() > NAME_MAX) {
    MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  }
  if (realpath(input_path.c_str(), real_path) == nullptr) {
    MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  }
  return std::string(real_path);
}
  1386. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  1387. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  1388. }
// Returns true if tensor_name is present in the current iteration's tensor map.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Moves a tensor from the current-iteration map to the previous-iteration map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Queues tensor_name for cache eviction, but only when the loader's memory
// control is enabled; otherwise a no-op.
void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  if (tensor_loader_->EnableMemoryControl()) {
    tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  }
}
  1400. void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
  1401. std::string DebugServices::GetNetName() { return net_name_; }
  1402. void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
  1403. std::string DebugServices::GetDumpDir() { return dump_dir_; }
  1404. void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
  1405. bool DebugServices::GetSyncMode() { return is_sync_mode_; }
  1406. void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  1407. #ifdef ONLINE_DBG_MODE
  1408. } // namespace mindspore
  1409. #endif