You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

debug_services.cc 83 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/debug_services.h"
  17. #include <dirent.h>
  18. #include <algorithm>
  19. #include <functional>
  20. #include <fstream>
  21. #include <future>
  22. #include <thread>
  23. #include <iterator>
  24. #include <map>
  25. #include <numeric>
  26. #include <unordered_set>
  27. #include <utility>
  28. #include <regex>
  29. #include "pybind11/embed.h"
  30. #include "pybind11/stl.h"
  31. #ifdef ONLINE_DBG_MODE
  32. #include "debug/common.h"
  33. #include "debug/debugger/debugger.h"
  34. #include "debug/anf_ir_utils.h"
  35. #include "backend/session/anf_runtime_algorithm.h"
  36. #endif
  37. #include "nlohmann/json.hpp"
  38. #include "debug/debugger/tensor_summary.h"
  39. #include "utils/file_utils.h"
  40. #include "climits"
  41. #ifdef ONLINE_DBG_MODE
  42. namespace mindspore {
  43. #endif
namespace {
// Platform-dependent "no error" sentinel. On macOS it is an int status code;
// elsewhere it is a null char* (GNU-style message pointer).
// NOTE(review): presumably compared against strerror_r-style results at the
// use sites (not visible in this chunk) — confirm before relying on this.
#ifdef __APPLE__
constexpr int kStrErrorNone = 0;
#else
constexpr char *kStrErrorNone = nullptr;
#endif
}  // namespace
  51. DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
// Copy constructor: shares `other`'s tensor loader (shallow, shared_ptr copy)
// and duplicates its watchpoint bookkeeping and dump configuration.
// NOTE(review): copies four members (wp_id_cache_, net_name_, dump_dir_,
// is_sync_mode_) that operator= below does NOT copy — confirm the asymmetry
// is intentional.
DebugServices::DebugServices(const DebugServices &other) {
  wp_id_cache_ = other.wp_id_cache_;
  net_name_ = other.net_name_;
  dump_dir_ = other.dump_dir_;
  is_sync_mode_ = other.is_sync_mode_;
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table_ = other.watchpoint_table_;
}
// Copy assignment: shares the tensor loader and copies the watchpoint table.
// NOTE(review): unlike the copy constructor, this leaves wp_id_cache_,
// net_name_, dump_dir_ and is_sync_mode_ untouched on the target — verify
// that callers rely on this partial-copy behavior before "fixing" it.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
  67. void DebugServices::AddWatchpoint(
  68. unsigned int id, unsigned int watch_condition, float parameter,
  69. const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
  70. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
  71. const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
  72. std::lock_guard<std::mutex> lg(lock_);
  73. watchpoint_t watchpoint_item;
  74. watchpoint_item.id = id;
  75. watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
  76. watchpoint_item.condition.parameter = parameter;
  77. watchpoint_item.check_node_list = check_node_list;
  78. if (check_node_device_list != nullptr) {
  79. watchpoint_item.check_node_device_list = *check_node_device_list;
  80. }
  81. if (check_node_graph_list != nullptr) {
  82. watchpoint_item.check_node_graph_list = *check_node_graph_list;
  83. }
  84. watchpoint_item.parameter_list = parameter_list;
  85. watchpoint_table_[id] = watchpoint_item;
  86. }
  87. void DebugServices::RemoveWatchpoint(unsigned int id) {
  88. std::lock_guard<std::mutex> lg(lock_);
  89. (void)watchpoint_table_.erase(id);
  90. }
  91. std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
  92. const void *const previous_tensor_ptr, uint32_t num_elements,
  93. uint32_t prev_num_elements, int tensor_dtype) {
  94. MS_EXCEPTION_IF_NULL(tensor);
  95. switch (tensor_dtype) {
  96. case DbgDataType::DT_UINT8: {
  97. return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  98. prev_num_elements);
  99. }
  100. case DbgDataType::DT_INT8: {
  101. return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  102. prev_num_elements);
  103. }
  104. case DbgDataType::DT_UINT16: {
  105. return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  106. prev_num_elements);
  107. }
  108. case DbgDataType::DT_INT16: {
  109. return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  110. prev_num_elements);
  111. }
  112. case DbgDataType::DT_UINT32: {
  113. return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  114. prev_num_elements);
  115. }
  116. case DbgDataType::DT_INT32:
  117. case DbgDataType::DT_BASE_INT: {
  118. return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  119. prev_num_elements);
  120. }
  121. case DbgDataType::DT_UINT64: {
  122. return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  123. prev_num_elements);
  124. }
  125. case DbgDataType::DT_INT64: {
  126. return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  127. prev_num_elements);
  128. }
  129. case DbgDataType::DT_FLOAT16: {
  130. return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  131. prev_num_elements);
  132. }
  133. case DbgDataType::DT_FLOAT32:
  134. case DbgDataType::DT_BASE_FLOAT: {
  135. return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  136. prev_num_elements);
  137. }
  138. case DbgDataType::DT_FLOAT64: {
  139. return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  140. prev_num_elements);
  141. }
  142. case DbgDataType::DT_BOOL: {
  143. return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
  144. prev_num_elements);
  145. }
  146. default:
  147. MS_LOG(INFO) << "Unsupported tensor type";
  148. // return a null pointer
  149. return std::unique_ptr<TensorSummary<int32_t>>{};
  150. }
  151. }
  152. DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
  153. if (tensor == nullptr) {
  154. MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
  155. TensorStat empty_tensor_stat_data;
  156. return empty_tensor_stat_data;
  157. }
  158. std::unique_ptr<ITensorSummary> base_summary_ptr;
  159. void *previous_tensor_ptr = nullptr;
  160. base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
  161. if (base_summary_ptr == nullptr) {
  162. MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
  163. TensorStat empty_tensor_stat_data;
  164. return empty_tensor_stat_data;
  165. }
  166. base_summary_ptr->TensorStatistics(tensor->GetType());
  167. TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
  168. base_summary_ptr->max_value(), base_summary_ptr->min_value(),
  169. base_summary_ptr->avg_value(), base_summary_ptr->count(),
  170. base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
  171. base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
  172. base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
  173. return tensor_stat_data;
  174. }
  175. #ifdef OFFLINE_DBG_MODE
  176. const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
  177. uint32_t *prev_num_elements, bool *history_not_found) {
  178. MS_EXCEPTION_IF_NULL(tensor);
  179. const void *previous_tensor_ptr = nullptr;
  180. std::shared_ptr<TensorData> tensor_prev;
  181. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
  182. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  183. *history_not_found = 1;
  184. MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
  185. } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
  186. // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
  187. // read data in offline mode
  188. std::vector<std::string> file_paths;
  189. if (!is_sync_mode_) {
  190. ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  191. std::vector<unsigned int>{tensor->GetDeviceId()},
  192. std::vector<unsigned int>{tensor->GetPrevIteration()},
  193. std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
  194. }
  195. std::vector<std::shared_ptr<TensorData>> result_list_prev;
  196. ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
  197. std::vector<unsigned int>{tensor->GetDeviceId()},
  198. std::vector<unsigned int>{tensor->GetPrevIteration()},
  199. std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
  200. file_paths, &result_list_prev);
  201. tensor_prev = result_list_prev[0];
  202. if (!tensor_prev->GetByteSize()) {
  203. tensor_prev.reset();
  204. } else {
  205. previous_tensor_ptr = tensor_prev->GetDataPtr();
  206. *prev_num_elements = tensor_prev->GetNumElements();
  207. }
  208. }
  209. return previous_tensor_ptr;
  210. }
  211. #endif
  212. void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
  213. const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
  214. std::string *const qualified_tensor_name,
  215. std::vector<watchpoint_t> *const watchpoints_to_check) {
  216. if (tensor == nullptr) {
  217. MS_LOG(DEBUG) << "tensor is nullptr.";
  218. return;
  219. }
  220. const auto tensor_name = tensor->GetName();
  221. const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
  222. const auto tensor_device_id = tensor->GetDeviceId();
  223. const auto tensor_root_graph_id = tensor->GetRootGraphId();
  224. for (auto w_table_item : watchpoint_table_) {
  225. auto wp = std::get<1>(w_table_item);
  226. // check ONLY init conditions on initial suspended state.
  227. // skip other conditions on initial suspended state
  228. if (init_dbg_suspend && (wp.condition.type != INIT)) {
  229. continue;
  230. }
  231. // skip init condition if not init suspend
  232. if ((wp.condition.type == INIT) && !init_dbg_suspend) {
  233. continue;
  234. }
  235. // check change conditions only on step end.
  236. if (wp.change_condition() && !step_end) {
  237. continue;
  238. }
  239. // if recheck, ignore the cache results and reanalyze everything.
  240. // if not a recheck, check only unanalyzed tensors
  241. if (!recheck) {
  242. wp_lock_.lock();
  243. bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
  244. wp_lock_.unlock();
  245. if (wp_cache_hit) {
  246. continue;
  247. }
  248. }
  249. std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
  250. if (!found.empty()) {
  251. *qualified_tensor_name = found;
  252. watchpoints_to_check->push_back(w_table_item.second);
  253. #ifdef OFFLINE_DBG_MODE
  254. if (wp.change_condition()) {
  255. *previous_iter_tensor_needed = true;
  256. }
  257. #endif
  258. }
  259. }
  260. }
  261. void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
  262. const std::string &tensor_name) {
  263. // add analyzed tensor to cache
  264. if (!recheck) {
  265. wp_lock_.lock();
  266. (void)wp_id_cache_[tensor_name].insert(id);
  267. wp_lock_.unlock();
  268. }
  269. }
// Appends one watchpoint-hit record to chunk `chunk_id` of every per-chunk
// output vector (names, slots, conditions, ids, parameters, error codes,
// exec orders, timestamps, and — when requested — device / root-graph ids).
// Each worker thread writes only to its own chunk, so no locking is needed.
// `device_id` / `root_graph_id` act purely as request flags here: the values
// recorded are device_id_val / root_graph_id_val.
// NOTE(review): `time_stamp` is taken by value; switching it to const& would
// require the matching change in the header declaration.
void DebugServices::SetCheckWatchpointsResult(
  const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  if (device_id != nullptr) {
    (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  }
  if (root_graph_id != nullptr) {
    (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  }
  (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
}
  295. #ifdef OFFLINE_DBG_MODE
  296. void DebugServices::CheckOutofMemoryandNoValue(
  297. const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
  298. int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  299. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  300. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  301. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  302. partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  303. std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  304. const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  305. const unsigned int device_id_val, const unsigned int root_graph_id_val,
  306. const std::vector<parameter_t> &parameter_list) {
  307. bool set_is_needed = no_mem_to_read || error_on_no_value;
  308. int32_t error_code_to_set = 0;
  309. if (no_mem_to_read) {
  310. // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
  311. error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
  312. } else if (error_on_no_value) {
  313. error_code_to_set = ITensorSummary::NO_VALUE;
  314. }
  315. if (set_is_needed) {
  316. for (auto &wp : watchpoints_to_check) {
  317. SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
  318. chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
  319. chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
  320. qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
  321. parameter_list, error_code_to_set);
  322. }
  323. }
  324. }
  325. void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
  326. // set the tensor into not-in-use status in tensor_loader.
  327. auto tensor_name = tensor->GetName();
  328. std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
  329. std::to_string(tensor->GetRootGraphId()) + ":" +
  330. std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
  331. AppendToCacheEvictQueue(key_name_in_cache);
  332. if (previous_tensor_ptr != nullptr) {
  333. AppendToCacheEvictQueue(key_name_in_cache + ":prev");
  334. }
  335. }
  336. #endif
  337. #ifdef ONLINE_DBG_MODE
  338. bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
  339. auto debugger = Debugger::GetInstance();
  340. auto ms_context = MsContext::GetInstance();
  341. MS_EXCEPTION_IF_NULL(ms_context);
  342. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  343. auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
  344. if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  345. device_target == kAscendDevice) {
  346. if (cur_root_graph_id != id) {
  347. return false;
  348. }
  349. }
  350. return true;
  351. }
  352. const void *DebugServices::PreparePrevTensor(uint32_t *prev_num_elements, const std::string &tensor_name) {
  353. std::shared_ptr<TensorData> prev_tensor_data;
  354. if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
  355. // not supporting watchpoints that need prev tensor for multi root graph networks.
  356. MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
  357. prev_tensor_data = nullptr;
  358. } else {
  359. prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
  360. }
  361. if (prev_tensor_data) {
  362. *prev_num_elements = prev_tensor_data->GetNumElements();
  363. return prev_tensor_data->GetDataPtr();
  364. }
  365. return nullptr;
  366. }
  367. #endif
  368. void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
  369. // check history error_code only for offline debugger
  370. if (history_not_found) {
  371. *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
  372. }
  373. }
// Worker body for one chunk of CheckWatchpoints: scans tensor_list[begin,end)
// and appends every watchpoint hit (or error) into slot `chunk_id` of the
// per-chunk output vectors — each thread owns its chunk, so no locking is
// needed on the outputs. In offline mode the tensor data is loaded from dump
// files here and released again before moving on.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  const std::vector<std::string> &op_overflows, const std::vector<std::string> &async_file_pool,
  partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // Clamp `end`: the caller's chunking may assign a range past the list size.
  int list_size = tensor_list->size();
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // Node name without the ":<slot>" suffix.
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read);
    // The list entry is replaced with the freshly loaded tensor.
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // Load failed or produced no value: record the corresponding error code
      // for each candidate watchpoint instead of evaluating them.
      CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_names,
                                 chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
                                 chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id,
                                 chunk_root_graph_id, device_id, root_graph_id, tensor->GetExecutionOrder(),
                                 tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, tensor->GetDeviceId(),
                                 tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // Offline: previous-iteration data comes from dump files (may flag a
    // missing run history instead).
    bool history_not_found = 0;
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
#else
    // Online: skip tensors from a different root graph, then look up the
    // previous tensor in the loader.
    if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
      MS_LOG(DEBUG)
        << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
        << tensor->GetName();
      continue;
    }
    previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
#endif
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // A lone IS_OVERFLOW watchpoint needs no numeric summary; anything else
    // gets the tensor summarized once for all watchpoints.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
#ifdef OFFLINE_DBG_MODE
        CheckHistoryErrorCode(&error_code, history_not_found);
#endif
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      // Mark analyzed regardless of hit, so non-recheck passes can skip it.
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }
#ifdef OFFLINE_DBG_MODE
    SetTensorToNotInUse(tensor, previous_tensor_ptr);
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
void DebugServices::CheckWatchpoints(
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
  // Check all registered watchpoints against every tensor in *tensor_list.
  // The list is split into up to 16 contiguous [begin, end) chunks, each handed to a
  // std::async task running CheckWatchpointsForTensor; the per-chunk hit results are
  // then merged into the flat output vectors (name/slot/condition/...) by
  // SortWatchpointsInfo. Returns early when there are no watchpoints or no tensors.
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size <= 0) {
    return;
  }
  // default value for number of threads
  const int default_thread_num = 16;
  int max_thread_num = default_thread_num;
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Spread the tensors as evenly as possible: the first `remainder` chunks get one
  // extra tensor each.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // Per-chunk result buffers, one slot per task. Each task receives its chunk index
  // (i below) — presumably it writes only to that slot; confirm in
  // CheckWatchpointsForTensor before relying on this for further parallelism.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);
  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id, error_on_no_value));
    begin = end;
  }
  // Blocks on every future, merges chunk results in sorted order into the output
  // vectors, and accumulates the total byte size into tensor_list_byte_size.
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);
  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
  550. void DebugServices::SortWatchpointsInfo(
  551. std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  552. std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  553. std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  554. std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  555. std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  556. partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  557. partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  558. partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  559. std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  560. partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  561. std::vector<unsigned int> *const root_graph_id) {
  562. for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
  563. (*tensor_future_vec)[i].wait();
  564. (*tensor_future_vec)[i].get();
  565. for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
  566. #ifdef ONLINE_DBG_MODE
  567. // if the execution order is repeated,inserts the new one before the others with same execution order.
  568. std::vector<int>::iterator iter =
  569. std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
  570. int position = iter - exec_order->begin();
  571. (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
  572. #endif
  573. #ifdef OFFLINE_DBG_MODE
  574. std::vector<std::string>::iterator iter =
  575. std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
  576. int position = iter - time_stamps->begin();
  577. (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
  578. #endif
  579. (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
  580. (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
  581. (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
  582. (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
  583. if (device_id != nullptr) {
  584. (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
  585. }
  586. if (root_graph_id != nullptr) {
  587. (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
  588. }
  589. (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
  590. (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
  591. }
  592. // free the memory for used vectors
  593. std::vector<int>().swap((*chunk_exec_orders)[i]);
  594. std::vector<std::string>().swap((*chunk_time_stamp)[i]);
  595. std::vector<std::string>().swap((*chunk_names)[i]);
  596. std::vector<std::string>().swap((*chunk_slots)[i]);
  597. std::vector<int>().swap((*chunk_conditions)[i]);
  598. std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
  599. std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
  600. std::vector<int32_t>().swap((*chunk_error_codes)[i]);
  601. std::vector<unsigned int>().swap((*chunk_device_id)[i]);
  602. std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
  603. (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  604. }
  605. }
  606. #ifdef OFFLINE_DBG_MODE
  607. void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
  608. std::string *const tensor_type, std::size_t *const size,
  609. std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
  610. bool *no_mem_to_read) {
  611. std::ifstream infile;
  612. std::string file_path = file_name;
  613. MS_LOG(INFO) << "Reading in file: " << file_path;
  614. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  615. if (!infile.is_open()) {
  616. MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
  617. const int kMaxFilenameLength = 128;
  618. char err_info[kMaxFilenameLength];
  619. auto ret = strerror_r(errno, err_info, sizeof(err_info));
  620. if (ret != kStrErrorNone) {
  621. MS_LOG(ERROR) << " ErrInfo:" << ret;
  622. }
  623. return;
  624. }
  625. const int substr_len = 2;
  626. const int header_len_offset = 8;
  627. const int header_offset = 9;
  628. const int header_len_buffer_size = 2;
  629. const int type_offset = 10;
  630. // get header length
  631. (void)infile.seekg(0, std::ios::beg);
  632. auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  633. if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
  634. MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
  635. return;
  636. }
  637. uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  638. header_len_buffer.reset();
  639. // read in header
  640. (void)infile.seekg(0, std::ios::beg);
  641. auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  642. if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
  643. MS_LOG(ERROR) << "Failed to read header from " << file_path;
  644. return;
  645. }
  646. std::string header(header_buffer->data() + header_offset, header_len);
  647. header_buffer.reset();
  648. std::size_t type_i = header.find("descr") + type_offset;
  649. if (header.length() < type_i + substr_len) {
  650. MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
  651. return;
  652. }
  653. *tensor_type = header.substr(type_i, substr_len);
  654. std::size_t shape_i_open = header.find("(");
  655. std::size_t shape_i_close = header.find(")");
  656. std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  657. std::string intermediate;
  658. std::stringstream check_shape(shape_str);
  659. MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  660. while (getline(check_shape, intermediate, ',')) {
  661. shape->push_back(std::stoi(intermediate));
  662. }
  663. std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  664. std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  665. std::size_t data_size = data_len * word_size;
  666. if (!data_size) {
  667. return;
  668. }
  669. // Check memory available before loading tensor into host.
  670. bool has_enough_memory = true;
  671. if (tensor_loader_->EnableMemoryControl()) {
  672. has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  673. }
  674. if (!has_enough_memory) {
  675. MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
  676. *no_mem_to_read = true;
  677. } else {
  678. (void)infile.seekg(header_len + type_offset);
  679. *data_buffer = new std::vector<char>(data_size);
  680. if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
  681. MS_LOG(ERROR) << "Unable to get tensor data from npy";
  682. }
  683. *size = data_size;
  684. }
  685. }
void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                                        std::vector<std::string> *const result_list) {
  // For each dump directory, convert its device-format files to host-format (npy)
  // via the python AsyncDumpConverter, skipping files whose converted counterpart
  // is already present in *result_list. Newly converted files are appended to
  // *result_list by ProcessConvertToHostFormat.
  std::string file_format = "npy";
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::vector<std::string> files_after_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &file_name : d.second) {
      bool already_converted = false;
      // Remove scope from the file_name for matching files converted by mindinsight tool.
      // The scope is the span between the first '.' and the last '_' of the name.
      std::size_t found_first_dot = file_name.find(".");
      std::size_t found_last_underscore = file_name.find_last_of("_");
      std::string file_name_without_scope = file_name;
      if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
        file_name_without_scope =
          file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
      }
      // A file counts as converted when any collected result path contains its
      // scope-less name (substring match).
      for (std::string &file_found : *result_list) {
        if (file_found.find(file_name_without_scope) != std::string::npos) {
          already_converted = true;
          break;
        }
      }
      if (!already_converted) {
        (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
        (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
      }
    }
    MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
    if (!files_to_convert_in_dir.empty()) {
      // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
      // later task.
      try {
        auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
        auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
        (void)convert_obj.attr("convert_files")();
      } catch (pybind11::error_already_set &e) {
        MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
      }
      // Collect the files the converter just produced into *result_list.
      ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
    }
  }
}
  729. void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
  730. const std::string &dump_key, std::vector<std::string> *const result_list,
  731. const std::string &file_format) {
  732. std::string real_dump_iter_dir = RealPath(dump_key);
  733. DIR *d_handle = opendir(real_dump_iter_dir.c_str());
  734. if (d_handle == nullptr) {
  735. MS_LOG(ERROR) << "Directory does not exist in ConvertToHostFormat.";
  736. return;
  737. }
  738. struct dirent *dir = nullptr;
  739. while ((dir = readdir(d_handle)) != nullptr) {
  740. if (dir->d_type == DT_REG) {
  741. std::string candidate = dir->d_name;
  742. for (const std::string &file_to_find : files_after_convert_in_dir) {
  743. std::string file_n = file_to_find;
  744. auto last_slash_pos = file_to_find.find_last_of("\\/");
  745. if (last_slash_pos != std::string::npos) {
  746. file_n = file_to_find.substr(last_slash_pos + 1);
  747. }
  748. if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
  749. // we found a converted file for this op
  750. std::string found_file = dump_key + "/" + candidate;
  751. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  752. result_list->push_back(found_file);
  753. }
  754. }
  755. }
  756. }
  757. }
  758. (void)closedir(d_handle);
  759. }
  760. std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  761. if (dump_style_name.empty()) {
  762. return "";
  763. }
  764. std::size_t last_scope_marker;
  765. std::string delim = "/";
  766. last_scope_marker = dump_style_name.rfind(delim);
  767. if (last_scope_marker == std::string::npos) {
  768. return dump_style_name;
  769. }
  770. return dump_style_name.substr(last_scope_marker + delim.size());
  771. }
  772. void ReplaceSrcFileName(std::string *dump_style_name) {
  773. if (dump_style_name == nullptr) {
  774. return;
  775. }
  776. const std::string strsrc = "/";
  777. std::string strdst = "_";
  778. std::string::size_type pos = 0;
  779. std::string::size_type srclen = strsrc.size();
  780. std::string::size_type dstlen = strdst.size();
  781. while ((pos = dump_style_name->find(strsrc, pos)) != std::string::npos) {
  782. (void)dump_style_name->replace(pos, srclen, strdst);
  783. pos += dstlen;
  784. }
  785. }
  786. void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
  787. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  788. std::vector<unsigned int> root_graph_id,
  789. std::vector<std::string> *const result_list) {
  790. std::string file_format = "npy";
  791. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  792. for (unsigned int i = 0; i < backend_name.size(); i++) {
  793. // form prefix of the tensor file to read from graph pb node name
  794. std::string dump_style_kernel_name = backend_name[i];
  795. // remove slot from name
  796. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  797. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  798. std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
  799. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
  800. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  801. // search files in dir for the one that meets the filename prefix and read the file into memory
  802. std::string abspath = RealPath(specific_dump_dir);
  803. DIR *d = opendir(abspath.c_str());
  804. if (d == nullptr) {
  805. MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors.";
  806. return;
  807. }
  808. ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
  809. (void)closedir(d);
  810. }
  811. ConvertToHostFormat(dir_to_files_map, result_list);
  812. }
  813. void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
  814. const std::string &specific_dump_dir,
  815. std::vector<std::string> *const result_list) {
  816. std::string file_format = "npy";
  817. std::map<std::string, std::vector<std::string>> dir_to_files_map;
  818. for (const auto &node : proto_dump) {
  819. std::string dump_name = std::get<1>(node);
  820. dump_name = dump_name.substr(0, dump_name.rfind("."));
  821. // search files in dir for the one that meets the filename prefix and read the file into memory
  822. std::string abspath = RealPath(specific_dump_dir);
  823. DIR *d = opendir(abspath.c_str());
  824. if (d == nullptr) {
  825. MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
  826. return;
  827. }
  828. ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
  829. (void)closedir(d);
  830. }
  831. ConvertToHostFormat(dir_to_files_map, result_list);
  832. }
  833. void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
  834. const std::string &specific_dump_dir,
  835. std::map<std::string, std::vector<std::string>> *dir_to_files_map,
  836. std::vector<std::string> *const result_list) {
  837. MS_EXCEPTION_IF_NULL(dir_to_files_map);
  838. DIR *d = opendir(specific_dump_dir.c_str());
  839. struct dirent *dir = nullptr;
  840. while ((dir = readdir(d)) != nullptr) {
  841. if (dir->d_type != DT_REG) {
  842. continue;
  843. }
  844. std::string file_name = dir->d_name;
  845. std::string file_name_w_o_perfix = file_name;
  846. auto type_pos = file_name.find('.');
  847. if (type_pos == std::string::npos || file_name.find(prefix_dump_file_name, type_pos + 1) == std::string::npos) {
  848. continue;
  849. }
  850. if (file_name.rfind(file_format) == std::string::npos) {
  851. // if file matches prefix and is in device format add to candidate files to convert.
  852. (*dir_to_files_map)[specific_dump_dir].push_back(file_name);
  853. } else {
  854. // otherwise, if file matches prefix and already has been converted to host format
  855. // add to result of converted files.
  856. std::string found_file = specific_dump_dir + "/" + file_name;
  857. if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
  858. result_list->push_back(found_file);
  859. }
  860. }
  861. }
  862. (void)closedir(d);
  863. }
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  // For every node in proto_dump (<original node name, dump-style name> pairs),
  // scan async_file_pool for files under specific_dump_dir that match the node,
  // extract the slot number from each matching file name, and append one
  // placeholder TensorData per slot to *tensor_list. Placeholders carry no payload
  // (null data pointer, byte size 0); the actual data is read later on demand.
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");
    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      std::size_t found_dot_start = file_name.find(".", found_out);
      // NOTE(review): find(".", found_dot_start) starts the search *at*
      // found_dot_start and therefore returns the same position, so the substring
      // below runs to the end of file_name (length wraps to npos). std::stoul only
      // parses the leading digits, so the slot still comes out right; the intent
      // was probably find(".", found_dot_start + 1) — confirm before changing.
      std::size_t found_dot_end = file_name.find(".", found_dot_start);
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(nullptr);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);
      tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
      tensor_list->push_back(tensor_data);
    }
  }
}
  907. uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
  908. std::regex re;
  909. if (mode == "rank") {
  910. re = "^rank_([0-9]+)$";
  911. } else if (mode == "graph") {
  912. re = "^([0-9]+)$";
  913. }
  914. std::smatch tokens;
  915. if (regex_match(name, tokens, re)) {
  916. return std::stoi(tokens[1]);
  917. } else {
  918. return UINT32_MAX;
  919. }
  920. }
  921. std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
  922. std::vector<uint32_t> rank_id_list;
  923. std::string dump_dir = GetDumpDir();
  924. DIR *d_handle = opendir(dump_dir.c_str());
  925. if (d_handle == nullptr) {
  926. MS_LOG(ERROR) << "Dump directory does not exist.";
  927. return rank_id_list;
  928. }
  929. struct dirent *dir = nullptr;
  930. while ((dir = readdir(d_handle)) != nullptr) {
  931. if (dir->d_type == DT_DIR) {
  932. std::string rank_dir_name = dir->d_name;
  933. if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
  934. rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
  935. }
  936. }
  937. }
  938. (void)closedir(d_handle);
  939. return rank_id_list;
  940. }
  941. void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
  942. std::string net_name = GetNetName();
  943. std::string dump_dir = GetDumpDir();
  944. for (uint32_t rank_id : rank_id_list) {
  945. std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
  946. std::string abspath = RealPath(path);
  947. DIR *d_handle_rank = opendir(abspath.c_str());
  948. if (d_handle_rank == nullptr) {
  949. MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
  950. continue;
  951. }
  952. struct dirent *direc = nullptr;
  953. while ((direc = readdir(d_handle_rank)) != nullptr) {
  954. if (direc->d_type == DT_DIR) {
  955. std::string graph_dir = direc->d_name;
  956. if (graph_dir == "." || graph_dir == "..") {
  957. continue;
  958. }
  959. if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
  960. uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
  961. ReadGraphsHistory(rank_id, graph_id);
  962. }
  963. }
  964. }
  965. (void)closedir(d_handle_rank);
  966. }
  967. }
  968. void DebugServices::SetGraphsHistory() {
  969. // extract rank_id_list
  970. std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
  971. // for each rank_id extract the graph_id list and set the dump version
  972. // and for each graph read the graph history file
  973. CheckDumpGraphIdList(rank_id_list);
  974. }
  975. void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
  976. std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
  977. if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
  978. // graph history was already stored for this rank_id and graph_id
  979. return;
  980. }
  981. std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
  982. std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
  983. DIR *d_handle = opendir(exec_order_path.c_str());
  984. if (d_handle == nullptr) {
  985. MS_LOG(ERROR) << "Directory does not exist.";
  986. return;
  987. }
  988. // read file and store the info
  989. std::string full_path = exec_order_path + "/" + file_to_check;
  990. std::string checked_path = RealPath(full_path);
  991. if (!checked_path.empty()) {
  992. ReadGraphRunIter(checked_path, rank_and_graph);
  993. }
  994. (void)closedir(d_handle);
  995. }
  996. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
  997. std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
  998. for (auto w_table_item : watchpoint_table_) {
  999. auto wp = std::get<1>(w_table_item);
  1000. unsigned int index = 0;
  1001. for (auto check_node : wp.check_node_list) {
  1002. std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
  1003. std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
  1004. // graph represents root_graph for Ascend and kernel_graph for GPU
  1005. for (auto rank : ranks) {
  1006. for (auto graph : graphs) {
  1007. std::tuple<uint32_t, uint32_t> key(rank, graph);
  1008. (rank_and_graph_to_nodes)[key].push_back(check_node);
  1009. }
  1010. }
  1011. index++;
  1012. }
  1013. }
  1014. return rank_and_graph_to_nodes;
  1015. }
  1016. void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
  1017. std::ifstream infile;
  1018. std::string line;
  1019. infile.open(file_path.c_str());
  1020. if (!infile.is_open()) {
  1021. MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
  1022. const int kMaxFilenameLength = NAME_MAX;
  1023. char err_info[kMaxFilenameLength];
  1024. if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
  1025. MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
  1026. }
  1027. return;
  1028. }
  1029. std::vector<uint32_t> run_iters_vec;
  1030. while (std::getline(infile, line)) {
  1031. uint32_t iter;
  1032. std::stringstream ss(line);
  1033. ss >> iter;
  1034. run_iters_vec.push_back(iter);
  1035. }
  1036. (void)graphs_run_history_.emplace(
  1037. std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
  1038. }
  1039. void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
  1040. const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
  1041. const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
  1042. const std::string &type_name, const std::vector<int64_t> &shape,
  1043. std::vector<char> *buffer,
  1044. std::vector<std::shared_ptr<TensorData>> *const result_list) {
  1045. // call LoadNewTensor to store tensor in internal cache
  1046. auto tensor_data = std::make_shared<TensorData>();
  1047. tensor_data->SetName(backend_name);
  1048. tensor_data->SetExecutionOrder(0);
  1049. tensor_data->SetSlot(slot);
  1050. tensor_data->SetIteration(iteration);
  1051. tensor_data->SetDeviceId(device_id);
  1052. tensor_data->SetRootGraphId(root_graph_id);
  1053. tensor_data->SetIsOutput(is_output);
  1054. if (buffer != nullptr) {
  1055. tensor_data->SetDataPtr(buffer->data());
  1056. } else {
  1057. tensor_data->SetDataPtr(nullptr);
  1058. }
  1059. tensor_data->SetByteSize(data_size);
  1060. tensor_data->SetType(type_name);
  1061. tensor_data->SetShape(shape);
  1062. tensor_data->SetTimeStamp(time_stamp);
  1063. tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
  1064. if (data_size) {
  1065. (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  1066. }
  1067. // add to result_list
  1068. result_list->push_back(tensor_data);
  1069. }
  1070. void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
  1071. std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
  1072. std::string dump_style_name_part = *dump_style_kernel_name;
  1073. dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
  1074. std::string slot_str;
  1075. if (is_output) {
  1076. slot_str = ".output." + std::to_string(slot);
  1077. } else {
  1078. slot_str = ".input." + std::to_string(slot);
  1079. }
  1080. dump_style_name_part += slot_str;
  1081. *prefix_dump_file_name = dump_style_name_part;
  1082. *slot_string_to_check = slot_str;
  1083. }
  1084. std::string GetNewestFilePath(std::vector<std::string> file_list) {
  1085. // get file with the newest timestamp from the list.
  1086. if (file_list.empty()) {
  1087. return "";
  1088. }
  1089. std::sort(file_list.begin(), file_list.end());
  1090. return file_list.back();
  1091. }
  1092. std::string GetTimeStampStr(std::string file_path) {
  1093. // get the file_name from file_path.
  1094. size_t pos = file_path.rfind("/");
  1095. std::string file_name = file_path.substr(pos + 1);
  1096. size_t first_dot = file_name.rfind(".");
  1097. size_t second_dot = file_name.rfind(".", first_dot - 1);
  1098. size_t third_dot = file_name.rfind(".", second_dot - 1);
  1099. size_t fourth_dot = file_name.rfind(".", third_dot - 1);
  1100. size_t fifth_dot = file_name.rfind(".", fourth_dot - 1);
  1101. if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
  1102. std::string time_stamp = file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  1103. return time_stamp;
  1104. }
  1105. return "";
  1106. }
  1107. void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
  1108. std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
  1109. std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
  1110. const std::vector<std::string> &async_file_pool,
  1111. std::vector<std::shared_ptr<TensorData>> *const result_list,
  1112. bool *no_mem_to_read) {
  1113. for (unsigned int i = 0; i < backend_name.size(); i++) {
  1114. // form prefix of the tensor file to read from graph pb node name
  1115. std::string dump_style_kernel_name = backend_name[i];
  1116. // remove slot from name
  1117. std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
  1118. dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
  1119. std::string slot_string_to_check;
  1120. std::string prefix_dump_file_name;
  1121. SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
  1122. std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
  1123. std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
  1124. std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
  1125. // search files in dir for the one that meets the filename prefix and read the file into memory
  1126. if (is_sync_mode_) {
  1127. ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
  1128. iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
  1129. } else {
  1130. ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
  1131. device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
  1132. no_mem_to_read);
  1133. }
  1134. }
  1135. }
  1136. void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
  1137. const std::string &backend_name, const unsigned int device_id,
  1138. const unsigned int root_graph_id, const bool &is_output, size_t slot,
  1139. bool *no_mem_to_read, unsigned int iteration,
  1140. std::vector<std::shared_ptr<TensorData>> *result_list) {
  1141. std::string time_stamp = "";
  1142. std::string type_name = "";
  1143. size_t data_size = 0;
  1144. std::vector<int64_t> shape;
  1145. std::vector<char> *buffer = nullptr;
  1146. if (found) {
  1147. std::string result_path = GetNewestFilePath(matched_paths);
  1148. time_stamp = GetTimeStampStr(result_path);
  1149. std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
  1150. std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
  1151. std::to_string(slot);
  1152. ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
  1153. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
  1154. type_name, shape, buffer, result_list);
  1155. } else {
  1156. AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
  1157. buffer, result_list);
  1158. MS_LOG(INFO) << "Target tensor has not been found.";
  1159. }
  1160. }
  1161. void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
  1162. const std::string &backend_name, size_t slot, const unsigned int device_id,
  1163. unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
  1164. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1165. std::string abspath = RealPath(specific_dump_dir);
  1166. DIR *d = opendir(abspath.c_str());
  1167. bool found_file = false;
  1168. std::vector<std::string> matched_paths;
  1169. if (d == nullptr) {
  1170. MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  1171. } else {
  1172. struct dirent *dir = nullptr;
  1173. while ((dir = readdir(d)) != nullptr) {
  1174. if (dir->d_type == DT_REG) {
  1175. std::string file_name = dir->d_name;
  1176. std::string stripped_file_name = GetStrippedFilename(file_name);
  1177. if (stripped_file_name.empty()) {
  1178. continue;
  1179. }
  1180. std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
  1181. if (found != 0) {
  1182. continue;
  1183. }
  1184. std::string full_path = specific_dump_dir + "/" + file_name;
  1185. matched_paths.push_back(full_path);
  1186. found_file = true;
  1187. }
  1188. }
  1189. (void)closedir(d);
  1190. }
  1191. ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
  1192. no_mem_to_read, iteration, result_list);
  1193. }
  1194. void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
  1195. const std::string &slot_string_to_check, const std::string &backend_name,
  1196. size_t slot, unsigned int device_id, unsigned int iteration,
  1197. unsigned int root_graph_id, const bool &is_output,
  1198. const std::vector<std::string> &async_file_pool,
  1199. std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  1200. bool found = false;
  1201. std::vector<std::string> matched_paths;
  1202. // if async mode
  1203. for (const std::string &file_path : async_file_pool) {
  1204. if (file_path.find(specific_dump_dir) != std::string::npos &&
  1205. file_path.find(prefix_dump_to_check) != std::string::npos &&
  1206. file_path.find(slot_string_to_check) != std::string::npos) {
  1207. matched_paths.push_back(file_path);
  1208. found = true;
  1209. }
  1210. }
  1211. ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
  1212. iteration, result_list);
  1213. }
// Returns the dump filename with the task_id, stream_id and timestamp fields
// removed, i.e. "<node_name>.<slot-ish tail>", so it can be prefix-compared
// against a watchpoint node name.  Returns "" when the filename does not have
// enough dot-separated fields.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // NOTE(review): these names count dots from the *back* of the filename —
  // "seventh_dot" is the second-to-last dot and "fifth_dot" the fourth-to-last;
  // the numbering presumably refers to the canonical 8-field dump filename.
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
  if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
    return std::string();
  }
  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  // Step back (5 - 2) = 3 dots from fifth_dot to land on the second dot.
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }
  if (second_dot == std::string::npos || second_dot <= first_dot) {
    return std::string();
  }
  // start_string: text between the first and second dot (the node name);
  // end_string: the ".<field>" span between fifth_dot and seventh_dot.
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
// Builds the list of TensorData placeholders for every watched node across all
// (rank, root-graph) pairs for the given iteration.  In async mode this also
// converts the relevant device dump files to npy and records their paths in
// async_file_pool.  error_on_no_value is forwarded to the sync scan only.
std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
  unsigned int iteration, std::vector<std::string> *const async_file_pool, bool error_on_no_value) {
  // get a list of nodes and the devices they are on to monitor
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
    GetAllWpNodes();
  // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
  // as they are found
  for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
    std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
    uint32_t rank_id = std::get<0>(rank_and_graph);
    uint32_t root_graph_id = std::get<1>(rank_and_graph);
    std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
    // (original node name, dump-style name with ".output"/".input" suffix)
    std::vector<std::tuple<std::string, std::string>> proto_to_dump;
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id) + "/" + IterationString(iteration);
    // convert node names to dump style
    for (auto node : wp_nodes) {
      std::string orig_name = std::get<0>(node);
      // Remove the scope from the fully qualified name to compare for both sync and async case.
      std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
      bool node_is_out = std::get<1>(node);
      // Direction suffix distinguishes output dump files from input ones.
      if (node_is_out) {
        dump_style_name += ".output";
      } else {
        dump_style_name += ".input";
      }
      // De-duplicate identical (orig, dump-style) pairs before scanning.
      if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
                    std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
        proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
      }
    }
    if (is_sync_mode_) {
      // search files in dir for the one that meets the filename prefix and read the file into memory
      std::string abspath = RealPath(specific_dump_dir);
      ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, rank_id, root_graph_id, &tensor_list,
                            error_on_no_value);
    } else {
      // convert all files in proto_to_dump to npy and add to pool of async file names
      ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
      GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
                             &tensor_list);
    }
  }
  return tensor_list;
}
  1282. void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
  1283. const std::string &abspath, const std::string &specific_dump_dir,
  1284. unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
  1285. std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  1286. bool error_on_no_value) {
  1287. DIR *d = opendir(abspath.c_str());
  1288. if (d == nullptr) {
  1289. MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors.";
  1290. } else {
  1291. struct dirent *dir = nullptr;
  1292. while ((dir = readdir(d)) != nullptr) {
  1293. if (dir->d_type == DT_REG) {
  1294. std::string file_name = dir->d_name;
  1295. for (auto &node : proto_to_dump) {
  1296. std::string dump_name = std::get<1>(node);
  1297. std::string stripped_file_name = GetStrippedFilename(file_name);
  1298. if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
  1299. continue;
  1300. }
  1301. std::size_t found = stripped_file_name.rfind(dump_name, 0);
  1302. if (found == 0) {
  1303. size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
  1304. std::vector<int64_t> shape;
  1305. std::string orig_name = std::get<0>(node);
  1306. std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
  1307. bool output_flag = (output_str == "output");
  1308. AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
  1309. nullptr, tensor_list);
  1310. break;
  1311. }
  1312. }
  1313. }
  1314. }
  1315. (void)closedir(d);
  1316. }
  1317. }
  1318. std::string DebugServices::IterationString(unsigned int iteration) {
  1319. std::string iteration_string;
  1320. bool init_dbg_suspend = (iteration == UINT_MAX);
  1321. if (init_dbg_suspend) {
  1322. iteration_string = "init";
  1323. } else {
  1324. iteration_string = std::to_string(iteration);
  1325. }
  1326. return iteration_string;
  1327. }
  1328. #endif
  1329. void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
  1330. std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
  1331. std::vector<unsigned int> *const dtype,
  1332. std::vector<std::vector<int64_t>> *const shape) {
  1333. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1334. tensor_loader_->SearchTensors(name, &result_list);
  1335. for (auto result : result_list) {
  1336. if (std::get<1>(result) == nullptr) {
  1337. continue;
  1338. }
  1339. #ifdef ONLINE_DBG_MODE
  1340. if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
  1341. MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
  1342. << " is different from cur_root_graph_id: " << Debugger::GetInstance()->GetCurrentRootGraphId()
  1343. << ".";
  1344. MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
  1345. }
  1346. #endif
  1347. (void)ret_name->emplace_back(std::get<0>(result));
  1348. (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
  1349. (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
  1350. (void)dtype->emplace_back(std::get<1>(result)->GetType());
  1351. (void)shape->emplace_back(std::get<1>(result)->GetShape());
  1352. }
  1353. }
  1354. void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
  1355. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  1356. if (result_list == nullptr) {
  1357. MS_LOG(DEBUG) << "result_list is nullptr.";
  1358. return;
  1359. }
  1360. tensor_loader_->SearchTensors(name, result_list);
  1361. }
  1362. #ifdef ONLINE_DBG_MODE
  1363. bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
  1364. bool ret = false;
  1365. for (auto w_table_item : watchpoint_table_) {
  1366. auto check_node_list = std::get<1>(w_table_item).check_node_list;
  1367. for (auto check_node : check_node_list) {
  1368. std::string w_name = std::get<0>(check_node);
  1369. bool w_type = std::get<1>(check_node);
  1370. if ((w_type == true &&
  1371. ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
  1372. (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
  1373. ret = true;
  1374. return ret;
  1375. }
  1376. }
  1377. }
  1378. return ret;
  1379. }
  1380. bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
  1381. if (kernel != nullptr && w_name.length() > 0) {
  1382. auto input_size = AnfAlgo::GetInputTensorNum(kernel);
  1383. for (size_t j = 0; j < input_size; ++j) {
  1384. auto input_kernel = kernel->input(j + 1);
  1385. std::string input_kernel_name = GetKernelNodeName(input_kernel);
  1386. auto found = w_name.find_last_of('/');
  1387. if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
  1388. return true;
  1389. }
  1390. return false;
  1391. } else {
  1392. return false;
  1393. }
  1394. }
  1395. #endif
// Returns every tensor currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
// Returns the named tensor from the tensor loader (null-on-miss semantics are
// the loader's; not visible here).
std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
  return tensor_loader_->GetTensor(tensor_name);
}
// Clears the loader's "current" tensor map.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
  1401. #ifdef ONLINE_DBG_MODE
// Forwards a tensor-to-disk dump request to the tensor loader.
// trans_flag presumably selects host-format transformation — semantics live in
// TensorLoader::DumpTensorToFile (TODO confirm).  Returns the loader's
// success flag.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
  1409. #endif
// Inserts a freshly loaded tensor into the loader's current map; keep_prev
// presumably retains the previous iteration's copy as well — semantics live in
// TensorLoader::LoadNewTensor (TODO confirm).  Returns the loader's result.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
  1413. uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
  1414. uint32_t prev_iter;
  1415. uint32_t rank_id = tensor->GetDeviceId();
  1416. uint32_t root_graph_id = tensor->GetRootGraphId();
  1417. std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
  1418. if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
  1419. return UINT32_MAX;
  1420. }
  1421. auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
  1422. tensor->GetIteration());
  1423. if (it == graphs_run_history_[rank_and_graph].end()) {
  1424. // The graph is not executed in that iteration
  1425. return UINT32_MAX;
  1426. } else if (it == graphs_run_history_[rank_and_graph].begin()) {
  1427. // current iteration is the first iteration that the graph was run
  1428. // no prev iter is available
  1429. MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
  1430. << " is the first run iteration for tensor: " << tensor->GetName();
  1431. return UINT32_MAX;
  1432. }
  1433. it--;
  1434. prev_iter = *it;
  1435. tensor->SetPrevIteration(prev_iter);
  1436. return prev_iter;
  1437. }
// Clears per-iteration debugger state: the watchpoint-hit id cache, all
// non-parameter tensors, and the cached overflow scan results.  Parameters are
// preserved by moving them to the "previous" map and swapping back.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
  1447. #ifdef ONLINE_DBG_MODE
  1448. std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
  1449. MS_EXCEPTION_IF_NULL(kernel);
  1450. std::vector<std::shared_ptr<TensorData>> result;
  1451. auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
  1452. auto kernel_name = GetKernelNodeName(kernel);
  1453. for (size_t j = 0; j < output_size; ++j) {
  1454. auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
  1455. auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
  1456. if (tensor != nullptr) {
  1457. result.push_back(tensor);
  1458. }
  1459. }
  1460. return result;
  1461. }
  1462. #endif
  1463. bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
  1464. unsigned int iteration) {
  1465. std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
  1466. std::vector<std::string> op_names;
  1467. std::string overflow_bin_path;
  1468. #ifdef ONLINE_DBG_MODE
  1469. if (DumpJsonParser::GetInstance().path().empty()) {
  1470. // Dump config is not set.
  1471. return false;
  1472. }
  1473. auto debugger = Debugger::GetInstance();
  1474. overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->root_graph_id());
  1475. auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
  1476. if (!realpath.has_value()) {
  1477. MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
  1478. return false;
  1479. }
  1480. overflow_bin_path = realpath.value() + '/';
  1481. #else
  1482. overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
  1483. std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
  1484. overflow_bin_path = RealPath(overflow_bin_path);
  1485. #endif
  1486. overflow_wp_lock_.lock();
  1487. MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
  1488. auto found_overflows = overflow_ops_.find(overflow_bin_path);
  1489. if (found_overflows != overflow_ops_.end()) {
  1490. MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
  1491. op_names = overflow_ops_[overflow_bin_path];
  1492. } else {
  1493. std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
  1494. std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
  1495. const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  1496. MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
  1497. std::string abspath = RealPath(overflow_bin_path);
  1498. DIR *d = opendir(abspath.c_str());
  1499. if (d == nullptr) {
  1500. MS_LOG(ERROR) << "OverFlow bin directory does not exist!";
  1501. } else {
  1502. struct dirent *dir = nullptr;
  1503. while ((dir = readdir(d)) != nullptr) {
  1504. if (dir->d_type == DT_REG) {
  1505. // form fully qualified filename
  1506. std::string file_path = overflow_bin_path;
  1507. std::string file_name = dir->d_name;
  1508. (void)file_path.append(file_name);
  1509. // attempt to read the file
  1510. std::ifstream infile;
  1511. infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  1512. if (!infile.is_open()) {
  1513. MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
  1514. continue;
  1515. }
  1516. std::string node_name;
  1517. uint64_t task_id = 0;
  1518. uint64_t stream_id = 0;
  1519. // detect overflow bin file
  1520. if (file_name.rfind(overflow_file_prefix, 0) == 0) {
  1521. // start of op overflow data in bin file
  1522. const uint32_t offset = 321;
  1523. (void)infile.seekg(offset, std::ios::beg);
  1524. std::vector<char> buffer;
  1525. // size of op overflow info section
  1526. const size_t buf_size = 256;
  1527. buffer.resize(buf_size);
  1528. (void)infile.read(buffer.data(), buf_size);
  1529. if (infile.gcount() != buf_size) {
  1530. MS_LOG(ERROR) << "The file: " << file_path << "may be damaged!";
  1531. continue;
  1532. }
  1533. const uint8_t stream_id_offset = 16;
  1534. const uint8_t task_id_offset = 24;
  1535. // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
  1536. // byte values currently.
  1537. stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
  1538. task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
  1539. MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
  1540. << ".";
  1541. task_stream_hit.push_back(std::make_pair(task_id, stream_id));
  1542. } else {
  1543. // regular bin file
  1544. bool success_parse = GetAttrsFromAsyncFilename(file_name, &node_name, &task_id, &stream_id);
  1545. if (success_parse) {
  1546. task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
  1547. }
  1548. }
  1549. infile.close();
  1550. }
  1551. }
  1552. (void)closedir(d);
  1553. }
  1554. // find the op_names with an overflow hit
  1555. for (auto &task_stream : task_stream_hit) {
  1556. auto op_name = task_stream_to_opname[task_stream];
  1557. if (!op_name.empty()) {
  1558. MS_LOG(INFO) << "Operation overflow detected in " << op_name;
  1559. op_names.push_back(op_name);
  1560. }
  1561. }
  1562. overflow_ops_[overflow_bin_path] = op_names;
  1563. }
  1564. overflow_wp_lock_.unlock();
  1565. // remove prefix "kernel_graph_#_" from node_name_to_find before checking it
  1566. std::string op_name_to_find = RemoveKernelGraphPrefix(node_name_to_find);
  1567. // determine if overflow wp has been triggered for node_name_to_find
  1568. if (find(op_names.begin(), op_names.end(), op_name_to_find) != op_names.end()) {
  1569. MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
  1570. return true;
  1571. }
  1572. return false;
  1573. }
  1574. std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) {
  1575. std::string op_name_to_find = node_name_to_find;
  1576. const std::string kernel_prefix = "kernel_graph_";
  1577. if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
  1578. auto start_of_op_name = node_name_to_find.find("_", kernel_prefix.length());
  1579. if (start_of_op_name != std::string::npos) {
  1580. op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
  1581. }
  1582. }
  1583. return op_name_to_find;
  1584. }
  1585. bool DebugServices::GetAttrsFromAsyncFilename(const std::string &file_name, std::string *const node_name,
  1586. uint64_t *task_id, uint64_t *stream_id) {
  1587. // get the node_name, task_id, and stream_id from async dump filename
  1588. // node_type.node_name.task_id.stram_id.timestamp
  1589. // WARNING: node_name may have dots in it
  1590. size_t fourth_dot = file_name.rfind(".");
  1591. size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  1592. size_t second_dot = file_name.rfind(".", third_dot - 1);
  1593. size_t first_dot = file_name.find(".");
  1594. // check if dots were found
  1595. if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
  1596. fourth_dot == std::string::npos) {
  1597. return false;
  1598. }
  1599. // check if its not an async bin file
  1600. if (file_name.substr(fourth_dot) == ".npy") {
  1601. return false;
  1602. }
  1603. // get node_name
  1604. if (first_dot < second_dot) {
  1605. *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  1606. } else {
  1607. MS_LOG(ERROR) << "Async filename parse error to get node_name.";
  1608. return false;
  1609. }
  1610. // get task id
  1611. if (second_dot < third_dot) {
  1612. std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
  1613. try {
  1614. *task_id = std::stoull(extracted_task_id);
  1615. } catch (std::invalid_argument &e) {
  1616. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
  1617. return false;
  1618. } catch (std::out_of_range &e) {
  1619. MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
  1620. return false;
  1621. }
  1622. } else {
  1623. MS_LOG(ERROR) << "Async filename parse error to get task_id.";
  1624. return false;
  1625. }
  1626. // get stream id
  1627. if (third_dot < fourth_dot) {
  1628. std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
  1629. try {
  1630. *stream_id = std::stoull(extracted_stream_id);
  1631. } catch (std::invalid_argument &e) {
  1632. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
  1633. return false;
  1634. } catch (std::out_of_range &e) {
  1635. MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
  1636. return false;
  1637. }
  1638. } else {
  1639. MS_LOG(ERROR) << "Async filename parse error to get stream_id.";
  1640. return false;
  1641. }
  1642. return true;
  1643. }
// Resolves input_path to an absolute path.
// - "dir/file": the directory part must already exist (resolved via realpath);
//   the file part is appended unresolved, so the file itself may not exist yet.
//   Returns "" (after an ERROR log) when the directory is missing.
// - bare file name: resolved directly; when it does not exist an INFO log is
//   emitted and the function returns the buffer contents — "" given the
//   zero-initialization below.
// Throws via MS_LOG(EXCEPTION) when the path or file-name length exceeds the
// platform limits (PATH_MAX / NAME_MAX).
std::string DebugServices::RealPath(const std::string &input_path) {
  if (input_path.length() >= PATH_MAX) {
    MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  }
  size_t path_split_pos = input_path.find_last_of('/');
  // get real path
  char real_path[PATH_MAX] = {0};
  // input_path is dir + file_name
  if (path_split_pos != std::string::npos) {
    std::string prefix_path = input_path.substr(0, path_split_pos);
    // file_name keeps its leading '/', so prefix + file_name re-forms the path.
    std::string file_name = input_path.substr(path_split_pos);
    if (file_name.length() > NAME_MAX) {
      MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
    }
    if (realpath(prefix_path.c_str(), real_path) == nullptr) {
      MS_LOG(ERROR) << "The dir " << prefix_path << " does not exist.";
      return "";
    }
    return std::string(real_path) + file_name;
  }
  // input_path is only file_name
  if (input_path.length() > NAME_MAX) {
    MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  }
  if (realpath(input_path.c_str(), real_path) == nullptr) {
    MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
    // NOTE(review): POSIX leaves the buffer unspecified after a failed
    // realpath(); returning it assumes the call did not partially write —
    // confirm this is the intended "" result on all supported platforms.
  }
  return std::string(real_path);
}
  1673. uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
  1674. #if defined(__APPLE__)
  1675. return *reinterpret_cast<const uint64_t *>(buffer.data());
  1676. #else
  1677. return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
  1678. #endif
  1679. }
// True when the named tensor is present in the loader's "current" map.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Demotes the named tensor from the loader's "current" map to its "previous" map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
// Queues the tensor for cache eviction, but only when the loader is running
// with memory control (a configured memory limit) enabled.
void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  if (tensor_loader_->EnableMemoryControl()) {
    tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  }
}
// Trivial accessors for the debugger session configuration.
void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
std::string DebugServices::GetNetName() { return net_name_; }
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
std::string DebugServices::GetDumpDir() { return dump_dir_; }
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
bool DebugServices::GetSyncMode() { return is_sync_mode_; }
// Forwards the memory cap to the tensor loader's cache controller.
void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
  1698. #ifdef ONLINE_DBG_MODE
  1699. } // namespace mindspore
  1700. #endif