You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

debugger.cc 67 kB

4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784
  1. /**
  2. * Copyright 2020-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <dirent.h>
  17. #include <cstdio>
  18. #include <fstream>
  19. #include <tuple>
  20. #include <vector>
  21. #include <algorithm>
  22. #include <iostream>
  23. #include <cstring>
  24. #include <utility>
  25. #include <map>
  26. #include <regex>
  27. #include "debug/debugger/debugger.h"
  28. #include "debug/data_dump/dump_json_parser.h"
  29. #include "pipeline/jit/pipeline.h"
  30. #include "backend/common/session/anf_runtime_algorithm.h"
  31. #include "runtime/device/kernel_runtime_manager.h"
  32. #include "runtime/device/kernel_runtime.h"
  33. #include "debug/data_dump/e2e_dump.h"
  34. #include "utils/config_manager.h"
  35. #include "debug/env_config_parser.h"
  36. #include "utils/comm_manager.h"
  37. #include "runtime/hardware/device_context_manager.h"
  38. #include "debug/anf_ir_dump.h"
  39. #include "debug/anf_ir_utils.h"
  40. #include "runtime/graph_scheduler/device_tensor_store.h"
  41. #ifdef ENABLE_DEBUGGER
  42. #include "debug/debugger/proto_exporter.h"
  43. #else
  44. #include "debug/debugger/proto_exporter_stub.h"
  45. #endif
  46. using debugger::Chunk;
  47. using debugger::EventReply;
  48. using debugger::GraphProto;
  49. using debugger::ModelProto;
  50. using debugger::Statistics;
  51. using debugger::TensorProto;
  52. using debugger::WatchCondition;
  53. using debugger::WatchCondition_Condition_inf;
  54. using debugger::WatchCondition_Condition_nan;
  55. using debugger::WatchCondition_Parameter;
  56. using debugger::WatchNode;
  57. using debugger::WatchpointHit;
  58. using mindspore::runtime::DeviceTensorStore;
  59. namespace mindspore {
  60. static constexpr auto g_chunk_size = 1024 * 1024 * 3;
  61. static constexpr int32_t heartbeat_period_second = 30;
  62. DebuggerPtr Debugger::debugger_ = nullptr;
  63. std::mutex Debugger::instance_lock_;
  64. Debugger::Debugger()
  65. : grpc_client_(nullptr),
  66. debug_services_(nullptr),
  67. heartbeat_thread_(nullptr),
  68. device_id_(0),
  69. device_target_(""),
  70. num_step_(0),
  71. debugger_enabled_(false),
  72. suspended_at_last_kernel_(false),
  73. run_level_(""),
  74. node_name_(""),
  75. cur_name_(""),
  76. training_done_(false),
  77. send_metadata_done_(false),
  78. received_new_graph_(false),
  79. is_dataset_graph_(false),
  80. partial_memory_(false),
  81. initial_suspend_(true),
  82. enable_heartbeat_(false),
  83. not_dataset_graph_sum_(0),
  84. ascend_kernel_by_kernel_(false),
  85. version_("") {
  86. CheckDebuggerEnabledParam();
  87. auto ms_context = MsContext::GetInstance();
  88. MS_EXCEPTION_IF_NULL(ms_context);
  89. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  90. MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  91. if (device_target == kCPUDevice) {
  92. MS_LOG(WARNING) << "Not enabling debugger. Debugger does not support CPU.";
  93. } else if (CheckDebuggerEnabled()) {
  94. // configure partial memory reuse
  95. partial_memory_ = CheckDebuggerPartialMemoryEnabled();
  96. // switch memory reuse on or off
  97. EnvConfigParser::GetInstance().SetSysMemreuse(partial_memory_);
  98. // print some message about memory reuse to user
  99. if (partial_memory_) {
  100. MS_LOG(WARNING)
  101. << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
  102. "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
  103. } else {
  104. MS_LOG(WARNING)
  105. << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
  106. "usage for large models.";
  107. }
  108. }
  109. }
  110. void Debugger::Init(const uint32_t device_id, const std::string device_target) {
  111. // access lock for public method
  112. std::lock_guard<std::mutex> a_lock(access_lock_);
  113. // save device_id
  114. MS_LOG(INFO) << "Debugger got device_id: " << device_id;
  115. device_id_ = device_id;
  116. MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  117. device_target_ = device_target;
  118. version_ = MSVERSION;
  119. }
  120. bool IsTypeDebuggerSupported(TypeId type) {
  121. if (type < TypeId::kNumberTypeEnd && type > TypeId::kNumberTypeBegin && type != kNumberTypeComplex64) {
  122. return true;
  123. }
  124. MS_LOG(INFO) << "Debugger does not support type: " << TypeIdLabel(type);
  125. return false;
  126. }
  127. void Debugger::EnableDebugger() {
  128. // reset some of the class members
  129. num_step_ = 0;
  130. debugger_enabled_ = false;
  131. enable_heartbeat_ = false;
  132. partial_memory_ = false;
  133. grpc_client_ = nullptr;
  134. debug_services_ = nullptr;
  135. heartbeat_thread_ = nullptr;
  136. // see if dump using debugger backend is enabled
  137. bool dump_enabled = CheckDebuggerDumpEnabled();
  138. MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;
  139. // check if debugger enabled
  140. debugger_enabled_ = CheckDebuggerEnabled();
  141. MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;
  142. if (!debugger_enabled_ && !dump_enabled) {
  143. MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
  144. return;
  145. }
  146. if (debugger_enabled_) {
  147. // configure grpc host
  148. std::string env_host_str = common::GetEnv("MS_DEBUGGER_HOST");
  149. std::string host;
  150. if (!env_host_str.empty()) {
  151. if (CheckIp(env_host_str)) {
  152. MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
  153. host = env_host_str;
  154. } else {
  155. debugger_enabled_ = false;
  156. MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_HOST isn't a valid IP address. "
  157. "Please set environment variable MS_DEBUGGER_HOST=x.x.x.x to a valid IP";
  158. }
  159. } else {
  160. MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
  161. host = "localhost";
  162. }
  163. // configure grpc port
  164. std::string env_port_str = common::GetEnv("MS_DEBUGGER_PORT");
  165. std::string port;
  166. if (!env_port_str.empty()) {
  167. if (CheckPort(env_port_str)) {
  168. MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
  169. port = env_port_str;
  170. } else {
  171. debugger_enabled_ = false;
  172. MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_PORT is not valid. Custom port ranging from 1 to "
  173. "65535";
  174. }
  175. } else {
  176. port = "50051";
  177. if (!CheckPort(port)) {
  178. MS_EXCEPTION(ValueError) << "Default MS_DEBUGGER_PORT is not valid. Custom port ranging from 1 to 65535";
  179. }
  180. MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
  181. }
  182. // initialize grpc client
  183. grpc_client_ = std::make_unique<GrpcClient>(host, port);
  184. // initialize sending heartbeat
  185. heartbeat_thread_ = std::make_unique<std::thread>([this]() { SendHeartbeat(heartbeat_period_second); });
  186. }
  187. debug_services_ = std::make_unique<DebugServices>();
  188. }
  189. void Debugger::CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
  190. bool sink_mode = ConfigManager::GetInstance().dataset_mode() || graph_ptr->IsDatasetGraph();
  191. if (CheckDebuggerDumpEnabled() && sink_mode && device_target_ == kGPUDevice) {
  192. MS_EXCEPTION(NotSupportError)
  193. << "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  194. }
  195. if (CheckDebuggerEnabled() && sink_mode) {
  196. MS_EXCEPTION(NotSupportError)
  197. << "Debugger is not supported with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  198. }
  199. }
  200. bool Debugger::CheckDebuggerDumpEnabled() const {
  201. // see if dump is enabled
  202. auto &dump_json_parser = DumpJsonParser::GetInstance();
  203. if (device_target_ == kGPUDevice) {
  204. return dump_json_parser.e2e_dump_enabled();
  205. } else if (device_target_ == kAscendDevice) {
  206. return dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled();
  207. }
  208. return false;
  209. }
  210. bool Debugger::CheckDebuggerEnabled() const {
  211. // get env variables to configure debugger
  212. std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  213. if (!env_enable_str.empty()) {
  214. (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
  215. if ((env_enable_str == "1" || env_enable_str == "true") && device_target_ != kCPUDevice) {
  216. return true;
  217. }
  218. }
  219. return false;
  220. }
  221. void Debugger::CheckDebuggerEnabledParam() const {
  222. // check the value of env variable ENABLE_MS_DEBUGGER
  223. std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  224. if (!env_enable_str.empty()) {
  225. (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
  226. if (env_enable_str != "0" && env_enable_str != "1" && env_enable_str != "false" && env_enable_str != "true") {
  227. MS_LOG(WARNING) << "Env variable ENABLE_MS_DEBUGGER should be True/False/1/0 (case insensitive), but get: "
  228. << env_enable_str;
  229. }
  230. }
  231. }
  232. bool Debugger::CheckDebuggerPartialMemoryEnabled() const {
  233. std::string env_partial_mem_str = common::GetEnv("MS_DEBUGGER_PARTIAL_MEM");
  234. if (!env_partial_mem_str.empty()) {
  235. MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
  236. if (env_partial_mem_str == "1") {
  237. return true;
  238. }
  239. }
  240. return false;
  241. }
  242. /*
  243. * Feature group: Dump, Online debugger.
  244. * Target device group: Ascend, GPU.
  245. * Runtime category: Old runtime, MindRT
  246. * Description: Returns true if online debugger or dump is enabled.
  247. */
  248. bool Debugger::DebuggerBackendEnabled() const { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
  249. void Debugger::Reset() {
  250. // access lock for public method
  251. std::lock_guard<std::mutex> a_lock(access_lock_);
  252. // reset components
  253. if (heartbeat_thread_ && heartbeat_thread_->joinable()) {
  254. SetEnableHeartbeat(false);
  255. heartbeat_thread_->join();
  256. MS_LOG(INFO) << "Join Heartbeat thread.";
  257. }
  258. heartbeat_thread_ = nullptr;
  259. device_id_ = 0;
  260. device_target_ = "";
  261. num_step_ = 0;
  262. debugger_enabled_ = false;
  263. is_dataset_graph_ = false;
  264. partial_memory_ = false;
  265. graph_ptr_ = nullptr;
  266. grpc_client_ = nullptr;
  267. debug_services_ = nullptr;
  268. graph_proto_list_.clear();
  269. graph_ptr_list_.clear();
  270. graph_ptr_step_vec_.clear();
  271. parameters_mindRT_.clear();
  272. visited_root_graph_ids_.clear();
  273. MS_LOG(INFO) << "Release Debugger resource.";
  274. }
  275. /*
  276. * Feature group: Dump, Online debugger.
  277. * Target device group: Ascend, GPU.
  278. * Runtime category: MindRT.
  279. * Description: Sets root_graph_id for all the graphs in the compiled graph list. Sets cur_root_graph_id_ and
  280. * prev_root_graph_id_ and calls PreExecute function for all the graphs.
  281. */
  282. void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs,
  283. const std::vector<AnfNodePtr> &origin_parameters_order) {
  284. // MindRTBackend for GPU and Ascend
  285. if (device_target_ == kCPUDevice) {
  286. return;
  287. }
  288. // Store graphs that are run in one step.
  289. graph_ptr_step_vec_ = graphs;
  290. parameters_mindRT_ = origin_parameters_order;
  291. prev_root_graph_id_ = cur_root_graph_id_;
  292. // set first run graph as the root graph
  293. cur_root_graph_id_ = graph_ptr_step_vec_[0]->graph_id();
  294. MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
  295. << " for step: " << num_step_ << ".";
  296. MS_LOG(DEBUG) << "Set root graph for all the subgraphs:";
  297. for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
  298. const auto &graph = graphs[graph_index];
  299. // set root graph id for GPU mindrt runtime.
  300. MS_LOG(DEBUG) << "Set root graph for graph: " << graph->graph_id() << " to: " << cur_root_graph_id_ << ".";
  301. graph->set_root_graph_id(cur_root_graph_id_);
  302. if (debugger_) {
  303. debugger_->PreExecute(graph);
  304. }
  305. }
  306. }
  307. /*
  308. * Feature group: Dump.
  309. * Target device group: Ascend.
  310. * Runtime category: Old runtime, MindRT.
  311. * Description: When async dump is enabled and dataset_sink_mode is true, graph_iter_num_map_ stores the number of
  312. * iterations per epoch for each running graph.
  313. */
  314. void Debugger::UpdateGraphIterMap(uint32_t graph_id, int32_t iter_num) {
  315. if (graph_iter_num_map_.find(graph_id) == graph_iter_num_map_.end()) {
  316. graph_iter_num_map_[graph_id] = iter_num;
  317. }
  318. }
  319. /*
  320. * Feature group: Dump, Online debugger.
  321. * Target device group: Ascend.
  322. * Runtime category: Old runtime.
  323. * Description: For Ascend old runtime, this function sets the current and previous root graph id.
  324. */
  325. void Debugger::SetCurrentAndPrevRootGraph(uint32_t root_graph_id) {
  326. // for GPU and ascend MindRT root graphs are set in PreExecuteGraphDebugger.
  327. if (device_target_ != kAscendDevice || MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  328. return;
  329. }
  330. prev_root_graph_id_ = cur_root_graph_id_;
  331. cur_root_graph_id_ = root_graph_id;
  332. MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
  333. << " for step: " << num_step_ << ".";
  334. }
  335. /*
  336. * Feature group: Dump, Online debugger.
  337. * Target device group: GPU.
  338. * Runtime category: Old runtime.
  339. * Description: In the case of GPU old runtime and when we have multiple subgraphs, we use the first run graph id to
  340. * update the step number.
  341. */
  342. void Debugger::StoreRunGraphIdList(uint32_t graph_id) {
  343. // collect rungrap_ids to update step number in multigraph case for GPU old runtime
  344. if (!rungraph_id_list_.size()) {
  345. rungraph_id_list_.push_back(graph_id);
  346. } else {
  347. if (std::find(rungraph_id_list_.begin(), rungraph_id_list_.end(), graph_id) == rungraph_id_list_.end()) {
  348. rungraph_id_list_.push_back(graph_id);
  349. }
  350. }
  351. }
  352. /*
  353. * Feature group: Dump, Online debugger.
  354. * Target device group: Ascend, GPU.
  355. * Runtime category: Old runtime, MindRT.
  356. * Description: Sets previous and current root_graph_id for Ascend old runtime, sends graphs to online debugger when
  357. * debugger_enabled_ is true.
  358. */
  359. void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
  360. MS_EXCEPTION_IF_NULL(graph_ptr);
  361. // access lock for public method
  362. std::lock_guard<std::mutex> a_lock(access_lock_);
  363. if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  364. // Checking dataset_sink_mode for mindRT is done in debug_actor
  365. CheckDatasetSinkMode(graph_ptr);
  366. }
  367. auto graph_id = graph_ptr->graph_id();
  368. MS_LOG(DEBUG) << "PreExecute for graph: " << graph_id << " in step: " << num_step_ << ".";
  369. StoreRunGraphIdList(graph_id);
  370. SetCurrentAndPrevRootGraph(graph_ptr->root_graph_id());
  371. // multiple graphs
  372. if (graph_proto_list_.size() > 1) {
  373. // there are more than one graphs are not dataset_graph
  374. if (not_dataset_graph_sum_ > 0) {
  375. SendMultiGraphsAndClear(graph_ptr);
  376. }
  377. } else if (graph_proto_list_.size() == 1) {
  378. // single graph, and not the initial step
  379. if (device_target_ == kGPUDevice && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT) &&
  380. num_step_ != 0) {
  381. if (debugger_enabled_ && !(run_level_ == "node" && suspended_at_last_kernel_)) {
  382. CommandLoop();
  383. }
  384. debug_services_->ResetLoadedTensors();
  385. }
  386. // In single graph case, reset graph_ptr_ to be nullptr when debugger receives a new graph
  387. if (received_new_graph_) {
  388. graph_ptr_ = nullptr;
  389. CheckGraphPtr(graph_ptr);
  390. }
  391. } else if (debugger_enabled_ && graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice &&
  392. !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  393. // Multiple graph, and not the initial step,
  394. // stop only when receive the first sub run graph for each step for old runtime
  395. // if we have stopped for the last kernel before, no need to stop again
  396. if (pipeline::GraphExecutorPy::GetDebugTerminate()) {
  397. return;
  398. }
  399. if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
  400. CommandLoop();
  401. }
  402. debug_services_->ResetLoadedTensors();
  403. }
  404. // resets for the new graph
  405. suspended_at_last_kernel_ = false;
  406. }
  407. /*
  408. * Feature group: Online debugger.
  409. * Target device group: Ascend, GPU.
  410. * Runtime category: Old runtime, MindRT.
  411. * Description: Sends all the subgraphs to online debugger when debugger_enabled_ is true.
  412. */
  413. void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
  414. // only try to enable debugger if they are not all dataset graphs
  415. if (!debugger_enabled_) {
  416. EnableDebugger();
  417. }
  418. if (debugger_enabled_) {
  419. // only send compiled graphs once at the initial step.
  420. auto dbg_graph_ptr = graph_ptr_;
  421. // use current graph ptr to load parameters
  422. graph_ptr_ = graph_ptr;
  423. LoadParametersAndConst();
  424. // revert graph ptr to original value
  425. graph_ptr_ = dbg_graph_ptr;
  426. SendMultiGraphsAndSuspend(graph_proto_list_);
  427. graph_proto_list_.clear();
  428. received_new_graph_ = false;
  429. }
  430. }
  431. /*
  432. * Feature group: Dump.
  433. * Target device group: Ascend, GPU.
  434. * Runtime category: MindRT.
  435. * Description: Returns the rank_id for GPU and Ascend kernel-bykernel mindRT.
  436. */
  437. uint32_t Debugger::GetRankID() {
  438. auto ms_context = MsContext::GetInstance();
  439. MS_EXCEPTION_IF_NULL(ms_context);
  440. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  441. uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  442. const auto &device_context =
  443. device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  444. uint32_t rank_id = device_context->GetRankID();
  445. return rank_id;
  446. }
  447. /*
  448. * Feature group: Dump.
  449. * Target device group: Ascend, GPU.
  450. * Runtime category: MindRT.
  451. * Description: When dump is enabled, this function: 1) Dumps parameters for the current root_graph_id to the
  452. * root_graph's directory. 2) Dumps constant data once for each graph. 3) Dumps graph run history for each graph.
  453. */
  454. void Debugger::DumpParamsAndConstAndHistory() {
  455. if (!CheckDebuggerDumpEnabled()) {
  456. return;
  457. }
  458. LoadParametersAllGraphs();
  459. (void)E2eDump::DumpParametersData(GetRankID(), debugger_.get());
  460. // Whether constant data was already dumped for the current root graph.
  461. bool cur_root_graph_checked = std::find(visited_root_graph_ids_.begin(), visited_root_graph_ids_.end(),
  462. cur_root_graph_id_) != visited_root_graph_ids_.end();
  463. for (auto graph : graph_ptr_step_vec_) {
  464. if (!cur_root_graph_checked) {
  465. LoadConstsForGraph(graph);
  466. // Dump constant data for GPU.
  467. E2eDump::DumpConstantData(graph.get(), GetRankID(), debugger_.get());
  468. // Dump constant data for Ascend.
  469. DumpConstantDataAscend(graph);
  470. }
  471. // Dump graph run hisotry for each graph.
  472. E2eDump::DumpRunIter(graph, GetRankID());
  473. }
  474. if (!cur_root_graph_checked) {
  475. visited_root_graph_ids_.push_back(cur_root_graph_id_);
  476. }
  477. }
  478. void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) {
  479. if (device_target_ != kAscendDevice) {
  480. return;
  481. }
  482. auto &json_parser = DumpJsonParser::GetInstance();
  483. if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
  484. // Dump constant data for ascend mindRT, for old runtime constant data is dumped in session_basic.
  485. uint32_t rank_id = GetRankID();
  486. std::string cst_file_dir = GenerateDumpPath(graph->root_graph_id(), rank_id, true);
  487. DumpConstantInfo(graph, cst_file_dir);
  488. }
  489. }
  490. /*
  491. * Feature group: Dump.
  492. * Target device group: Ascend, GPU.
  493. * Runtime category: MindRT.
  494. * Description: Dumps a single node for given graph_id.
  495. */
  496. void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) {
  497. if (debugger_ && debugger_->DebuggerBackendEnabled()) {
  498. uint32_t rank_id = GetRankID();
  499. (void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get());
  500. }
  501. }
  502. /*
  503. * Feature group: Dump.
  504. * Target device group: GPU.
  505. * Runtime category: MindRT.
  506. * Description: This function is used for new GPU runtime using MindRTBackend, on Ascend platform, graphs are saved in
  507. * session_basic.
  508. */
  509. void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  510. if (device_target_ == kAscendDevice) {
  511. return;
  512. }
  513. auto &json_parser = DumpJsonParser::GetInstance();
  514. if (json_parser.e2e_dump_enabled()) {
  515. uint32_t rank_id = GetRankID();
  516. kernel_graph->set_root_graph_id(kernel_graph->graph_id());
  517. std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
  518. std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
  519. std::string target_dir = root_dir + "/graphs";
  520. std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
  521. DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
  522. DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
  523. DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
  524. kernel_graph->execution_order());
  525. }
  526. }
  527. /*
  528. * Feature group: Dump, Online debugger.
  529. * Target device group: Ascend, GPU and CPU.
  530. * Runtime category: MindRT.
  531. * Description: Load and dump parameters and constant data, call postExecute and update dump iter.
  532. */
  533. void Debugger::PostExecuteGraphDebugger() {
  534. // On CPU, update dump iteration, Parameters and consts are not dumped here
  535. if (device_target_ == kCPUDevice) {
  536. DumpJsonParser::GetInstance().UpdateDumpIter();
  537. return;
  538. }
  539. DumpParamsAndConstAndHistory();
  540. // debug used for dump
  541. if (CheckDebuggerDumpEnabled() && !debugger_enabled()) {
  542. ClearCurrentData();
  543. }
  544. if (debugger_) {
  545. debugger_->PostExecute();
  546. }
  547. E2eDump::UpdateIterMindRTDump();
  548. }
  549. /*
  550. * Feature group: Online debugger.
  551. * Target device group: Ascend, GPU.
  552. * Runtime category: Old runtime, MindRT.
  553. * Description: Send hit watchpoints, update the step number and reset loaded tensors.
  554. */
  555. void Debugger::PostExecute() {
  556. // access lock for public method
  557. std::lock_guard<std::mutex> a_lock(access_lock_);
  558. if (pipeline::GraphExecutorPy::GetDebugTerminate()) {
  559. return;
  560. }
  561. if (debugger_ && debugger_->DebuggerBackendEnabled()) {
  562. // analyze tensor data and send the watchpoints been hit
  563. if (debugger_enabled_ && !is_dataset_graph_) {
  564. SendWatchpoints(CheckWatchpoints());
  565. // no need to suspend at each graph for GPU old runtime, suspension happens in preExecute
  566. if (device_target_ == kAscendDevice) {
  567. CommandLoop();
  568. } else if (device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  569. if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
  570. CommandLoop();
  571. }
  572. }
  573. if (device_target_ != kGPUDevice) {
  574. num_step_++;
  575. }
  576. }
  577. // Only keep parameters in th current map
  578. // GPU ResetLoadedTensors for old runtime happens in preExecute
  579. if ((device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  580. device_target_ == kAscendDevice) {
  581. if (debug_services_ != nullptr) {
  582. debug_services_->ResetLoadedTensors();
  583. } else {
  584. MS_LOG(DEBUG) << "debug_services_ is nullptr";
  585. }
  586. }
  587. }
  588. }
  589. bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  590. if (debugger_enabled_ && !is_dataset_graph_) {
  591. auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);
  592. // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data
  593. if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) {
  594. return true;
  595. }
  596. }
  597. return false;
  598. }
  599. /*
  600. * Feature group: Online debugger.
  601. * Target device group: GPU.
  602. * Runtime category: Old runtime, MindRT.
  603. * Description: Check and send watchpoint hit for a single node, suspend if a watchpoint is hit or we are continuing
  604. * in node level.
  605. */
  606. void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  607. // access lock for public method
  608. std::lock_guard<std::mutex> a_lock(access_lock_);
  609. if (pipeline::GraphExecutorPy::GetDebugTerminate()) {
  610. return;
  611. }
  612. if (debugger_enabled_ && !is_dataset_graph_) {
  613. auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);
  614. // if kernel is watchpoint,and get hit. suspend.
  615. bool hit_empty_flag = true;
  616. if (is_watchpoint) {
  617. auto hits = CheckWatchpoints(cur_name_, kernel);
  618. if (!hits.empty()) {
  619. SendWatchpoints(hits);
  620. CommandLoop();
  621. hit_empty_flag = false;
  622. }
  623. }
  624. if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
  625. // if kernel is not watchpoint and is next_to or continue_to node, suspend
  626. // sets a bool to be checked in preExecute to avoid double stopping at last kernel in the last graph
  627. if (last_kernel) {
  628. suspended_at_last_kernel_ = true;
  629. }
  630. CommandLoop();
  631. }
  632. return;
  633. }
  634. }
  635. /*
  636. * Feature group: Dump, Online debugger.
  637. * Target device group: Ascend, GPU.
  638. * Runtime category: Old runtime, MindRT.
  639. * Description: Get graph proto and add it to graph proto list and add loaded graph pointers to a list.
  640. */
  641. void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
  642. MS_EXCEPTION_IF_NULL(graph_ptr);
  643. if (graph_ptr_ != graph_ptr) {
  644. MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id();
  645. received_new_graph_ = true;
  646. // save new graph_ptr
  647. graph_ptr_ = graph_ptr;
  648. CheckDatasetGraph();
  649. if (!is_dataset_graph_) {
  650. // get proto for new graph_ptr
  651. auto graph_proto = GetGraphProto(graph_ptr);
  652. // add new graph proto to graph_proto_list_
  653. graph_proto_list_.push_back(graph_proto);
  654. graph_ptr_list_.push_back(graph_ptr);
  655. not_dataset_graph_sum_++;
  656. }
  657. // reset is_dataset_graph to be false
  658. is_dataset_graph_ = false;
  659. }
  660. }
  661. // In single graph cases, check single graph ptr
  662. void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
  663. MS_EXCEPTION_IF_NULL(graph_ptr);
  664. if (graph_ptr_ != graph_ptr) {
  665. MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id();
  666. // save new graph_ptr
  667. graph_ptr_ = graph_ptr;
  668. if (!is_dataset_graph_) {
  669. // only try to enable debugger if it is not a dataset graph
  670. if (!debugger_enabled_) {
  671. EnableDebugger();
  672. }
  673. if (debugger_enabled_) {
  674. LoadParametersAndConst();
  675. // get graph proto and send to MindInsight
  676. auto graph_proto = graph_proto_list_.front();
  677. SendGraphAndSuspend(graph_proto);
  678. graph_proto_list_.clear();
  679. received_new_graph_ = false;
  680. }
  681. }
  682. }
  683. }
  684. void Debugger::CheckDatasetGraph() {
  685. // print parameter node names
  686. MS_EXCEPTION_IF_NULL(graph_ptr_);
  687. const auto &params = graph_ptr_->inputs();
  688. for (const auto &param : params) {
  689. MS_LOG(INFO) << "param: " << GetKernelNodeName(param);
  690. }
  691. // check if there is GetNext or InitDataSetQueue node
  692. const auto &nodes = graph_ptr_->execution_order();
  693. for (const auto &node : nodes) {
  694. auto node_name = AnfAlgo::GetCNodeName(node);
  695. MS_LOG(INFO) << "node: " << GetKernelNodeName(node);
  696. if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
  697. MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
  698. << node_name;
  699. is_dataset_graph_ = true;
  700. return;
  701. }
  702. }
  703. is_dataset_graph_ = false;
  704. }
  705. GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  706. // convert kernel graph to debugger modelproto
  707. ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  708. return model.graph();
  709. }
  710. /*
  711. * Feature group: Online debugger.
  712. * Target device group: Ascend, GPU.
  713. * Runtime category: Old runtime, MindRT.
  714. * Description: Send debugger backend heartbeat to online debugger every few seconds.
  715. */
  716. void Debugger::SendHeartbeat(int32_t period) {
  717. int num_heartbeat_fail = 0;
  718. const int max_num_heartbeat_fail = 5;
  719. const int retry_milliseconds = 500;
  720. Heartbeat heartbeat;
  721. heartbeat.set_message("Debugger is alive");
  722. heartbeat.set_period(heartbeat_period_second);
  723. SetEnableHeartbeat(CheckDebuggerEnabled());
  724. while (enable_heartbeat_) {
  725. MS_EXCEPTION_IF_NULL(grpc_client_);
  726. EventReply reply = grpc_client_->SendHeartbeat(heartbeat);
  727. if (reply.status() != EventReply::OK) {
  728. MS_LOG(ERROR) << "Error: SendHeartbeat failed";
  729. num_heartbeat_fail++;
  730. if (num_heartbeat_fail >= max_num_heartbeat_fail) {
  731. MS_LOG(ERROR) << "Maximum number of failure for SendHeartbeat reached : exiting training session.";
  732. SetEnableHeartbeat(false);
  733. break;
  734. } else {
  735. MS_LOG(ERROR) << "Number of consecutive SendHeartbeat fail:" << num_heartbeat_fail;
  736. std::this_thread::sleep_for(std::chrono::milliseconds(retry_milliseconds));
  737. }
  738. } else {
  739. int recheck_period_ms = 200;
  740. for (int i = 0; i < (period * 1000 / recheck_period_ms); i++) {
  741. if (enable_heartbeat_) {
  742. std::this_thread::sleep_for(std::chrono::milliseconds(recheck_period_ms));
  743. } else {
  744. break;
  745. }
  746. }
  747. }
  748. }
  749. }
  750. void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  751. if (!CheckSendMetadata()) {
  752. return;
  753. }
  754. // send graph to MindInsight server
  755. MS_EXCEPTION_IF_NULL(grpc_client_);
  756. EventReply reply = grpc_client_->SendGraph(graph_proto);
  757. if (reply.status() != EventReply::OK) {
  758. MS_LOG(ERROR) << "Error: SendGraph failed";
  759. }
  760. // enter command loop, wait and process commands
  761. CommandLoop();
  762. }
  763. bool Debugger::SendMetadata(bool version_check) {
  764. // prepare metadata
  765. MS_EXCEPTION_IF_NULL(graph_ptr_);
  766. std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
  767. Metadata metadata;
  768. metadata.set_device_name(device_name);
  769. metadata.set_cur_step(num_step_);
  770. metadata.set_backend(device_target_);
  771. metadata.set_cur_node(cur_name_);
  772. metadata.set_training_done(training_done_);
  773. metadata.set_ms_version(version_);
  774. MS_LOG(INFO) << "Is training done?" << training_done_;
  775. // set graph number to not_dataset_graph_sum_
  776. metadata.set_graph_num(not_dataset_graph_sum_);
  777. MS_EXCEPTION_IF_NULL(grpc_client_);
  778. EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
  779. bool ret = false;
  780. if (reply_metadata.status() == EventReply::OK) {
  781. if (version_check) {
  782. // get type of the command in meta data reply, it should be version matched
  783. DebuggerCommand cmd = GetCommand(reply_metadata);
  784. if (cmd != DebuggerCommand::kVersionMatchedCMD) {
  785. MS_LOG(ERROR) << "MindInsight version is too old, Mindspore version is " << version_;
  786. Exit();
  787. } else {
  788. if (GetMiVersionMatched(reply_metadata)) {
  789. MS_LOG(INFO) << "MindSpore version is " << version_ << " matches MindInsight version.";
  790. ret = true;
  791. } else {
  792. MS_LOG(ERROR) << "MindSpore version " << version_ << ", did not match MindInsight version.";
  793. CommandLoop();
  794. }
  795. }
  796. } else {
  797. // version check is done before so we can just return true here
  798. ret = true;
  799. }
  800. } else {
  801. MS_LOG(ERROR) << "Error: SendMetadata failed";
  802. }
  803. return ret;
  804. }
  805. void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list) {
  806. if (!CheckSendMetadata()) {
  807. return;
  808. }
  809. MS_EXCEPTION_IF_NULL(grpc_client_);
  810. // send multiple graphs to mindinght server
  811. // split graph into chunks if one graph is larger than chunk size
  812. std::list<Chunk> chunked_graph_proto_list;
  813. Chunk chunk;
  814. for (auto graph : graph_proto_list) {
  815. std::string str = graph.SerializeAsString();
  816. auto graph_size = graph.ByteSize();
  817. if (graph_size > g_chunk_size) {
  818. auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);
  819. for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
  820. chunk.set_buffer(sub_graph_str[i]);
  821. if (i < sub_graph_str.size() - 1) {
  822. chunk.set_finished(false);
  823. } else {
  824. chunk.set_finished(true);
  825. }
  826. chunked_graph_proto_list.push_back(chunk);
  827. }
  828. } else {
  829. chunk.set_buffer(str);
  830. chunk.set_finished(true);
  831. chunked_graph_proto_list.push_back(chunk);
  832. }
  833. }
  834. EventReply reply = grpc_client_->SendMultiGraphs(chunked_graph_proto_list);
  835. if (reply.status() != EventReply::OK) {
  836. MS_LOG(ERROR) << "Error: SendGraph failed";
  837. }
  838. // enter command loop, wait and process commands
  839. CommandLoop();
  840. }
  841. bool Debugger::CheckSendMetadata() {
  842. if (!send_metadata_done_) {
  843. if (!SendMetadata(true)) {
  844. return false;
  845. }
  846. send_metadata_done_ = true;
  847. }
  848. return true;
  849. }
  850. void Debugger::CommandLoop() {
  851. // prepare metadata
  852. MS_EXCEPTION_IF_NULL(graph_ptr_);
  853. std::string device_name = std::to_string(device_id_) + ":" + std::to_string(cur_root_graph_id_);
  854. Metadata metadata;
  855. metadata.set_device_name(device_name);
  856. metadata.set_cur_step(num_step_);
  857. metadata.set_backend(device_target_);
  858. metadata.set_cur_node(cur_name_);
  859. metadata.set_training_done(training_done_);
  860. // loop exit flag
  861. bool run = false;
  862. int num_wait_fail = 0;
  863. const int max_num_wait_fail = 5;
  864. while (!run) {
  865. // wait for command
  866. MS_EXCEPTION_IF_NULL(grpc_client_);
  867. EventReply reply = grpc_client_->WaitForCommand(metadata);
  868. if (reply.status() != EventReply::OK) {
  869. MS_LOG(ERROR) << "Error: WaitForCommand failed";
  870. num_wait_fail++;
  871. if (num_wait_fail > max_num_wait_fail) {
  872. MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session.";
  873. MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config "
  874. "of debugger host and port.";
  875. Exit();
  876. run = true;
  877. } else {
  878. MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after "
  879. << num_wait_fail << "s";
  880. std::this_thread::sleep_for(std::chrono::seconds(num_wait_fail));
  881. }
  882. continue;
  883. }
  884. // get type of the command in reply
  885. DebuggerCommand cmd = GetCommand(reply);
  886. if (cmd == DebuggerCommand::kUnknownCMD) {
  887. MS_LOG(DEBUG) << "Debug: debugger received unknown command";
  888. continue;
  889. }
  890. MS_LOG(INFO) << "received command: ";
  891. switch (cmd) {
  892. case DebuggerCommand::kUnknownCMD:
  893. MS_LOG(INFO) << "UnknownCMD";
  894. break;
  895. case DebuggerCommand::kExitCMD:
  896. MS_LOG(INFO) << "ExitCMD";
  897. Exit(true);
  898. // Used for debugger termination
  899. run = true;
  900. break;
  901. case DebuggerCommand::kRunCMD:
  902. ProcessRunCMD(reply);
  903. if (GetRunLevel(reply) != "recheck") {
  904. // exit loop
  905. run = true;
  906. }
  907. break;
  908. case DebuggerCommand::kSetCMD:
  909. ProcessKSetCMD(reply);
  910. break;
  911. case DebuggerCommand::kViewCMD:
  912. ProcessKViewCMD(reply);
  913. break;
  914. case DebuggerCommand::kVersionMatchedCMD:
  915. MS_LOG(ERROR) << "Received unexpected Version Matched CMD from MindInsight.";
  916. Exit();
  917. break;
  918. default:
  919. MS_LOG(ERROR) << "Received unknown CMD from MindInsight";
  920. Exit();
  921. break;
  922. }
  923. }
  924. }
  925. void Debugger::ProcessRunCMD(const EventReply &reply) {
  926. MS_LOG(INFO) << "RunCMD";
  927. if (GetRunLevel(reply) == "recheck") {
  928. MS_LOG(INFO) << "rechecking all watchpoints";
  929. SendWatchpoints(CheckWatchpoints("", nullptr, true));
  930. } else {
  931. // no longer the initial suspension.
  932. initial_suspend_ = false;
  933. // print run cmd content
  934. // get run_level and node_name
  935. run_level_ = GetRunLevel(reply);
  936. node_name_ = GetNodeName(reply);
  937. MS_LOG(INFO) << "run_level: " << run_level_;
  938. MS_LOG(INFO) << "node_name_: " << node_name_;
  939. }
  940. }
  941. void Debugger::ProcessKSetCMD(const EventReply &reply) {
  942. MS_LOG(INFO) << "SetCMD";
  943. MS_LOG(INFO) << "id: " << GetWatchpointID(reply);
  944. MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply);
  945. if (GetWatchpointDelete(reply)) {
  946. MS_LOG(INFO) << "Deleting watchpoint";
  947. RemoveWatchpoint(GetWatchpointID(reply));
  948. } else {
  949. MS_LOG(INFO) << "Setting watchpoint";
  950. MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition();
  951. ProtoVector<WatchNode> recieved_nodes = GetWatchnodes(reply);
  952. for (const auto &node : recieved_nodes) {
  953. MS_LOG(INFO) << "node name: " << node.node_name();
  954. MS_LOG(INFO) << "node type: " << node.node_type();
  955. }
  956. ProtoVector<WatchCondition_Parameter> parameters = GetParameters(reply);
  957. for (const auto &parameter : parameters) {
  958. MS_LOG(INFO) << "parameter name: " << parameter.name();
  959. MS_LOG(INFO) << "parameter is disabled: " << parameter.disabled();
  960. MS_LOG(INFO) << "parameter value: " << parameter.value();
  961. }
  962. SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply));
  963. }
  964. }
  965. void Debugger::ProcessKViewCMD(const EventReply &reply) {
  966. MS_LOG(INFO) << "ViewCMD";
  967. // print view cmd content
  968. ProtoVector<TensorProto> received_tensors = GetTensors(reply);
  969. for (auto received_tensor : received_tensors) {
  970. MS_LOG(INFO) << "tensor node name: " << received_tensor.node_name();
  971. MS_LOG(INFO) << "tensor slot: " << received_tensor.slot();
  972. MS_LOG(INFO) << "tensor finished: " << std::boolalpha << received_tensor.finished() << std::noboolalpha;
  973. MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
  974. MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
  975. }
  976. switch (reply.view_cmd().level()) {
  977. case debugger::ViewCMD_Level::ViewCMD_Level_base:
  978. MS_LOG(INFO) << "Tensor base request.";
  979. ViewBaseLevel(reply);
  980. break;
  981. case debugger::ViewCMD_Level::ViewCMD_Level_statistics:
  982. MS_LOG(INFO) << "Tensor statistics request.";
  983. ViewStatLevel(reply);
  984. break;
  985. case debugger::ViewCMD_Level::ViewCMD_Level_value:
  986. MS_LOG(INFO) << "Tensor value request.";
  987. ViewValueLevel(reply);
  988. break;
  989. default:
  990. MS_LOG(DEBUG) << "Debug: Unknown tensor info level";
  991. break;
  992. }
  993. }
  994. void Debugger::ViewValueLevel(const EventReply &reply) {
  995. MS_LOG(INFO) << "Sending tensors";
  996. std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
  997. // print view cmd reply
  998. for (auto tensor : tensors) {
  999. MS_LOG(INFO) << "tensor node name: " << tensor.node_name();
  1000. MS_LOG(INFO) << "tensor slot: " << tensor.slot();
  1001. MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor.finished() << std::noboolalpha;
  1002. MS_LOG(INFO) << "tensor iter: " << tensor.iter();
  1003. MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor.truncate() << std::noboolalpha;
  1004. MS_LOG(INFO) << "tensor dims: ";
  1005. for (auto dim : tensor.dims()) {
  1006. MS_LOG(INFO) << dim << ",";
  1007. }
  1008. MS_LOG(INFO) << "tensor dtype: " << tensor.data_type();
  1009. }
  1010. MS_EXCEPTION_IF_NULL(grpc_client_);
  1011. EventReply send_tensors_reply = grpc_client_->SendTensors(tensors);
  1012. if (send_tensors_reply.status() != debugger::EventReply::OK) {
  1013. MS_LOG(ERROR) << "Error: SendTensors failed";
  1014. }
  1015. }
  1016. void Debugger::ViewStatLevel(const EventReply &reply) {
  1017. std::list<TensorSummary> tensor_stats_list = LoadTensorsStat(GetTensors(reply));
  1018. EventReply send_tensors_stat_reply = grpc_client_->SendTensorStats(tensor_stats_list);
  1019. if (send_tensors_stat_reply.status() != debugger::EventReply::OK) {
  1020. MS_LOG(ERROR) << "Error: SendTensorsStats failed.";
  1021. }
  1022. }
  1023. void Debugger::ViewBaseLevel(const EventReply &reply) {
  1024. std::list<TensorBase> tensor_base_list = LoadTensorsBase(GetTensors(reply));
  1025. EventReply send_tensor_base_reply = grpc_client_->SendTensorBase(tensor_base_list);
  1026. if (send_tensor_base_reply.status() != debugger::EventReply::OK) {
  1027. MS_LOG(ERROR) << "Error: SendTensorsBase failed.";
  1028. }
  1029. }
  1030. void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
  1031. tensor_item->set_node_name(tensor.node_name());
  1032. tensor_item->set_slot(tensor.slot());
  1033. tensor_item->set_iter(tensor.iter());
  1034. tensor_item->set_truncate(tensor.truncate());
  1035. tensor_item->clear_tensor_content();
  1036. tensor_item->clear_data_type();
  1037. tensor_item->clear_dims();
  1038. }
  1039. void AddTensorStatInfo(const DebugServices::TensorStat &tensor_stat,
  1040. std::list<TensorSummary> *const tensor_summary_list) {
  1041. if (tensor_summary_list == nullptr) {
  1042. MS_LOG(DEBUG) << "tensor_summary_list is nullptr.";
  1043. return;
  1044. }
  1045. TensorSummary tensor_summary_item;
  1046. TensorBase *tensor_base = tensor_summary_item.mutable_tensor_base();
  1047. tensor_base->set_data_type(tensor_stat.dtype);
  1048. tensor_base->set_data_size((int64_t)tensor_stat.data_size);
  1049. for (auto elem : tensor_stat.shape) {
  1050. tensor_base->add_shape(elem);
  1051. }
  1052. Statistics *tensor_statistics = tensor_summary_item.mutable_statistics();
  1053. tensor_statistics->set_is_bool(tensor_stat.is_bool);
  1054. tensor_statistics->set_max_value(static_cast<float>(tensor_stat.max_value));
  1055. tensor_statistics->set_min_value(static_cast<float>(tensor_stat.min_value));
  1056. tensor_statistics->set_avg_value(static_cast<float>(tensor_stat.avg_value));
  1057. tensor_statistics->set_count(tensor_stat.count);
  1058. tensor_statistics->set_neg_zero_count(tensor_stat.neg_zero_count);
  1059. tensor_statistics->set_pos_zero_count(tensor_stat.pos_zero_count);
  1060. tensor_statistics->set_nan_count(tensor_stat.nan_count);
  1061. tensor_statistics->set_neg_inf_count(tensor_stat.neg_inf_count);
  1062. tensor_statistics->set_pos_inf_count(tensor_stat.pos_inf_count);
  1063. tensor_statistics->set_zero_count(tensor_stat.zero_count);
  1064. tensor_summary_list->push_back(tensor_summary_item);
  1065. }
  1066. void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
  1067. const ProtoVector<WatchCondition_Parameter> &parameters) {
  1068. std::vector<std::tuple<std::string, bool>> check_node_list;
  1069. std::vector<DebugServices::parameter_t> parameter_list;
  1070. std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list),
  1071. [](const WatchNode &node) -> std::tuple<std::string, bool> {
  1072. return make_tuple(node.node_name(), node.node_type() == "scope");
  1073. });
  1074. std::transform(
  1075. parameters.begin(), parameters.end(), std::back_inserter(parameter_list),
  1076. [](const WatchCondition_Parameter &parameter) -> DebugServices::parameter_t {
  1077. return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
  1078. });
  1079. debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list);
  1080. }
  1081. void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }
  1082. std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
  1083. std::vector<std::string> name;
  1084. std::vector<std::string> ret_name;
  1085. std::vector<const char *> data_ptr;
  1086. std::vector<ssize_t> data_size;
  1087. std::vector<unsigned int> dtype;
  1088. std::vector<std::vector<int64_t>> shape;
  1089. std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  1090. // ret_name will contain tensor names that are found in TensorLoader
  1091. // items in ret_name will be in the same order with tensors if found
  1092. debug_services_->ReadNodesTensors(name, &ret_name, &data_ptr, &data_size, &dtype, &shape);
  1093. std::list<TensorProto> tensor_list;
  1094. size_t result_index = 0;
  1095. for (auto tensor : tensors) {
  1096. ssize_t size_iter = 0;
  1097. if (result_index >= ret_name.size() || ret_name[result_index] != GetTensorFullName(tensor)) {
  1098. TensorProto tensor_item;
  1099. tensor_item.set_finished(true);
  1100. AddTensorProtoInfo(&tensor_item, tensor);
  1101. tensor_list.push_back(tensor_item);
  1102. continue;
  1103. }
  1104. ssize_t tensor_size = data_size[result_index];
  1105. while (size_iter < tensor_size) {
  1106. ssize_t chunk_size = g_chunk_size;
  1107. TensorProto tensor_item;
  1108. tensor_item.set_finished(false);
  1109. if (tensor_size - size_iter <= g_chunk_size) {
  1110. chunk_size = tensor_size - size_iter;
  1111. tensor_item.set_finished(true);
  1112. }
  1113. AddTensorProtoInfo(&tensor_item, tensor);
  1114. // return empty tensor if didn't find the requested tensor
  1115. tensor_item.set_tensor_content(data_ptr[result_index] + size_iter, chunk_size);
  1116. tensor_item.set_data_type((debugger::DataType)dtype[result_index]);
  1117. for (auto &elem : shape[result_index]) {
  1118. tensor_item.add_dims(elem);
  1119. }
  1120. // add tensor to result list and increment result_index to check next item in ret_name
  1121. tensor_list.push_back(tensor_item);
  1122. if (size_iter > INT_MAX - g_chunk_size) {
  1123. MS_EXCEPTION(ValueError) << size_iter << " + " << g_chunk_size << " would lead to integer overflow!";
  1124. }
  1125. size_iter += g_chunk_size;
  1126. }
  1127. result_index++;
  1128. }
  1129. return tensor_list;
  1130. }
  1131. std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const {
  1132. std::list<TensorBase> tensor_base_list;
  1133. std::vector<std::string> name;
  1134. std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  1135. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1136. debug_services_->SearchNodesTensors(name, &result_list);
  1137. for (auto result : result_list) {
  1138. auto tensor = std::get<1>(result);
  1139. if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
  1140. MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
  1141. // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor base.
  1142. TensorBase tensor_base_item;
  1143. tensor_base_item.set_data_size(0);
  1144. tensor_base_item.set_data_type(0);
  1145. tensor_base_item.add_shape(0);
  1146. tensor_base_list.push_back(tensor_base_item);
  1147. continue;
  1148. }
  1149. // tensor was found creating tensor base object.
  1150. TensorBase tensor_base_item;
  1151. tensor_base_item.set_data_size((int64_t)tensor->GetByteSize());
  1152. tensor_base_item.set_data_type((int32_t)tensor->GetType());
  1153. for (auto elem : tensor->GetShape()) {
  1154. tensor_base_item.add_shape(elem);
  1155. }
  1156. tensor_base_list.push_back(tensor_base_item);
  1157. }
  1158. return tensor_base_list;
  1159. }
  1160. std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const {
  1161. std::list<TensorSummary> tensor_summary_list;
  1162. std::vector<std::string> name;
  1163. std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  1164. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1165. debug_services_->SearchNodesTensors(name, &result_list);
  1166. for (auto result : result_list) {
  1167. auto tensor = std::get<1>(result);
  1168. if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
  1169. MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
  1170. // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor summary.
  1171. DebugServices::TensorStat tensor_stat;
  1172. AddTensorStatInfo(tensor_stat, &tensor_summary_list);
  1173. continue;
  1174. }
  1175. // tensor was found creating tensor summary object.
  1176. DebugServices::TensorStat tensor_stat = DebugServices::GetTensorStatistics(tensor);
  1177. AddTensorStatInfo(tensor_stat, &tensor_summary_list);
  1178. }
  1179. return tensor_summary_list;
  1180. }
  1181. std::shared_ptr<TensorData> Debugger::GetTensor(const std::string &tensor_name) const {
  1182. return debug_services_->GetTensor(tensor_name);
  1183. }
  1184. void Debugger::Exit(bool exit_success) {
  1185. // debugger will notify main thread to exit because main thread can only exit at step boundary.
  1186. MS_LOG(INFO) << "Exit Debugger";
  1187. SetEnableHeartbeat(false);
  1188. pipeline::GraphExecutorPy::DebugTerminate(true, exit_success);
  1189. }
  1190. std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel,
  1191. bool recheck) {
  1192. std::vector<std::string> name;
  1193. std::vector<std::string> slot;
  1194. std::vector<int> condition;
  1195. std::vector<unsigned int> watchpoint_id;
  1196. std::vector<std::string> overflow_ops;
  1197. std::vector<std::vector<DebugServices::parameter_t>> parameters;
  1198. std::vector<int32_t> error_codes;
  1199. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1200. if (watchnode.empty()) {
  1201. tensor_list = debug_services_->GetTensor();
  1202. } else {
  1203. tensor_list = debug_services_->GetNodeTensor(kernel);
  1204. }
  1205. DebugServices::AsyncFilePool file_list;
  1206. MS_LOG(INFO) << "checkwatchpoints call for step " << num_step_;
  1207. debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
  1208. file_list, &tensor_list, initial_suspend_, watchnode.empty(), recheck);
  1209. std::list<WatchpointHit> hits;
  1210. for (unsigned int i = 0; i < name.size(); i++) {
  1211. WatchpointHit hit;
  1212. std::vector<DebugServices::parameter_t> &parameter = parameters[i];
  1213. hit.set_id(watchpoint_id[i]);
  1214. hit.set_error_code(error_codes[i]);
  1215. // here TensorProto act as a tensor indicator, not sending tensor content
  1216. TensorProto *tensor_item = hit.mutable_tensor();
  1217. tensor_item->set_node_name(name[i]);
  1218. tensor_item->set_slot(slot[i]);
  1219. tensor_item->set_finished(true);
  1220. WatchCondition *condition_item = hit.mutable_watch_condition();
  1221. condition_item->set_condition(debugger::WatchCondition_Condition(condition[i]));
  1222. for (const auto &p : parameter) {
  1223. auto x = condition_item->mutable_params()->Add();
  1224. x->set_name(p.name);
  1225. x->set_disabled(p.disabled);
  1226. x->set_value(p.value);
  1227. x->set_hit(p.hit);
  1228. x->set_actual_value(p.actual_value);
  1229. }
  1230. hits.push_back(hit);
  1231. }
  1232. return hits;
  1233. }
  1234. void Debugger::SendWatchpoints(const std::list<WatchpointHit> &points) {
  1235. // send info about watchpoint
  1236. if (!points.empty()) {
  1237. MS_EXCEPTION_IF_NULL(grpc_client_);
  1238. EventReply reply = grpc_client_->SendWatchpointHits(points);
  1239. if (reply.status() != EventReply::OK) {
  1240. MS_LOG(ERROR) << "Error: SendWatchpointHits failed";
  1241. }
  1242. }
  1243. }
  1244. bool Debugger::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
  1245. const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
  1246. TypeId device_type, const std::string &addr_format, size_t slot) const {
  1247. return debug_services_.get()->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
  1248. device_type, addr_format, slot);
  1249. }
  1250. bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  1251. if (debug_services_ != nullptr) {
  1252. return debug_services_.get()->LoadNewTensor(tensor, keep_prev);
  1253. }
  1254. return false;
  1255. }
  1256. bool Debugger::debugger_enabled() const { return debugger_enabled_; }
  1257. DebuggerCommand GetCommand(const EventReply &reply) {
  1258. DebuggerCommand cmd = DebuggerCommand::kUnknownCMD;
  1259. switch (reply.cmd_case()) {
  1260. case debugger::EventReply::CmdCase::kExit:
  1261. cmd = DebuggerCommand::kExitCMD;
  1262. break;
  1263. case debugger::EventReply::CmdCase::kRunCmd:
  1264. cmd = DebuggerCommand::kRunCMD;
  1265. break;
  1266. case debugger::EventReply::CmdCase::kSetCmd:
  1267. cmd = DebuggerCommand::kSetCMD;
  1268. break;
  1269. case debugger::EventReply::CmdCase::kViewCmd:
  1270. cmd = DebuggerCommand::kViewCMD;
  1271. break;
  1272. case debugger::EventReply::CmdCase::kVersionMatched:
  1273. cmd = DebuggerCommand::kVersionMatchedCMD;
  1274. break;
  1275. default:
  1276. MS_LOG(DEBUG) << "Debug: UnknownCMD";
  1277. break;
  1278. }
  1279. return cmd;
  1280. }
  1281. ProtoVector<WatchCondition_Parameter> GetParameters(const EventReply &reply) {
  1282. if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
  1283. MS_LOG(ERROR) << "Error: Can not get Parameters from command. Returning default value: ProtoVector<Parameter>().";
  1284. return ProtoVector<WatchCondition_Parameter>();
  1285. }
  1286. return reply.set_cmd().watch_condition().params();
  1287. }
  1288. ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply) {
  1289. if (!reply.has_set_cmd()) {
  1290. MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>().";
  1291. return ProtoVector<WatchNode>();
  1292. }
  1293. return reply.set_cmd().watch_nodes();
  1294. }
  1295. std::string GetRunLevel(const EventReply &reply) {
  1296. if (!reply.has_run_cmd()) {
  1297. MS_LOG(ERROR) << "Error: Not RunCMD, can not get RunLevel. Returning default value: "
  1298. "";
  1299. return "";
  1300. }
  1301. return reply.run_cmd().run_level();
  1302. }
  1303. std::string GetNodeName(const EventReply &reply) {
  1304. if (!reply.has_run_cmd()) {
  1305. MS_LOG(ERROR) << "Error: Not RunCMD, can not get NodeName. Returning default value: "
  1306. "";
  1307. return "";
  1308. }
  1309. return reply.run_cmd().node_name();
  1310. }
  1311. WatchCondition GetWatchcondition(const EventReply &reply) {
  1312. if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
  1313. MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition().";
  1314. return WatchCondition();
  1315. }
  1316. return reply.set_cmd().watch_condition();
  1317. }
  1318. int32_t GetWatchpointID(const EventReply &reply) {
  1319. if (!reply.has_set_cmd()) {
  1320. MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint ID. Returning default value: 0.";
  1321. return 0;
  1322. }
  1323. return reply.set_cmd().id();
  1324. }
  1325. bool GetWatchpointDelete(const EventReply &reply) {
  1326. if (!reply.has_set_cmd()) {
  1327. MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint delete flag. Returning default value: false.";
  1328. return false;
  1329. }
  1330. return reply.set_cmd().delete_();
  1331. }
  1332. ProtoVector<TensorProto> GetTensors(const EventReply &reply) {
  1333. if (!reply.has_view_cmd()) {
  1334. MS_LOG(ERROR) << "Error: Not ViewCMD, can not get Tensors. Returning default value: ProtoVector<TensorProto>().";
  1335. return ProtoVector<TensorProto>();
  1336. }
  1337. return reply.view_cmd().tensors();
  1338. }
  1339. std::string GetTensorFullName(const TensorProto &tensor) {
  1340. string node_name = tensor.node_name();
  1341. if (tensor.truncate()) {
  1342. // scopes in node name are separated by '/'
  1343. // use the name without scope if truncate is true
  1344. std::size_t found = node_name.find_last_of("/");
  1345. node_name = node_name.substr(found + 1);
  1346. }
  1347. return node_name + ":" + tensor.slot() + (tensor.iter() == "" ? "" : ":" + tensor.iter());
  1348. }
  1349. bool GetMiVersionMatched(const EventReply &reply) { return reply.version_matched(); }
  1350. bool Debugger::partial_memory() const { return partial_memory_; }
  1351. void Debugger::SetEnableHeartbeat(bool enabled) { enable_heartbeat_ = enabled; }
  1352. void Debugger::SetCurNode(const std::string &cur_name) {
  1353. // access lock for public method
  1354. std::lock_guard<std::mutex> a_lock(access_lock_);
  1355. cur_name_ = cur_name;
  1356. }
  1357. std::string Debugger::run_level() const { return run_level_; }
  1358. void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }
  1359. bool Debugger::CheckPort(const std::string &port) const {
  1360. int num = 0;
  1361. const int min_port_num = 1;
  1362. const int max_port_num = 65535;
  1363. const int decimal = 10;
  1364. if (port[0] == '0' && port[1] != '\0') return false;
  1365. int i = 0;
  1366. while (port[i] != '\0') {
  1367. if (port[i] < '0' || port[i] > '9') return false;
  1368. num = num * decimal + (port[i] - '0');
  1369. if (num > max_port_num) return false;
  1370. i++;
  1371. }
  1372. if (num < min_port_num) return false;
  1373. return true;
  1374. }
  1375. bool Debugger::CheckIp(const std::string &host) const {
  1376. std::regex reg_ip(
  1377. "(25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])"
  1378. "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
  1379. "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
  1380. "[.](25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])");
  1381. std::smatch smat;
  1382. std::string host_str = host;
  1383. return std::regex_match(host_str, smat, reg_ip);
  1384. }
  1385. uint32_t Debugger::GetFirstRunGraphId() const { return rungraph_id_list_.front(); }
  1386. /*
  1387. * Feature group: Dump.
  1388. * Target device group: Ascend, GPU.
  1389. * Runtime category: Old runtime, MindRT.
  1390. * Description: Load a single parameter or value node.
  1391. */
  1392. void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id) {
  1393. MS_EXCEPTION_IF_NULL(anf_node);
  1394. if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
  1395. return;
  1396. }
  1397. // When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
  1398. if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  1399. if (!anf_node->isa<ValueNode>() &&
  1400. !(anf_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
  1401. return;
  1402. }
  1403. }
  1404. // for parameters and value nodes, set its execution order to be 0;
  1405. int exec_order = 0;
  1406. std::string node_name = GetKernelNodeName(anf_node);
  1407. GetFileKernelName(NOT_NULL(&node_name));
  1408. // check if output adde exists, if not, return;
  1409. if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
  1410. return;
  1411. }
  1412. auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
  1413. MS_EXCEPTION_IF_NULL(addr);
  1414. auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
  1415. if (!IsTypeDebuggerSupported(type)) {
  1416. return;
  1417. }
  1418. auto format = kOpFormat_DEFAULT;
  1419. string tensor_name = node_name + ':' + "0";
  1420. ShapeVector int_shapes = trans::GetRuntimePaddingShape(anf_node, output_index);
  1421. bool keep_prev;
  1422. if (anf_node->isa<Parameter>()) {
  1423. keep_prev = true;
  1424. debug_services_->MoveTensorCurrentToPrev(tensor_name);
  1425. } else {
  1426. keep_prev = false;
  1427. }
  1428. bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id);
  1429. if (!ret) {
  1430. MS_LOG(ERROR) << "LoadMemToHost:"
  1431. << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
  1432. }
  1433. }
  1434. void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) {
  1435. MS_EXCEPTION_IF_NULL(node);
  1436. auto root_graph_id = cur_root_graph_id_;
  1437. // This function is only for loading parameters mindRT.
  1438. std::string node_name = GetKernelNodeName(node);
  1439. GetFileKernelName(NOT_NULL(&node_name));
  1440. TypeId type;
  1441. TypeId device_type;
  1442. ShapeVector int_shapes;
  1443. auto device_addr = GetParameterInfo(node, NOT_NULL(&int_shapes), NOT_NULL(&type), NOT_NULL(&device_type));
  1444. if (device_addr == nullptr) {
  1445. MS_LOG(DEBUG) << "Skip node: " << node_name << ". Parameter data is not available for mindRT.";
  1446. return;
  1447. }
  1448. if (!IsTypeDebuggerSupported(type)) {
  1449. return;
  1450. }
  1451. auto format = kOpFormat_DEFAULT;
  1452. string tensor_name = node_name + ':' + "0";
  1453. if (debug_services_ != nullptr) {
  1454. debug_services_->MoveTensorCurrentToPrev(tensor_name);
  1455. }
  1456. // Keep_prev is True for parameters.
  1457. bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id);
  1458. if (!ret) {
  1459. MS_LOG(ERROR) << "LoadMemToHost:"
  1460. << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
  1461. }
  1462. }
  1463. /*
  1464. * Feature group: Dump, Online debugger.
  1465. * Target device group: Ascend, GPU.
  1466. * Runtime category: Old runtime, MindRT.
  1467. * Description: Load all the parameters and value nodes for the last loaded graph.
  1468. */
  1469. void Debugger::LoadParametersAndConst() {
  1470. if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
  1471. MS_EXCEPTION_IF_NULL(graph_ptr_);
  1472. // load parameters
  1473. MS_LOG(INFO) << "Start to load Parameters for graph " << graph_ptr_->graph_id() << ".";
  1474. auto root_graph_id = graph_ptr_->root_graph_id();
  1475. const auto &parameters = graph_ptr_->inputs();
  1476. for (auto &item : parameters) {
  1477. LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX, root_graph_id);
  1478. }
  1479. // load value nodes
  1480. // get all constant values from the graph
  1481. MS_LOG(INFO) << "Start to load value nodes for graph " << graph_ptr_->graph_id() << ".";
  1482. const auto value_nodes = graph_ptr_->graph_value_nodes();
  1483. for (auto &item : value_nodes) {
  1484. LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id);
  1485. }
  1486. }
  1487. /*
  1488. * Feature group: Dump, Online debugger.
  1489. * Target device group: Ascend, GPU.
  1490. * Runtime category: Old runtime, MindRT.
  1491. * Description: Load all the parameters and value nodes for the given graph.
  1492. */
  1493. void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
  1494. if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
  1495. MS_EXCEPTION_IF_NULL(graph);
  1496. // load parameters
  1497. MS_LOG(INFO) << "Start to load Parameters for graph " << graph->graph_id() << ".";
  1498. auto root_graph_id = graph->root_graph_id();
  1499. const auto &parameters = graph->inputs();
  1500. for (auto &item : parameters) {
  1501. LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX, root_graph_id);
  1502. }
  1503. // load value nodes
  1504. // get all constant values from the graph
  1505. MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  1506. const auto value_nodes = graph->graph_value_nodes();
  1507. for (auto &item : value_nodes) {
  1508. LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id);
  1509. }
  1510. }
  1511. /*
  1512. * Feature group: Dump.
  1513. * Target device group: GPU.
  1514. * Runtime category: MindRT.
  1515. * Description: This function is for loading parameters' data from device to host into tensor_list_map_ for GPU dump.
  1516. * Ascend does not use tensor_map_list_ for dump so it is not needed for ascend dump.
  1517. */
  1518. void Debugger::LoadParametersAllGraphs() {
  1519. if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
  1520. return;
  1521. }
  1522. for (auto &node : parameters_mindRT_) {
  1523. LoadSingleParameterMindRT(node);
  1524. }
  1525. }
  1526. /*
  1527. * Feature group: Dump.
  1528. * Target device group: GPU.
  1529. * Runtime category: MindRT.
  1530. * Description: This function is for loading constant data from device to host into tensor_list_map_ for GPU dump.
  1531. * Ascend does not use tensor_map_list_ for dump so it is not needed for ascend dump.
  1532. */
  1533. void Debugger::LoadConstsForGraph(const KernelGraphPtr &graph) {
  1534. if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
  1535. return;
  1536. }
  1537. // load value nodes
  1538. // get all constant values from the graph
  1539. MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  1540. auto root_graph_id = graph->root_graph_id();
  1541. const auto value_nodes = graph->graph_value_nodes();
  1542. for (auto &item : value_nodes) {
  1543. LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id);
  1544. }
  1545. }
  1546. /*
  1547. * Feature group: Online debugger.
  1548. * Target device group: Ascend.
  1549. * Runtime category: Old runtime, MindRT.
  1550. * Description: Load all the kernels for the last loaded graph.
  1551. */
  1552. void Debugger::LoadGraphOutputs() {
  1553. if (!(debugger_enabled() && device_target_ == kAscendDevice)) return;
  1554. MS_EXCEPTION_IF_NULL(graph_ptr_);
  1555. const auto &apply_kernels = graph_ptr_->execution_order();
  1556. auto root_graph_id = graph_ptr_->root_graph_id();
  1557. // for kernels, execution order starts from 1
  1558. int exec_order = 1;
  1559. for (const auto &node : apply_kernels) {
  1560. MS_EXCEPTION_IF_NULL(node);
  1561. std::string kernel_name = GetKernelNodeName(node);
  1562. auto output_size = AnfAlgo::GetOutputTensorNum(node);
  1563. if (partial_memory_) {
  1564. if (!debug_services_->IsWatchPoint(kernel_name, node)) {
  1565. continue;
  1566. }
  1567. }
  1568. for (size_t j = 0; j < output_size; ++j) {
  1569. if (!AnfAlgo::OutputAddrExist(node, j)) {
  1570. MS_LOG(INFO) << "Cannot find output addr for slot " << j << " for " << kernel_name;
  1571. continue;
  1572. }
  1573. auto addr = AnfAlgo::GetOutputAddr(node, j);
  1574. MS_EXCEPTION_IF_NULL(addr);
  1575. auto type = AnfAlgo::GetOutputInferDataType(node, j);
  1576. if (!IsTypeDebuggerSupported(type)) {
  1577. continue;
  1578. }
  1579. auto format = kOpFormat_DEFAULT;
  1580. string tensor_name = kernel_name + ':' + std::to_string(j);
  1581. ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j);
  1582. auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id);
  1583. if (!ret) {
  1584. MS_LOG(ERROR) << "LoadMemToHost:"
  1585. << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
  1586. }
  1587. }
  1588. exec_order = exec_order + 1;
  1589. }
  1590. }
  1591. /*
  1592. * Feature group: Online debugger.
  1593. * Target device group: GPU.
  1594. * Runtime category: Old runtime.
  1595. * Description: Update step number if we are processing the first graph (to support multigraph).
  1596. */
  1597. void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
  1598. MS_EXCEPTION_IF_NULL(graph);
  1599. MS_EXCEPTION_IF_NULL(debugger_);
  1600. if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()) &&
  1601. (graph->graph_id() == debugger_->GetFirstRunGraphId())) {
  1602. // access lock for public method
  1603. std::lock_guard<std::mutex> a_lock(access_lock_);
  1604. ++num_step_;
  1605. }
  1606. }
  1607. /*
  1608. * Feature group: Online debugger.
  1609. * Target device group: GPU.
  1610. * Runtime category: MindRT.
  1611. * Description: Update step number when DebugActor::DebugOnStepEnd is called at the end of each step.
  1612. */
  1613. void Debugger::UpdateStepNumGPU() {
  1614. auto &dump_json_parser = DumpJsonParser::GetInstance();
  1615. if (device_target_ == kGPUDevice && (debugger_enabled_ || dump_json_parser.DumpEnabledForIter())) {
  1616. // access lock for public method
  1617. std::lock_guard<std::mutex> a_lock(access_lock_);
  1618. ++num_step_;
  1619. MS_LOG(DEBUG) << "Update step for GPU, current step: " << num_step_;
  1620. }
  1621. }
  1622. void Debugger::ClearCurrentData() {
  1623. if ((device_target_ == kGPUDevice) && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration())) {
  1624. if (debug_services_) {
  1625. debug_services_->EmptyCurrentTensor();
  1626. } else {
  1627. MS_LOG(ERROR) << "debug_services_ is nullptr";
  1628. }
  1629. }
  1630. }
  1631. bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
  1632. if (debug_services_ != nullptr) {
  1633. return debug_services_->TensorExistsInCurrent(tensor_name);
  1634. }
  1635. return false;
  1636. }
  1637. #ifdef ENABLE_D
  1638. /*
  1639. * Feature group: Dump.
  1640. * Target device group: Ascend.
  1641. * Runtime category: Old runtime, MindRT.
  1642. * Description: Load DumpDataBuilder object from dump_data_construct_map_ for tracking data chunks of node_name. It's
  1643. * for Ascend a + m dump. If not found, create a new one for it and add to dump_data_construct_map_.
  1644. */
  1645. std::shared_ptr<DumpDataBuilder> Debugger::LoadDumpDataBuilder(const std::string &node_name) {
  1646. auto iter = dump_data_construct_map_.find(node_name);
  1647. if (iter == dump_data_construct_map_.end()) {
  1648. dump_data_construct_map_[node_name] = std::make_shared<DumpDataBuilder>();
  1649. }
  1650. return dump_data_construct_map_[node_name];
  1651. }
  1652. void Debugger::ClearDumpDataBuilder(const std::string &node_name) { dump_data_construct_map_.erase(node_name); }
  1653. #endif
  1654. } // namespace mindspore