You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

debugger.cc 67 kB

4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786
  1. /**
  2. * Copyright 2020-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <dirent.h>
  17. #include <cstdio>
  18. #include <fstream>
  19. #include <tuple>
  20. #include <vector>
  21. #include <algorithm>
  22. #include <iostream>
  23. #include <cstring>
  24. #include <utility>
  25. #include <map>
  26. #include <regex>
  27. #include "debug/debugger/debugger.h"
  28. #include "debug/data_dump/dump_json_parser.h"
  29. #include "pipeline/jit/pipeline.h"
  30. #include "backend/common/session/anf_runtime_algorithm.h"
  31. #include "runtime/device/kernel_runtime_manager.h"
  32. #include "runtime/device/kernel_runtime.h"
  33. #include "debug/data_dump/e2e_dump.h"
  34. #include "utils/config_manager.h"
  35. #include "debug/env_config_parser.h"
  36. #include "utils/comm_manager.h"
  37. #include "runtime/hardware/device_context_manager.h"
  38. #include "debug/anf_ir_dump.h"
  39. #include "debug/anf_ir_utils.h"
  40. #include "runtime/graph_scheduler/device_tensor_store.h"
  41. #ifdef ENABLE_DEBUGGER
  42. #include "debug/debugger/proto_exporter.h"
  43. #else
  44. #include "debug/debugger/proto_exporter_stub.h"
  45. #endif
  46. using debugger::Chunk;
  47. using debugger::EventReply;
  48. using debugger::GraphProto;
  49. using debugger::ModelProto;
  50. using debugger::Statistics;
  51. using debugger::TensorProto;
  52. using debugger::WatchCondition;
  53. using debugger::WatchCondition_Condition_inf;
  54. using debugger::WatchCondition_Condition_nan;
  55. using debugger::WatchCondition_Parameter;
  56. using debugger::WatchNode;
  57. using debugger::WatchpointHit;
  58. using mindspore::runtime::DeviceTensorStore;
  59. namespace mindspore {
  60. static constexpr auto g_chunk_size = 1024 * 1024 * 3;
  61. static constexpr int32_t heartbeat_period_second = 30;
  62. DebuggerPtr Debugger::debugger_ = nullptr;
  63. std::mutex Debugger::instance_lock_;
  64. Debugger::Debugger()
  65. : grpc_client_(nullptr),
  66. debug_services_(nullptr),
  67. heartbeat_thread_(nullptr),
  68. device_id_(0),
  69. device_target_(""),
  70. num_step_(0),
  71. debugger_enabled_(false),
  72. suspended_at_last_kernel_(false),
  73. run_level_(""),
  74. node_name_(""),
  75. cur_name_(""),
  76. training_done_(false),
  77. send_metadata_done_(false),
  78. received_new_graph_(false),
  79. is_dataset_graph_(false),
  80. partial_memory_(false),
  81. initial_suspend_(true),
  82. enable_heartbeat_(false),
  83. not_dataset_graph_sum_(0),
  84. ascend_kernel_by_kernel_(false),
  85. version_("") {
  86. CheckDebuggerEnabledParam();
  87. auto ms_context = MsContext::GetInstance();
  88. MS_EXCEPTION_IF_NULL(ms_context);
  89. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  90. MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  91. if (!CheckDebuggerEnabled()) {
  92. return;
  93. } else if (device_target == kCPUDevice) {
  94. MS_LOG(WARNING) << "Not enabling debugger. Debugger does not support CPU.";
  95. } else {
  96. // configure partial memory reuse
  97. partial_memory_ = CheckDebuggerPartialMemoryEnabled();
  98. // switch memory reuse on or off
  99. EnvConfigParser::GetInstance().SetSysMemreuse(partial_memory_);
  100. // print some message about memory reuse to user
  101. if (partial_memory_) {
  102. MS_LOG(WARNING)
  103. << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
  104. "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
  105. } else {
  106. MS_LOG(WARNING)
  107. << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
  108. "usage for large models.";
  109. }
  110. }
  111. }
  112. void Debugger::Init(const uint32_t device_id, const std::string device_target) {
  113. // access lock for public method
  114. std::lock_guard<std::mutex> a_lock(access_lock_);
  115. // save device_id
  116. MS_LOG(INFO) << "Debugger got device_id: " << device_id;
  117. device_id_ = device_id;
  118. MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  119. device_target_ = device_target;
  120. version_ = MSVERSION;
  121. }
  122. bool IsTypeDebuggerSupported(TypeId type) {
  123. if (type < TypeId::kNumberTypeEnd && type > TypeId::kNumberTypeBegin && type != kNumberTypeComplex64) {
  124. return true;
  125. }
  126. MS_LOG(INFO) << "Debugger does not support type: " << TypeIdLabel(type);
  127. return false;
  128. }
  129. void Debugger::EnableDebugger() {
  130. // reset some of the class members
  131. num_step_ = 0;
  132. debugger_enabled_ = false;
  133. enable_heartbeat_ = false;
  134. partial_memory_ = false;
  135. grpc_client_ = nullptr;
  136. debug_services_ = nullptr;
  137. heartbeat_thread_ = nullptr;
  138. // see if dump using debugger backend is enabled
  139. bool dump_enabled = CheckDebuggerDumpEnabled();
  140. MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;
  141. // check if debugger enabled
  142. debugger_enabled_ = CheckDebuggerEnabled();
  143. MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;
  144. if (!debugger_enabled_ && !dump_enabled) {
  145. MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
  146. return;
  147. }
  148. if (debugger_enabled_) {
  149. // configure grpc host
  150. std::string env_host_str = common::GetEnv("MS_DEBUGGER_HOST");
  151. std::string host;
  152. if (!env_host_str.empty()) {
  153. if (CheckIp(env_host_str)) {
  154. MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
  155. host = env_host_str;
  156. } else {
  157. debugger_enabled_ = false;
  158. MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_HOST isn't a valid IP address. "
  159. "Please set environment variable MS_DEBUGGER_HOST=x.x.x.x to a valid IP";
  160. }
  161. } else {
  162. MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
  163. host = "localhost";
  164. }
  165. // configure grpc port
  166. std::string env_port_str = common::GetEnv("MS_DEBUGGER_PORT");
  167. std::string port;
  168. if (!env_port_str.empty()) {
  169. if (CheckPort(env_port_str)) {
  170. MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
  171. port = env_port_str;
  172. } else {
  173. debugger_enabled_ = false;
  174. MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_PORT is not valid. Custom port ranging from 1 to "
  175. "65535";
  176. }
  177. } else {
  178. port = "50051";
  179. if (!CheckPort(port)) {
  180. MS_EXCEPTION(ValueError) << "Default MS_DEBUGGER_PORT is not valid. Custom port ranging from 1 to 65535";
  181. }
  182. MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
  183. }
  184. // initialize grpc client
  185. grpc_client_ = std::make_unique<GrpcClient>(host, port);
  186. // initialize sending heartbeat
  187. heartbeat_thread_ = std::make_unique<std::thread>([this]() { SendHeartbeat(heartbeat_period_second); });
  188. }
  189. debug_services_ = std::make_unique<DebugServices>();
  190. }
  191. void Debugger::CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
  192. bool sink_mode = ConfigManager::GetInstance().dataset_mode() || graph_ptr->IsDatasetGraph();
  193. if (CheckDebuggerDumpEnabled() && sink_mode && device_target_ == kGPUDevice) {
  194. MS_EXCEPTION(NotSupportError)
  195. << "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  196. }
  197. if (CheckDebuggerEnabled() && sink_mode) {
  198. MS_EXCEPTION(NotSupportError)
  199. << "Debugger is not supported with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  200. }
  201. }
  202. bool Debugger::CheckDebuggerDumpEnabled() const {
  203. // see if dump is enabled
  204. auto &dump_json_parser = DumpJsonParser::GetInstance();
  205. if (device_target_ == kGPUDevice) {
  206. return dump_json_parser.e2e_dump_enabled();
  207. } else if (device_target_ == kAscendDevice) {
  208. return dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled();
  209. }
  210. return false;
  211. }
  212. bool Debugger::CheckDebuggerEnabled() const {
  213. // get env variables to configure debugger
  214. std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  215. if (!env_enable_str.empty()) {
  216. (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
  217. if ((env_enable_str == "1" || env_enable_str == "true") && device_target_ != kCPUDevice) {
  218. return true;
  219. }
  220. }
  221. return false;
  222. }
  223. void Debugger::CheckDebuggerEnabledParam() const {
  224. // check the value of env variable ENABLE_MS_DEBUGGER
  225. std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  226. if (!env_enable_str.empty()) {
  227. (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
  228. if (env_enable_str != "0" && env_enable_str != "1" && env_enable_str != "false" && env_enable_str != "true") {
  229. MS_LOG(WARNING) << "Env variable ENABLE_MS_DEBUGGER should be True/False/1/0 (case insensitive), but get: "
  230. << env_enable_str;
  231. }
  232. }
  233. }
  234. bool Debugger::CheckDebuggerPartialMemoryEnabled() const {
  235. std::string env_partial_mem_str = common::GetEnv("MS_DEBUGGER_PARTIAL_MEM");
  236. if (!env_partial_mem_str.empty()) {
  237. MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
  238. if (env_partial_mem_str == "1") {
  239. return true;
  240. }
  241. }
  242. return false;
  243. }
  244. /*
  245. * Feature group: Dump, Online debugger.
  246. * Target device group: Ascend, GPU.
  247. * Runtime category: Old runtime, MindRT
  248. * Description: Returns true if online debugger or dump is enabled.
  249. */
  250. bool Debugger::DebuggerBackendEnabled() const { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
  251. void Debugger::Reset() {
  252. // access lock for public method
  253. std::lock_guard<std::mutex> a_lock(access_lock_);
  254. // reset components
  255. if (heartbeat_thread_ && heartbeat_thread_->joinable()) {
  256. SetEnableHeartbeat(false);
  257. heartbeat_thread_->join();
  258. MS_LOG(INFO) << "Join Heartbeat thread.";
  259. }
  260. heartbeat_thread_ = nullptr;
  261. device_id_ = 0;
  262. device_target_ = "";
  263. num_step_ = 0;
  264. debugger_enabled_ = false;
  265. is_dataset_graph_ = false;
  266. partial_memory_ = false;
  267. graph_ptr_ = nullptr;
  268. grpc_client_ = nullptr;
  269. debug_services_ = nullptr;
  270. graph_proto_list_.clear();
  271. graph_ptr_list_.clear();
  272. graph_ptr_step_vec_.clear();
  273. parameters_mindRT_.clear();
  274. visited_root_graph_ids_.clear();
  275. MS_LOG(INFO) << "Release Debugger resource.";
  276. }
  277. /*
  278. * Feature group: Dump, Online debugger.
  279. * Target device group: Ascend, GPU.
  280. * Runtime category: MindRT.
  281. * Description: Sets root_graph_id for all the graphs in the compiled graph list. Sets cur_root_graph_id_ and
  282. * prev_root_graph_id_ and calls PreExecute function for all the graphs.
  283. */
  284. void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs,
  285. const std::vector<AnfNodePtr> &origin_parameters_order) {
  286. // MindRTBackend for GPU and Ascend
  287. if (device_target_ == kCPUDevice) {
  288. return;
  289. }
  290. // Store graphs that are run in one step.
  291. graph_ptr_step_vec_ = graphs;
  292. parameters_mindRT_ = origin_parameters_order;
  293. prev_root_graph_id_ = cur_root_graph_id_;
  294. // set first run graph as the root graph
  295. cur_root_graph_id_ = graph_ptr_step_vec_[0]->graph_id();
  296. MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
  297. << " for step: " << num_step_ << ".";
  298. MS_LOG(DEBUG) << "Set root graph for all the subgraphs:";
  299. for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
  300. const auto &graph = graphs[graph_index];
  301. // set root graph id for GPU mindrt runtime.
  302. MS_LOG(DEBUG) << "Set root graph for graph: " << graph->graph_id() << " to: " << cur_root_graph_id_ << ".";
  303. graph->set_root_graph_id(cur_root_graph_id_);
  304. if (debugger_) {
  305. debugger_->PreExecute(graph);
  306. }
  307. }
  308. }
  309. /*
  310. * Feature group: Dump.
  311. * Target device group: Ascend.
  312. * Runtime category: Old runtime, MindRT.
  313. * Description: When async dump is enabled and dataset_sink_mode is true, graph_iter_num_map_ stores the number of
  314. * iterations per epoch for each running graph.
  315. */
  316. void Debugger::UpdateGraphIterMap(uint32_t graph_id, int32_t iter_num) {
  317. if (graph_iter_num_map_.find(graph_id) == graph_iter_num_map_.end()) {
  318. graph_iter_num_map_[graph_id] = iter_num;
  319. }
  320. }
  321. /*
  322. * Feature group: Dump, Online debugger.
  323. * Target device group: Ascend.
  324. * Runtime category: Old runtime.
  325. * Description: For Ascend old runtime, this function sets the current and previous root graph id.
  326. */
  327. void Debugger::SetCurrentAndPrevRootGraph(uint32_t root_graph_id) {
  328. // for GPU and ascend MindRT root graphs are set in PreExecuteGraphDebugger.
  329. if (device_target_ != kAscendDevice || MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  330. return;
  331. }
  332. prev_root_graph_id_ = cur_root_graph_id_;
  333. cur_root_graph_id_ = root_graph_id;
  334. MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
  335. << " for step: " << num_step_ << ".";
  336. }
  337. /*
  338. * Feature group: Dump, Online debugger.
  339. * Target device group: GPU.
  340. * Runtime category: Old runtime.
  341. * Description: In the case of GPU old runtime and when we have multiple subgraphs, we use the first run graph id to
  342. * update the step number.
  343. */
  344. void Debugger::StoreRunGraphIdList(uint32_t graph_id) {
  345. // collect rungrap_ids to update step number in multigraph case for GPU old runtime
  346. if (!rungraph_id_list_.size()) {
  347. rungraph_id_list_.push_back(graph_id);
  348. } else {
  349. if (std::find(rungraph_id_list_.begin(), rungraph_id_list_.end(), graph_id) == rungraph_id_list_.end()) {
  350. rungraph_id_list_.push_back(graph_id);
  351. }
  352. }
  353. }
  354. /*
  355. * Feature group: Dump, Online debugger.
  356. * Target device group: Ascend, GPU.
  357. * Runtime category: Old runtime, MindRT.
  358. * Description: Sets previous and current root_graph_id for Ascend old runtime, sends graphs to online debugger when
  359. * debugger_enabled_ is true.
  360. */
  361. void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
  362. MS_EXCEPTION_IF_NULL(graph_ptr);
  363. // access lock for public method
  364. std::lock_guard<std::mutex> a_lock(access_lock_);
  365. if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  366. // Checking dataset_sink_mode for mindRT is done in debug_actor
  367. CheckDatasetSinkMode(graph_ptr);
  368. }
  369. auto graph_id = graph_ptr->graph_id();
  370. MS_LOG(DEBUG) << "PreExecute for graph: " << graph_id << " in step: " << num_step_ << ".";
  371. StoreRunGraphIdList(graph_id);
  372. SetCurrentAndPrevRootGraph(graph_ptr->root_graph_id());
  373. // multiple graphs
  374. if (graph_proto_list_.size() > 1) {
  375. // there are more than one graphs are not dataset_graph
  376. if (not_dataset_graph_sum_ > 0) {
  377. SendMultiGraphsAndClear(graph_ptr);
  378. }
  379. } else if (graph_proto_list_.size() == 1) {
  380. // single graph, and not the initial step
  381. if (device_target_ == kGPUDevice && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT) &&
  382. num_step_ != 0) {
  383. if (debugger_enabled_ && !(run_level_ == "node" && suspended_at_last_kernel_)) {
  384. CommandLoop();
  385. }
  386. debug_services_->ResetLoadedTensors();
  387. }
  388. // In single graph case, reset graph_ptr_ to be nullptr when debugger receives a new graph
  389. if (received_new_graph_) {
  390. graph_ptr_ = nullptr;
  391. CheckGraphPtr(graph_ptr);
  392. }
  393. } else if (debugger_enabled_ && graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice &&
  394. !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  395. // Multiple graph, and not the initial step,
  396. // stop only when receive the first sub run graph for each step for old runtime
  397. // if we have stopped for the last kernel before, no need to stop again
  398. if (pipeline::GraphExecutorPy::GetDebugTerminate()) {
  399. return;
  400. }
  401. if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
  402. CommandLoop();
  403. }
  404. debug_services_->ResetLoadedTensors();
  405. }
  406. // resets for the new graph
  407. suspended_at_last_kernel_ = false;
  408. }
  409. /*
  410. * Feature group: Online debugger.
  411. * Target device group: Ascend, GPU.
  412. * Runtime category: Old runtime, MindRT.
  413. * Description: Sends all the subgraphs to online debugger when debugger_enabled_ is true.
  414. */
  415. void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
  416. // only try to enable debugger if they are not all dataset graphs
  417. if (!debugger_enabled_) {
  418. EnableDebugger();
  419. }
  420. if (debugger_enabled_) {
  421. // only send compiled graphs once at the initial step.
  422. auto dbg_graph_ptr = graph_ptr_;
  423. // use current graph ptr to load parameters
  424. graph_ptr_ = graph_ptr;
  425. LoadParametersAndConst();
  426. // revert graph ptr to original value
  427. graph_ptr_ = dbg_graph_ptr;
  428. SendMultiGraphsAndSuspend(graph_proto_list_);
  429. graph_proto_list_.clear();
  430. received_new_graph_ = false;
  431. }
  432. }
  433. /*
  434. * Feature group: Dump.
  435. * Target device group: Ascend, GPU.
  436. * Runtime category: MindRT.
  437. * Description: Returns the rank_id for GPU and Ascend kernel-bykernel mindRT.
  438. */
  439. uint32_t Debugger::GetRankID() {
  440. auto ms_context = MsContext::GetInstance();
  441. MS_EXCEPTION_IF_NULL(ms_context);
  442. std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  443. uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  444. const auto &device_context =
  445. device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id});
  446. uint32_t rank_id = device_context->GetRankID();
  447. return rank_id;
  448. }
  449. /*
  450. * Feature group: Dump.
  451. * Target device group: Ascend, GPU.
  452. * Runtime category: MindRT.
  453. * Description: When dump is enabled, this function: 1) Dumps parameters for the current root_graph_id to the
  454. * root_graph's directory. 2) Dumps constant data once for each graph. 3) Dumps graph run history for each graph.
  455. */
  456. void Debugger::DumpParamsAndConstAndHistory() {
  457. if (!CheckDebuggerDumpEnabled()) {
  458. return;
  459. }
  460. LoadParametersAllGraphs();
  461. E2eDump::DumpParametersData(GetRankID(), debugger_.get());
  462. // Whether constant data was already dumped for the current root graph.
  463. bool cur_root_graph_checked = std::find(visited_root_graph_ids_.begin(), visited_root_graph_ids_.end(),
  464. cur_root_graph_id_) != visited_root_graph_ids_.end();
  465. for (auto graph : graph_ptr_step_vec_) {
  466. if (!cur_root_graph_checked) {
  467. LoadConstsForGraph(graph);
  468. // Dump constant data for GPU.
  469. E2eDump::DumpConstantData(graph.get(), GetRankID(), debugger_.get());
  470. // Dump constant data for Ascend.
  471. DumpConstantDataAscend(graph);
  472. }
  473. // Dump graph run hisotry for each graph.
  474. E2eDump::DumpRunIter(graph, GetRankID());
  475. }
  476. if (!cur_root_graph_checked) {
  477. visited_root_graph_ids_.push_back(cur_root_graph_id_);
  478. }
  479. }
  480. void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) {
  481. if (device_target_ != kAscendDevice) {
  482. return;
  483. }
  484. auto &json_parser = DumpJsonParser::GetInstance();
  485. if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
  486. // Dump constant data for ascend mindRT, for old runtime constant data is dumped in session_basic.
  487. uint32_t rank_id = GetRankID();
  488. std::string cst_file_dir = GenerateDumpPath(graph->root_graph_id(), rank_id, true);
  489. DumpConstantInfo(graph, cst_file_dir);
  490. }
  491. }
  492. /*
  493. * Feature group: Dump.
  494. * Target device group: Ascend, GPU.
  495. * Runtime category: MindRT.
  496. * Description: Dumps a single node for given graph_id.
  497. */
  498. void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id, const KernelLaunchInfo *launch_info) {
  499. if (debugger_ && debugger_->DebuggerBackendEnabled()) {
  500. uint32_t rank_id = GetRankID();
  501. (void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get(), launch_info);
  502. }
  503. }
  504. /*
  505. * Feature group: Dump.
  506. * Target device group: GPU.
  507. * Runtime category: MindRT.
  508. * Description: This function is used for new GPU runtime using MindRTBackend, on Ascend platform, graphs are saved in
  509. * session_basic.
  510. */
  511. void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  512. if (device_target_ == kAscendDevice) {
  513. return;
  514. }
  515. auto &json_parser = DumpJsonParser::GetInstance();
  516. if (json_parser.e2e_dump_enabled()) {
  517. uint32_t rank_id = GetRankID();
  518. kernel_graph->set_root_graph_id(kernel_graph->graph_id());
  519. std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
  520. std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
  521. std::string target_dir = root_dir + "/graphs";
  522. std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
  523. DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
  524. DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
  525. DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
  526. kernel_graph->execution_order());
  527. }
  528. }
  529. /*
  530. * Feature group: Dump, Online debugger.
  531. * Target device group: Ascend, GPU and CPU.
  532. * Runtime category: MindRT.
  533. * Description: Load and dump parameters and constant data, call postExecute and update dump iter.
  534. */
  535. void Debugger::PostExecuteGraphDebugger() {
  536. // On CPU, update dump iteration, Parameters and consts are not dumped here
  537. if (device_target_ == kCPUDevice) {
  538. DumpJsonParser::GetInstance().UpdateDumpIter();
  539. return;
  540. }
  541. DumpParamsAndConstAndHistory();
  542. // debug used for dump
  543. if (CheckDebuggerDumpEnabled() && !debugger_enabled()) {
  544. ClearCurrentData();
  545. }
  546. if (debugger_) {
  547. debugger_->PostExecute();
  548. }
  549. E2eDump::UpdateIterMindRTDump();
  550. }
  551. /*
  552. * Feature group: Online debugger.
  553. * Target device group: Ascend, GPU.
  554. * Runtime category: Old runtime, MindRT.
  555. * Description: Send hit watchpoints, update the step number and reset loaded tensors.
  556. */
  557. void Debugger::PostExecute() {
  558. // access lock for public method
  559. std::lock_guard<std::mutex> a_lock(access_lock_);
  560. if (pipeline::GraphExecutorPy::GetDebugTerminate()) {
  561. return;
  562. }
  563. if (debugger_ && debugger_->DebuggerBackendEnabled()) {
  564. // analyze tensor data and send the watchpoints been hit
  565. if (debugger_enabled_ && !is_dataset_graph_) {
  566. SendWatchpoints(CheckWatchpoints());
  567. // no need to suspend at each graph for GPU old runtime, suspension happens in preExecute
  568. if (device_target_ == kAscendDevice) {
  569. CommandLoop();
  570. } else if (device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  571. if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
  572. CommandLoop();
  573. }
  574. }
  575. if (device_target_ != kGPUDevice) {
  576. num_step_++;
  577. }
  578. }
  579. // Only keep parameters in th current map
  580. // GPU ResetLoadedTensors for old runtime happens in preExecute
  581. if ((device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
  582. device_target_ == kAscendDevice) {
  583. if (debug_services_ != nullptr) {
  584. debug_services_->ResetLoadedTensors();
  585. } else {
  586. MS_LOG(DEBUG) << "debug_services_ is nullptr";
  587. }
  588. }
  589. }
  590. }
  591. bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  592. if (debugger_enabled_ && !is_dataset_graph_) {
  593. auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);
  594. // if node has a watchpoint on it, is next_to node, or continue_to node then read the kernel tensor data
  595. if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) {
  596. return true;
  597. }
  598. }
  599. return false;
  600. }
  601. /*
  602. * Feature group: Online debugger.
  603. * Target device group: GPU.
  604. * Runtime category: Old runtime, MindRT.
  605. * Description: Check and send watchpoint hit for a single node, suspend if a watchpoint is hit or we are continuing
  606. * in node level.
  607. */
  608. void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  609. // access lock for public method
  610. std::lock_guard<std::mutex> a_lock(access_lock_);
  611. if (pipeline::GraphExecutorPy::GetDebugTerminate()) {
  612. return;
  613. }
  614. if (debugger_enabled_ && !is_dataset_graph_) {
  615. auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);
  616. // if kernel is watchpoint,and get hit. suspend.
  617. bool hit_empty_flag = true;
  618. if (is_watchpoint) {
  619. auto hits = CheckWatchpoints(cur_name_, kernel);
  620. if (!hits.empty()) {
  621. SendWatchpoints(hits);
  622. CommandLoop();
  623. hit_empty_flag = false;
  624. }
  625. }
  626. if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
  627. // if kernel is not watchpoint and is next_to or continue_to node, suspend
  628. // sets a bool to be checked in preExecute to avoid double stopping at last kernel in the last graph
  629. if (last_kernel) {
  630. suspended_at_last_kernel_ = true;
  631. }
  632. CommandLoop();
  633. }
  634. return;
  635. }
  636. }
  637. /*
  638. * Feature group: Dump, Online debugger.
  639. * Target device group: Ascend, GPU.
  640. * Runtime category: Old runtime, MindRT.
  641. * Description: Get graph proto and add it to graph proto list and add loaded graph pointers to a list.
  642. */
  643. void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
  644. MS_EXCEPTION_IF_NULL(graph_ptr);
  645. if (graph_ptr_ != graph_ptr) {
  646. MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id();
  647. received_new_graph_ = true;
  648. // save new graph_ptr
  649. graph_ptr_ = graph_ptr;
  650. CheckDatasetGraph();
  651. if (!is_dataset_graph_) {
  652. // get proto for new graph_ptr
  653. auto graph_proto = GetGraphProto(graph_ptr);
  654. // add new graph proto to graph_proto_list_
  655. graph_proto_list_.push_back(graph_proto);
  656. graph_ptr_list_.push_back(graph_ptr);
  657. not_dataset_graph_sum_++;
  658. }
  659. // reset is_dataset_graph to be false
  660. is_dataset_graph_ = false;
  661. }
  662. }
  663. // In single graph cases, check single graph ptr
  664. void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
  665. MS_EXCEPTION_IF_NULL(graph_ptr);
  666. if (graph_ptr_ != graph_ptr) {
  667. MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id();
  668. // save new graph_ptr
  669. graph_ptr_ = graph_ptr;
  670. if (!is_dataset_graph_) {
  671. // only try to enable debugger if it is not a dataset graph
  672. if (!debugger_enabled_) {
  673. EnableDebugger();
  674. }
  675. if (debugger_enabled_) {
  676. LoadParametersAndConst();
  677. // get graph proto and send to MindInsight
  678. auto graph_proto = graph_proto_list_.front();
  679. SendGraphAndSuspend(graph_proto);
  680. graph_proto_list_.clear();
  681. received_new_graph_ = false;
  682. }
  683. }
  684. }
  685. }
  686. void Debugger::CheckDatasetGraph() {
  687. // print parameter node names
  688. MS_EXCEPTION_IF_NULL(graph_ptr_);
  689. const auto &params = graph_ptr_->inputs();
  690. for (const auto &param : params) {
  691. MS_LOG(INFO) << "param: " << GetKernelNodeName(param);
  692. }
  693. // check if there is GetNext or InitDataSetQueue node
  694. const auto &nodes = graph_ptr_->execution_order();
  695. for (const auto &node : nodes) {
  696. auto node_name = AnfAlgo::GetCNodeName(node);
  697. MS_LOG(INFO) << "node: " << GetKernelNodeName(node);
  698. if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
  699. MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
  700. << node_name;
  701. is_dataset_graph_ = true;
  702. return;
  703. }
  704. }
  705. is_dataset_graph_ = false;
  706. }
  707. GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  708. // convert kernel graph to debugger modelproto
  709. ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  710. return model.graph();
  711. }
  712. /*
  713. * Feature group: Online debugger.
  714. * Target device group: Ascend, GPU.
  715. * Runtime category: Old runtime, MindRT.
  716. * Description: Send debugger backend heartbeat to online debugger every few seconds.
  717. */
  718. void Debugger::SendHeartbeat(int32_t period) {
  719. int num_heartbeat_fail = 0;
  720. const int max_num_heartbeat_fail = 5;
  721. const int retry_milliseconds = 500;
  722. Heartbeat heartbeat;
  723. heartbeat.set_message("Debugger is alive");
  724. heartbeat.set_period(heartbeat_period_second);
  725. SetEnableHeartbeat(CheckDebuggerEnabled());
  726. while (enable_heartbeat_) {
  727. MS_EXCEPTION_IF_NULL(grpc_client_);
  728. EventReply reply = grpc_client_->SendHeartbeat(heartbeat);
  729. if (reply.status() != EventReply::OK) {
  730. MS_LOG(ERROR) << "Error: SendHeartbeat failed";
  731. num_heartbeat_fail++;
  732. if (num_heartbeat_fail >= max_num_heartbeat_fail) {
  733. MS_LOG(ERROR) << "Maximum number of failure for SendHeartbeat reached : exiting training session.";
  734. SetEnableHeartbeat(false);
  735. break;
  736. } else {
  737. MS_LOG(ERROR) << "Number of consecutive SendHeartbeat fail:" << num_heartbeat_fail;
  738. std::this_thread::sleep_for(std::chrono::milliseconds(retry_milliseconds));
  739. }
  740. } else {
  741. int recheck_period_ms = 200;
  742. for (int i = 0; i < (period * 1000 / recheck_period_ms); i++) {
  743. if (enable_heartbeat_) {
  744. std::this_thread::sleep_for(std::chrono::milliseconds(recheck_period_ms));
  745. } else {
  746. break;
  747. }
  748. }
  749. }
  750. }
  751. }
  752. void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  753. if (!CheckSendMetadata()) {
  754. return;
  755. }
  756. // send graph to MindInsight server
  757. MS_EXCEPTION_IF_NULL(grpc_client_);
  758. EventReply reply = grpc_client_->SendGraph(graph_proto);
  759. if (reply.status() != EventReply::OK) {
  760. MS_LOG(ERROR) << "Error: SendGraph failed";
  761. }
  762. // enter command loop, wait and process commands
  763. CommandLoop();
  764. }
  765. bool Debugger::SendMetadata(bool version_check) {
  766. // prepare metadata
  767. MS_EXCEPTION_IF_NULL(graph_ptr_);
  768. std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
  769. Metadata metadata;
  770. metadata.set_device_name(device_name);
  771. metadata.set_cur_step(num_step_);
  772. metadata.set_backend(device_target_);
  773. metadata.set_cur_node(cur_name_);
  774. metadata.set_training_done(training_done_);
  775. metadata.set_ms_version(version_);
  776. MS_LOG(INFO) << "Is training done?" << training_done_;
  777. // set graph number to not_dataset_graph_sum_
  778. metadata.set_graph_num(not_dataset_graph_sum_);
  779. MS_EXCEPTION_IF_NULL(grpc_client_);
  780. EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
  781. bool ret = false;
  782. if (reply_metadata.status() == EventReply::OK) {
  783. if (version_check) {
  784. // get type of the command in meta data reply, it should be version matched
  785. DebuggerCommand cmd = GetCommand(reply_metadata);
  786. if (cmd != DebuggerCommand::kVersionMatchedCMD) {
  787. MS_LOG(ERROR) << "MindInsight version is too old, Mindspore version is " << version_;
  788. Exit();
  789. } else {
  790. if (GetMiVersionMatched(reply_metadata)) {
  791. MS_LOG(INFO) << "MindSpore version is " << version_ << " matches MindInsight version.";
  792. ret = true;
  793. } else {
  794. MS_LOG(ERROR) << "MindSpore version " << version_ << ", did not match MindInsight version.";
  795. CommandLoop();
  796. }
  797. }
  798. } else {
  799. // version check is done before so we can just return true here
  800. ret = true;
  801. }
  802. } else {
  803. MS_LOG(ERROR) << "Error: SendMetadata failed";
  804. }
  805. return ret;
  806. }
  807. void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list) {
  808. if (!CheckSendMetadata()) {
  809. return;
  810. }
  811. MS_EXCEPTION_IF_NULL(grpc_client_);
  812. // send multiple graphs to mindinght server
  813. // split graph into chunks if one graph is larger than chunk size
  814. std::list<Chunk> chunked_graph_proto_list;
  815. Chunk chunk;
  816. for (auto graph : graph_proto_list) {
  817. std::string str = graph.SerializeAsString();
  818. auto graph_size = graph.ByteSize();
  819. if (graph_size > g_chunk_size) {
  820. auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);
  821. for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
  822. chunk.set_buffer(sub_graph_str[i]);
  823. if (i < sub_graph_str.size() - 1) {
  824. chunk.set_finished(false);
  825. } else {
  826. chunk.set_finished(true);
  827. }
  828. chunked_graph_proto_list.push_back(chunk);
  829. }
  830. } else {
  831. chunk.set_buffer(str);
  832. chunk.set_finished(true);
  833. chunked_graph_proto_list.push_back(chunk);
  834. }
  835. }
  836. EventReply reply = grpc_client_->SendMultiGraphs(chunked_graph_proto_list);
  837. if (reply.status() != EventReply::OK) {
  838. MS_LOG(ERROR) << "Error: SendGraph failed";
  839. }
  840. // enter command loop, wait and process commands
  841. CommandLoop();
  842. }
  843. bool Debugger::CheckSendMetadata() {
  844. if (!send_metadata_done_) {
  845. if (!SendMetadata(true)) {
  846. return false;
  847. }
  848. send_metadata_done_ = true;
  849. }
  850. return true;
  851. }
  852. void Debugger::CommandLoop() {
  853. // prepare metadata
  854. MS_EXCEPTION_IF_NULL(graph_ptr_);
  855. std::string device_name = std::to_string(device_id_) + ":" + std::to_string(cur_root_graph_id_);
  856. Metadata metadata;
  857. metadata.set_device_name(device_name);
  858. metadata.set_cur_step(num_step_);
  859. metadata.set_backend(device_target_);
  860. metadata.set_cur_node(cur_name_);
  861. metadata.set_training_done(training_done_);
  862. // loop exit flag
  863. bool run = false;
  864. int num_wait_fail = 0;
  865. const int max_num_wait_fail = 5;
  866. while (!run) {
  867. // wait for command
  868. MS_EXCEPTION_IF_NULL(grpc_client_);
  869. EventReply reply = grpc_client_->WaitForCommand(metadata);
  870. if (reply.status() != EventReply::OK) {
  871. MS_LOG(ERROR) << "Error: WaitForCommand failed";
  872. num_wait_fail++;
  873. if (num_wait_fail > max_num_wait_fail) {
  874. MS_LOG(ERROR) << "Maximum number of WaitForCommand retry reached: exiting training session.";
  875. MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config "
  876. "of debugger host and port.";
  877. Exit();
  878. run = true;
  879. } else {
  880. MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after "
  881. << num_wait_fail << "s";
  882. std::this_thread::sleep_for(std::chrono::seconds(num_wait_fail));
  883. }
  884. continue;
  885. }
  886. // get type of the command in reply
  887. DebuggerCommand cmd = GetCommand(reply);
  888. if (cmd == DebuggerCommand::kUnknownCMD) {
  889. MS_LOG(DEBUG) << "Debug: debugger received unknown command";
  890. continue;
  891. }
  892. MS_LOG(INFO) << "received command: ";
  893. switch (cmd) {
  894. case DebuggerCommand::kUnknownCMD:
  895. MS_LOG(INFO) << "UnknownCMD";
  896. break;
  897. case DebuggerCommand::kExitCMD:
  898. MS_LOG(INFO) << "ExitCMD";
  899. Exit(true);
  900. // Used for debugger termination
  901. run = true;
  902. break;
  903. case DebuggerCommand::kRunCMD:
  904. ProcessRunCMD(reply);
  905. if (GetRunLevel(reply) != "recheck") {
  906. // exit loop
  907. run = true;
  908. }
  909. break;
  910. case DebuggerCommand::kSetCMD:
  911. ProcessKSetCMD(reply);
  912. break;
  913. case DebuggerCommand::kViewCMD:
  914. ProcessKViewCMD(reply);
  915. break;
  916. case DebuggerCommand::kVersionMatchedCMD:
  917. MS_LOG(ERROR) << "Received unexpected Version Matched CMD from MindInsight.";
  918. Exit();
  919. break;
  920. default:
  921. MS_LOG(ERROR) << "Received unknown CMD from MindInsight";
  922. Exit();
  923. break;
  924. }
  925. }
  926. }
  927. void Debugger::ProcessRunCMD(const EventReply &reply) {
  928. MS_LOG(INFO) << "RunCMD";
  929. if (GetRunLevel(reply) == "recheck") {
  930. MS_LOG(INFO) << "rechecking all watchpoints";
  931. SendWatchpoints(CheckWatchpoints("", nullptr, true));
  932. } else {
  933. // no longer the initial suspension.
  934. initial_suspend_ = false;
  935. // print run cmd content
  936. // get run_level and node_name
  937. run_level_ = GetRunLevel(reply);
  938. node_name_ = GetNodeName(reply);
  939. MS_LOG(INFO) << "run_level: " << run_level_;
  940. MS_LOG(INFO) << "node_name_: " << node_name_;
  941. }
  942. }
  943. void Debugger::ProcessKSetCMD(const EventReply &reply) {
  944. MS_LOG(INFO) << "SetCMD";
  945. MS_LOG(INFO) << "id: " << GetWatchpointID(reply);
  946. MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply);
  947. if (GetWatchpointDelete(reply)) {
  948. MS_LOG(INFO) << "Deleting watchpoint";
  949. RemoveWatchpoint(GetWatchpointID(reply));
  950. } else {
  951. MS_LOG(INFO) << "Setting watchpoint";
  952. MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition();
  953. ProtoVector<WatchNode> recieved_nodes = GetWatchnodes(reply);
  954. for (const auto &node : recieved_nodes) {
  955. MS_LOG(INFO) << "node name: " << node.node_name();
  956. MS_LOG(INFO) << "node type: " << node.node_type();
  957. }
  958. ProtoVector<WatchCondition_Parameter> parameters = GetParameters(reply);
  959. for (const auto &parameter : parameters) {
  960. MS_LOG(INFO) << "parameter name: " << parameter.name();
  961. MS_LOG(INFO) << "parameter is disabled: " << parameter.disabled();
  962. MS_LOG(INFO) << "parameter value: " << parameter.value();
  963. }
  964. SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply));
  965. }
  966. }
  967. void Debugger::ProcessKViewCMD(const EventReply &reply) {
  968. MS_LOG(INFO) << "ViewCMD";
  969. // print view cmd content
  970. ProtoVector<TensorProto> received_tensors = GetTensors(reply);
  971. for (auto received_tensor : received_tensors) {
  972. MS_LOG(INFO) << "tensor node name: " << received_tensor.node_name();
  973. MS_LOG(INFO) << "tensor slot: " << received_tensor.slot();
  974. MS_LOG(INFO) << "tensor finished: " << std::boolalpha << received_tensor.finished() << std::noboolalpha;
  975. MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
  976. MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
  977. }
  978. switch (reply.view_cmd().level()) {
  979. case debugger::ViewCMD_Level::ViewCMD_Level_base:
  980. MS_LOG(INFO) << "Tensor base request.";
  981. ViewBaseLevel(reply);
  982. break;
  983. case debugger::ViewCMD_Level::ViewCMD_Level_statistics:
  984. MS_LOG(INFO) << "Tensor statistics request.";
  985. ViewStatLevel(reply);
  986. break;
  987. case debugger::ViewCMD_Level::ViewCMD_Level_value:
  988. MS_LOG(INFO) << "Tensor value request.";
  989. ViewValueLevel(reply);
  990. break;
  991. default:
  992. MS_LOG(DEBUG) << "Debug: Unknown tensor info level";
  993. break;
  994. }
  995. }
  996. void Debugger::ViewValueLevel(const EventReply &reply) {
  997. MS_LOG(INFO) << "Sending tensors";
  998. std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
  999. // print view cmd reply
  1000. for (auto tensor : tensors) {
  1001. MS_LOG(INFO) << "tensor node name: " << tensor.node_name();
  1002. MS_LOG(INFO) << "tensor slot: " << tensor.slot();
  1003. MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor.finished() << std::noboolalpha;
  1004. MS_LOG(INFO) << "tensor iter: " << tensor.iter();
  1005. MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor.truncate() << std::noboolalpha;
  1006. MS_LOG(INFO) << "tensor dims: ";
  1007. for (auto dim : tensor.dims()) {
  1008. MS_LOG(INFO) << dim << ",";
  1009. }
  1010. MS_LOG(INFO) << "tensor dtype: " << tensor.data_type();
  1011. }
  1012. MS_EXCEPTION_IF_NULL(grpc_client_);
  1013. EventReply send_tensors_reply = grpc_client_->SendTensors(tensors);
  1014. if (send_tensors_reply.status() != debugger::EventReply::OK) {
  1015. MS_LOG(ERROR) << "Error: SendTensors failed";
  1016. }
  1017. }
  1018. void Debugger::ViewStatLevel(const EventReply &reply) {
  1019. std::list<TensorSummary> tensor_stats_list = LoadTensorsStat(GetTensors(reply));
  1020. EventReply send_tensors_stat_reply = grpc_client_->SendTensorStats(tensor_stats_list);
  1021. if (send_tensors_stat_reply.status() != debugger::EventReply::OK) {
  1022. MS_LOG(ERROR) << "Error: SendTensorsStats failed.";
  1023. }
  1024. }
  1025. void Debugger::ViewBaseLevel(const EventReply &reply) {
  1026. std::list<TensorBase> tensor_base_list = LoadTensorsBase(GetTensors(reply));
  1027. EventReply send_tensor_base_reply = grpc_client_->SendTensorBase(tensor_base_list);
  1028. if (send_tensor_base_reply.status() != debugger::EventReply::OK) {
  1029. MS_LOG(ERROR) << "Error: SendTensorsBase failed.";
  1030. }
  1031. }
  1032. void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
  1033. tensor_item->set_node_name(tensor.node_name());
  1034. tensor_item->set_slot(tensor.slot());
  1035. tensor_item->set_iter(tensor.iter());
  1036. tensor_item->set_truncate(tensor.truncate());
  1037. tensor_item->clear_tensor_content();
  1038. tensor_item->clear_data_type();
  1039. tensor_item->clear_dims();
  1040. }
  1041. void AddTensorStatInfo(const DebugServices::TensorStat &tensor_stat,
  1042. std::list<TensorSummary> *const tensor_summary_list) {
  1043. if (tensor_summary_list == nullptr) {
  1044. MS_LOG(DEBUG) << "tensor_summary_list is nullptr.";
  1045. return;
  1046. }
  1047. TensorSummary tensor_summary_item;
  1048. TensorBase *tensor_base = tensor_summary_item.mutable_tensor_base();
  1049. tensor_base->set_data_type(tensor_stat.dtype);
  1050. tensor_base->set_data_size((int64_t)tensor_stat.data_size);
  1051. for (auto elem : tensor_stat.shape) {
  1052. tensor_base->add_shape(elem);
  1053. }
  1054. Statistics *tensor_statistics = tensor_summary_item.mutable_statistics();
  1055. tensor_statistics->set_is_bool(tensor_stat.is_bool);
  1056. tensor_statistics->set_max_value(static_cast<float>(tensor_stat.max_value));
  1057. tensor_statistics->set_min_value(static_cast<float>(tensor_stat.min_value));
  1058. tensor_statistics->set_avg_value(static_cast<float>(tensor_stat.avg_value));
  1059. tensor_statistics->set_count(SizeToInt(tensor_stat.count));
  1060. tensor_statistics->set_neg_zero_count(SizeToInt(tensor_stat.neg_zero_count));
  1061. tensor_statistics->set_pos_zero_count(SizeToInt(tensor_stat.pos_zero_count));
  1062. tensor_statistics->set_nan_count(SizeToInt(tensor_stat.nan_count));
  1063. tensor_statistics->set_neg_inf_count(SizeToInt(tensor_stat.neg_inf_count));
  1064. tensor_statistics->set_pos_inf_count(SizeToInt(tensor_stat.pos_inf_count));
  1065. tensor_statistics->set_zero_count(SizeToInt(tensor_stat.zero_count));
  1066. tensor_summary_list->push_back(tensor_summary_item);
  1067. }
  1068. void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
  1069. const ProtoVector<WatchCondition_Parameter> &parameters) {
  1070. std::vector<std::tuple<std::string, bool>> check_node_list;
  1071. std::vector<DebugServices::parameter_t> parameter_list;
  1072. std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list),
  1073. [](const WatchNode &node) -> std::tuple<std::string, bool> {
  1074. return make_tuple(node.node_name(), node.node_type() == "scope");
  1075. });
  1076. std::transform(
  1077. parameters.begin(), parameters.end(), std::back_inserter(parameter_list),
  1078. [](const WatchCondition_Parameter &parameter) -> DebugServices::parameter_t {
  1079. return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
  1080. });
  1081. debug_services_->AddWatchpoint(id, condition.condition(), condition.value(), check_node_list, parameter_list);
  1082. }
  1083. void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }
  1084. std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
  1085. std::vector<std::string> name;
  1086. std::vector<std::string> ret_name;
  1087. std::vector<const char *> data_ptr;
  1088. std::vector<ssize_t> data_size;
  1089. std::vector<unsigned int> dtype;
  1090. std::vector<std::vector<int64_t>> shape;
  1091. std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  1092. // ret_name will contain tensor names that are found in TensorLoader
  1093. // items in ret_name will be in the same order with tensors if found
  1094. debug_services_->ReadNodesTensors(name, &ret_name, &data_ptr, &data_size, &dtype, &shape);
  1095. std::list<TensorProto> tensor_list;
  1096. size_t result_index = 0;
  1097. for (auto tensor : tensors) {
  1098. ssize_t size_iter = 0;
  1099. if (result_index >= ret_name.size() || ret_name[result_index] != GetTensorFullName(tensor)) {
  1100. TensorProto tensor_item;
  1101. tensor_item.set_finished(true);
  1102. AddTensorProtoInfo(&tensor_item, tensor);
  1103. tensor_list.push_back(tensor_item);
  1104. continue;
  1105. }
  1106. ssize_t tensor_size = data_size[result_index];
  1107. while (size_iter < tensor_size) {
  1108. ssize_t chunk_size = g_chunk_size;
  1109. TensorProto tensor_item;
  1110. tensor_item.set_finished(false);
  1111. if (tensor_size - size_iter <= g_chunk_size) {
  1112. chunk_size = tensor_size - size_iter;
  1113. tensor_item.set_finished(true);
  1114. }
  1115. AddTensorProtoInfo(&tensor_item, tensor);
  1116. // return empty tensor if didn't find the requested tensor
  1117. tensor_item.set_tensor_content(data_ptr[result_index] + size_iter, chunk_size);
  1118. tensor_item.set_data_type((debugger::DataType)dtype[result_index]);
  1119. for (auto &elem : shape[result_index]) {
  1120. tensor_item.add_dims(elem);
  1121. }
  1122. // add tensor to result list and increment result_index to check next item in ret_name
  1123. tensor_list.push_back(tensor_item);
  1124. if (size_iter > INT_MAX - g_chunk_size) {
  1125. MS_EXCEPTION(ValueError) << size_iter << " + " << g_chunk_size << " would lead to integer overflow!";
  1126. }
  1127. size_iter += g_chunk_size;
  1128. }
  1129. result_index++;
  1130. }
  1131. return tensor_list;
  1132. }
  1133. std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const {
  1134. std::list<TensorBase> tensor_base_list;
  1135. std::vector<std::string> name;
  1136. std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  1137. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1138. debug_services_->SearchNodesTensors(name, &result_list);
  1139. for (auto result : result_list) {
  1140. auto tensor = std::get<1>(result);
  1141. if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
  1142. MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
  1143. // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor base.
  1144. TensorBase tensor_base_item;
  1145. tensor_base_item.set_data_size(0);
  1146. tensor_base_item.set_data_type(0);
  1147. tensor_base_item.add_shape(0);
  1148. tensor_base_list.push_back(tensor_base_item);
  1149. continue;
  1150. }
  1151. // tensor was found creating tensor base object.
  1152. TensorBase tensor_base_item;
  1153. tensor_base_item.set_data_size((int64_t)tensor->GetByteSize());
  1154. tensor_base_item.set_data_type((int32_t)tensor->GetType());
  1155. for (auto elem : tensor->GetShape()) {
  1156. tensor_base_item.add_shape(elem);
  1157. }
  1158. tensor_base_list.push_back(tensor_base_item);
  1159. }
  1160. return tensor_base_list;
  1161. }
  1162. std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const {
  1163. std::list<TensorSummary> tensor_summary_list;
  1164. std::vector<std::string> name;
  1165. std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  1166. std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  1167. debug_services_->SearchNodesTensors(name, &result_list);
  1168. for (auto result : result_list) {
  1169. auto tensor = std::get<1>(result);
  1170. if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
  1171. MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
  1172. // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor summary.
  1173. DebugServices::TensorStat tensor_stat;
  1174. AddTensorStatInfo(tensor_stat, &tensor_summary_list);
  1175. continue;
  1176. }
  1177. // tensor was found creating tensor summary object.
  1178. DebugServices::TensorStat tensor_stat = DebugServices::GetTensorStatistics(tensor);
  1179. AddTensorStatInfo(tensor_stat, &tensor_summary_list);
  1180. }
  1181. return tensor_summary_list;
  1182. }
  1183. std::shared_ptr<TensorData> Debugger::GetTensor(const std::string &tensor_name) const {
  1184. return debug_services_->GetTensor(tensor_name);
  1185. }
  1186. void Debugger::Exit(bool exit_success) {
  1187. // debugger will notify main thread to exit because main thread can only exit at step boundary.
  1188. MS_LOG(INFO) << "Exit Debugger";
  1189. SetEnableHeartbeat(false);
  1190. pipeline::GraphExecutorPy::DebugTerminate(true, exit_success);
  1191. }
  1192. std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel,
  1193. bool recheck) {
  1194. std::vector<std::string> name;
  1195. std::vector<std::string> slot;
  1196. std::vector<int> condition;
  1197. std::vector<unsigned int> watchpoint_id;
  1198. std::vector<std::string> overflow_ops;
  1199. std::vector<std::vector<DebugServices::parameter_t>> parameters;
  1200. std::vector<int32_t> error_codes;
  1201. std::vector<std::shared_ptr<TensorData>> tensor_list;
  1202. if (watchnode.empty()) {
  1203. tensor_list = debug_services_->GetTensor();
  1204. } else {
  1205. tensor_list = debug_services_->GetNodeTensor(kernel);
  1206. }
  1207. DebugServices::AsyncFilePool file_list;
  1208. MS_LOG(INFO) << "checkwatchpoints call for step " << num_step_;
  1209. debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
  1210. file_list, &tensor_list, initial_suspend_, watchnode.empty(), recheck);
  1211. std::list<WatchpointHit> hits;
  1212. for (unsigned int i = 0; i < name.size(); i++) {
  1213. WatchpointHit hit;
  1214. std::vector<DebugServices::parameter_t> &parameter = parameters[i];
  1215. hit.set_id(watchpoint_id[i]);
  1216. hit.set_error_code(error_codes[i]);
  1217. // here TensorProto act as a tensor indicator, not sending tensor content
  1218. TensorProto *tensor_item = hit.mutable_tensor();
  1219. tensor_item->set_node_name(name[i]);
  1220. tensor_item->set_slot(slot[i]);
  1221. tensor_item->set_finished(true);
  1222. WatchCondition *condition_item = hit.mutable_watch_condition();
  1223. condition_item->set_condition(debugger::WatchCondition_Condition(condition[i]));
  1224. for (const auto &p : parameter) {
  1225. auto x = condition_item->mutable_params()->Add();
  1226. x->set_name(p.name);
  1227. x->set_disabled(p.disabled);
  1228. x->set_value(p.value);
  1229. x->set_hit(p.hit);
  1230. x->set_actual_value(p.actual_value);
  1231. }
  1232. hits.push_back(hit);
  1233. }
  1234. return hits;
  1235. }
  1236. void Debugger::SendWatchpoints(const std::list<WatchpointHit> &points) {
  1237. // send info about watchpoint
  1238. if (!points.empty()) {
  1239. MS_EXCEPTION_IF_NULL(grpc_client_);
  1240. EventReply reply = grpc_client_->SendWatchpointHits(points);
  1241. if (reply.status() != EventReply::OK) {
  1242. MS_LOG(ERROR) << "Error: SendWatchpointHits failed";
  1243. }
  1244. }
  1245. }
  1246. bool Debugger::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
  1247. const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
  1248. TypeId device_type, const std::string &addr_format, size_t slot) const {
  1249. return debug_services_.get()->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
  1250. device_type, addr_format, slot);
  1251. }
  1252. bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  1253. if (debug_services_ != nullptr) {
  1254. return debug_services_.get()->LoadNewTensor(tensor, keep_prev);
  1255. }
  1256. return false;
  1257. }
  1258. bool Debugger::debugger_enabled() const { return debugger_enabled_; }
  1259. DebuggerCommand GetCommand(const EventReply &reply) {
  1260. DebuggerCommand cmd = DebuggerCommand::kUnknownCMD;
  1261. switch (reply.cmd_case()) {
  1262. case debugger::EventReply::CmdCase::kExit:
  1263. cmd = DebuggerCommand::kExitCMD;
  1264. break;
  1265. case debugger::EventReply::CmdCase::kRunCmd:
  1266. cmd = DebuggerCommand::kRunCMD;
  1267. break;
  1268. case debugger::EventReply::CmdCase::kSetCmd:
  1269. cmd = DebuggerCommand::kSetCMD;
  1270. break;
  1271. case debugger::EventReply::CmdCase::kViewCmd:
  1272. cmd = DebuggerCommand::kViewCMD;
  1273. break;
  1274. case debugger::EventReply::CmdCase::kVersionMatched:
  1275. cmd = DebuggerCommand::kVersionMatchedCMD;
  1276. break;
  1277. default:
  1278. MS_LOG(DEBUG) << "Debug: UnknownCMD";
  1279. break;
  1280. }
  1281. return cmd;
  1282. }
  1283. ProtoVector<WatchCondition_Parameter> GetParameters(const EventReply &reply) {
  1284. if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
  1285. MS_LOG(ERROR) << "Error: Can not get Parameters from command. Returning default value: ProtoVector<Parameter>().";
  1286. return ProtoVector<WatchCondition_Parameter>();
  1287. }
  1288. return reply.set_cmd().watch_condition().params();
  1289. }
  1290. ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply) {
  1291. if (!reply.has_set_cmd()) {
  1292. MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>().";
  1293. return ProtoVector<WatchNode>();
  1294. }
  1295. return reply.set_cmd().watch_nodes();
  1296. }
  1297. std::string GetRunLevel(const EventReply &reply) {
  1298. if (!reply.has_run_cmd()) {
  1299. MS_LOG(ERROR) << "Error: Not RunCMD, can not get RunLevel. Returning default value: "
  1300. "";
  1301. return "";
  1302. }
  1303. return reply.run_cmd().run_level();
  1304. }
  1305. std::string GetNodeName(const EventReply &reply) {
  1306. if (!reply.has_run_cmd()) {
  1307. MS_LOG(ERROR) << "Error: Not RunCMD, can not get NodeName. Returning default value: "
  1308. "";
  1309. return "";
  1310. }
  1311. return reply.run_cmd().node_name();
  1312. }
  1313. WatchCondition GetWatchcondition(const EventReply &reply) {
  1314. if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
  1315. MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition().";
  1316. return WatchCondition();
  1317. }
  1318. return reply.set_cmd().watch_condition();
  1319. }
  1320. int32_t GetWatchpointID(const EventReply &reply) {
  1321. if (!reply.has_set_cmd()) {
  1322. MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint ID. Returning default value: 0.";
  1323. return 0;
  1324. }
  1325. return reply.set_cmd().id();
  1326. }
  1327. bool GetWatchpointDelete(const EventReply &reply) {
  1328. if (!reply.has_set_cmd()) {
  1329. MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint delete flag. Returning default value: false.";
  1330. return false;
  1331. }
  1332. return reply.set_cmd().delete_();
  1333. }
  1334. ProtoVector<TensorProto> GetTensors(const EventReply &reply) {
  1335. if (!reply.has_view_cmd()) {
  1336. MS_LOG(ERROR) << "Error: Not ViewCMD, can not get Tensors. Returning default value: ProtoVector<TensorProto>().";
  1337. return ProtoVector<TensorProto>();
  1338. }
  1339. return reply.view_cmd().tensors();
  1340. }
  1341. std::string GetTensorFullName(const TensorProto &tensor) {
  1342. string node_name = tensor.node_name();
  1343. if (tensor.truncate()) {
  1344. // scopes in node name are separated by '/'
  1345. // use the name without scope if truncate is true
  1346. std::size_t found = node_name.find_last_of("/");
  1347. node_name = node_name.substr(found + 1);
  1348. }
  1349. return node_name + ":" + tensor.slot() + (tensor.iter() == "" ? "" : ":" + tensor.iter());
  1350. }
  1351. bool GetMiVersionMatched(const EventReply &reply) { return reply.version_matched(); }
  1352. bool Debugger::partial_memory() const { return partial_memory_; }
  1353. void Debugger::SetEnableHeartbeat(bool enabled) { enable_heartbeat_ = enabled; }
  1354. void Debugger::SetCurNode(const std::string &cur_name) {
  1355. // access lock for public method
  1356. std::lock_guard<std::mutex> a_lock(access_lock_);
  1357. cur_name_ = cur_name;
  1358. }
  1359. std::string Debugger::run_level() const { return run_level_; }
  1360. void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }
  1361. bool Debugger::CheckPort(const std::string &port) const {
  1362. int num = 0;
  1363. const int min_port_num = 1;
  1364. const int max_port_num = 65535;
  1365. const int decimal = 10;
  1366. if (port[0] == '0' && port[1] != '\0') return false;
  1367. int i = 0;
  1368. while (port[i] != '\0') {
  1369. if (port[i] < '0' || port[i] > '9') return false;
  1370. num = num * decimal + (port[i] - '0');
  1371. if (num > max_port_num) return false;
  1372. i++;
  1373. }
  1374. if (num < min_port_num) return false;
  1375. return true;
  1376. }
  1377. bool Debugger::CheckIp(const std::string &host) const {
  1378. std::regex reg_ip(
  1379. "(25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])"
  1380. "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
  1381. "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
  1382. "[.](25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])");
  1383. std::smatch smat;
  1384. std::string host_str = host;
  1385. return std::regex_match(host_str, smat, reg_ip);
  1386. }
  1387. uint32_t Debugger::GetFirstRunGraphId() const { return rungraph_id_list_.front(); }
  1388. /*
  1389. * Feature group: Dump.
  1390. * Target device group: Ascend, GPU.
  1391. * Runtime category: Old runtime, MindRT.
  1392. * Description: Load a single parameter or value node.
  1393. */
  1394. void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id) {
  1395. MS_EXCEPTION_IF_NULL(anf_node);
  1396. if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
  1397. return;
  1398. }
  1399. // When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
  1400. if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  1401. if (!anf_node->isa<ValueNode>() &&
  1402. !(anf_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
  1403. return;
  1404. }
  1405. }
  1406. // for parameters and value nodes, set its execution order to be 0;
  1407. int exec_order = 0;
  1408. std::string node_name = GetKernelNodeName(anf_node);
  1409. GetFileKernelName(NOT_NULL(&node_name));
  1410. // check if output adde exists, if not, return;
  1411. if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
  1412. return;
  1413. }
  1414. auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
  1415. MS_EXCEPTION_IF_NULL(addr);
  1416. auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
  1417. if (!IsTypeDebuggerSupported(type)) {
  1418. return;
  1419. }
  1420. auto format = kOpFormat_DEFAULT;
  1421. string tensor_name = node_name + ':' + "0";
  1422. ShapeVector int_shapes = trans::GetRuntimePaddingShape(anf_node, output_index);
  1423. bool keep_prev;
  1424. if (anf_node->isa<Parameter>()) {
  1425. keep_prev = true;
  1426. debug_services_->MoveTensorCurrentToPrev(tensor_name);
  1427. } else {
  1428. keep_prev = false;
  1429. }
  1430. bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id);
  1431. if (!ret) {
  1432. MS_LOG(ERROR) << "LoadMemToHost:"
  1433. << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
  1434. }
  1435. }
  1436. void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) {
  1437. MS_EXCEPTION_IF_NULL(node);
  1438. auto root_graph_id = cur_root_graph_id_;
  1439. // This function is only for loading parameters mindRT.
  1440. std::string node_name = GetKernelNodeName(node);
  1441. GetFileKernelName(NOT_NULL(&node_name));
  1442. TypeId type;
  1443. TypeId device_type;
  1444. ShapeVector int_shapes;
  1445. auto device_addr = GetParameterInfo(node, NOT_NULL(&int_shapes), NOT_NULL(&type), NOT_NULL(&device_type));
  1446. if (device_addr == nullptr) {
  1447. MS_LOG(DEBUG) << "Skip node: " << node_name << ". Parameter data is not available for mindRT.";
  1448. return;
  1449. }
  1450. if (!IsTypeDebuggerSupported(type)) {
  1451. return;
  1452. }
  1453. auto format = kOpFormat_DEFAULT;
  1454. string tensor_name = node_name + ':' + "0";
  1455. if (debug_services_ != nullptr) {
  1456. debug_services_->MoveTensorCurrentToPrev(tensor_name);
  1457. }
  1458. // Keep_prev is True for parameters.
  1459. bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id);
  1460. if (!ret) {
  1461. MS_LOG(ERROR) << "LoadMemToHost:"
  1462. << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
  1463. }
  1464. }
  1465. /*
  1466. * Feature group: Dump, Online debugger.
  1467. * Target device group: Ascend, GPU.
  1468. * Runtime category: Old runtime, MindRT.
  1469. * Description: Load all the parameters and value nodes for the last loaded graph.
  1470. */
  1471. void Debugger::LoadParametersAndConst() {
  1472. if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
  1473. MS_EXCEPTION_IF_NULL(graph_ptr_);
  1474. // load parameters
  1475. MS_LOG(INFO) << "Start to load Parameters for graph " << graph_ptr_->graph_id() << ".";
  1476. auto root_graph_id = graph_ptr_->root_graph_id();
  1477. const auto &parameters = graph_ptr_->inputs();
  1478. for (auto &item : parameters) {
  1479. LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX, root_graph_id);
  1480. }
  1481. // load value nodes
  1482. // get all constant values from the graph
  1483. MS_LOG(INFO) << "Start to load value nodes for graph " << graph_ptr_->graph_id() << ".";
  1484. const auto value_nodes = graph_ptr_->graph_value_nodes();
  1485. for (auto &item : value_nodes) {
  1486. LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id);
  1487. }
  1488. }
  1489. /*
  1490. * Feature group: Dump, Online debugger.
  1491. * Target device group: Ascend, GPU.
  1492. * Runtime category: Old runtime, MindRT.
  1493. * Description: Load all the parameters and value nodes for the given graph.
  1494. */
  1495. void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
  1496. if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
  1497. MS_EXCEPTION_IF_NULL(graph);
  1498. // load parameters
  1499. MS_LOG(INFO) << "Start to load Parameters for graph " << graph->graph_id() << ".";
  1500. auto root_graph_id = graph->root_graph_id();
  1501. const auto &parameters = graph->inputs();
  1502. for (auto &item : parameters) {
  1503. LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX, root_graph_id);
  1504. }
  1505. // load value nodes
  1506. // get all constant values from the graph
  1507. MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  1508. const auto value_nodes = graph->graph_value_nodes();
  1509. for (auto &item : value_nodes) {
  1510. LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id);
  1511. }
  1512. }
  1513. /*
  1514. * Feature group: Dump.
  1515. * Target device group: GPU.
  1516. * Runtime category: MindRT.
  1517. * Description: This function is for loading parameters' data from device to host into tensor_list_map_ for GPU dump.
  1518. * Ascend does not use tensor_map_list_ for dump so it is not needed for ascend dump.
  1519. */
  1520. void Debugger::LoadParametersAllGraphs() {
  1521. if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
  1522. return;
  1523. }
  1524. for (auto &node : parameters_mindRT_) {
  1525. LoadSingleParameterMindRT(node);
  1526. }
  1527. }
  1528. /*
  1529. * Feature group: Dump.
  1530. * Target device group: GPU.
  1531. * Runtime category: MindRT.
  1532. * Description: This function is for loading constant data from device to host into tensor_list_map_ for GPU dump.
  1533. * Ascend does not use tensor_map_list_ for dump so it is not needed for ascend dump.
  1534. */
  1535. void Debugger::LoadConstsForGraph(const KernelGraphPtr &graph) {
  1536. if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
  1537. return;
  1538. }
  1539. // load value nodes
  1540. // get all constant values from the graph
  1541. MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  1542. auto root_graph_id = graph->root_graph_id();
  1543. const auto value_nodes = graph->graph_value_nodes();
  1544. for (auto &item : value_nodes) {
  1545. LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id);
  1546. }
  1547. }
  1548. /*
  1549. * Feature group: Online debugger.
  1550. * Target device group: Ascend.
  1551. * Runtime category: Old runtime, MindRT.
  1552. * Description: Load all the kernels for the last loaded graph.
  1553. */
  1554. void Debugger::LoadGraphOutputs() {
  1555. if (!(debugger_enabled() && device_target_ == kAscendDevice)) return;
  1556. MS_EXCEPTION_IF_NULL(graph_ptr_);
  1557. const auto &apply_kernels = graph_ptr_->execution_order();
  1558. auto root_graph_id = graph_ptr_->root_graph_id();
  1559. // for kernels, execution order starts from 1
  1560. int exec_order = 1;
  1561. for (const auto &node : apply_kernels) {
  1562. MS_EXCEPTION_IF_NULL(node);
  1563. std::string kernel_name = GetKernelNodeName(node);
  1564. auto output_size = AnfAlgo::GetOutputTensorNum(node);
  1565. if (partial_memory_) {
  1566. if (!debug_services_->IsWatchPoint(kernel_name, node)) {
  1567. continue;
  1568. }
  1569. }
  1570. for (size_t j = 0; j < output_size; ++j) {
  1571. if (!AnfAlgo::OutputAddrExist(node, j)) {
  1572. MS_LOG(INFO) << "Cannot find output addr for slot " << j << " for " << kernel_name;
  1573. continue;
  1574. }
  1575. auto addr = AnfAlgo::GetOutputAddr(node, j);
  1576. MS_EXCEPTION_IF_NULL(addr);
  1577. auto type = AnfAlgo::GetOutputInferDataType(node, j);
  1578. if (!IsTypeDebuggerSupported(type)) {
  1579. continue;
  1580. }
  1581. auto format = kOpFormat_DEFAULT;
  1582. string tensor_name = kernel_name + ':' + std::to_string(j);
  1583. ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j);
  1584. auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id);
  1585. if (!ret) {
  1586. MS_LOG(ERROR) << "LoadMemToHost:"
  1587. << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
  1588. }
  1589. }
  1590. exec_order = exec_order + 1;
  1591. }
  1592. }
  1593. /*
  1594. * Feature group: Online debugger.
  1595. * Target device group: GPU.
  1596. * Runtime category: Old runtime.
  1597. * Description: Update step number if we are processing the first graph (to support multigraph).
  1598. */
  1599. void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
  1600. MS_EXCEPTION_IF_NULL(graph);
  1601. MS_EXCEPTION_IF_NULL(debugger_);
  1602. if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()) &&
  1603. (graph->graph_id() == debugger_->GetFirstRunGraphId())) {
  1604. // access lock for public method
  1605. std::lock_guard<std::mutex> a_lock(access_lock_);
  1606. ++num_step_;
  1607. }
  1608. }
  1609. /*
  1610. * Feature group: Online debugger.
  1611. * Target device group: GPU.
  1612. * Runtime category: MindRT.
  1613. * Description: Update step number when DebugActor::DebugOnStepEnd is called at the end of each step.
  1614. */
  1615. void Debugger::UpdateStepNumGPU() {
  1616. auto &dump_json_parser = DumpJsonParser::GetInstance();
  1617. if (device_target_ == kGPUDevice && (debugger_enabled_ || dump_json_parser.DumpEnabledForIter())) {
  1618. // access lock for public method
  1619. std::lock_guard<std::mutex> a_lock(access_lock_);
  1620. ++num_step_;
  1621. MS_LOG(DEBUG) << "Update step for GPU, current step: " << num_step_;
  1622. }
  1623. }
  1624. void Debugger::ClearCurrentData() {
  1625. if ((device_target_ == kGPUDevice) && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration())) {
  1626. if (debug_services_) {
  1627. debug_services_->EmptyCurrentTensor();
  1628. } else {
  1629. MS_LOG(ERROR) << "debug_services_ is nullptr";
  1630. }
  1631. }
  1632. }
  1633. bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
  1634. if (debug_services_ != nullptr) {
  1635. return debug_services_->TensorExistsInCurrent(tensor_name);
  1636. }
  1637. return false;
  1638. }
  1639. #ifdef ENABLE_D
  1640. /*
  1641. * Feature group: Dump.
  1642. * Target device group: Ascend.
  1643. * Runtime category: Old runtime, MindRT.
  1644. * Description: Load DumpDataBuilder object from dump_data_construct_map_ for tracking data chunks of node_name. It's
  1645. * for Ascend a + m dump. If not found, create a new one for it and add to dump_data_construct_map_.
  1646. */
  1647. std::shared_ptr<DumpDataBuilder> Debugger::LoadDumpDataBuilder(const std::string &node_name) {
  1648. auto iter = dump_data_construct_map_.find(node_name);
  1649. if (iter == dump_data_construct_map_.end()) {
  1650. dump_data_construct_map_[node_name] = std::make_shared<DumpDataBuilder>();
  1651. }
  1652. return dump_data_construct_map_[node_name];
  1653. }
  1654. void Debugger::ClearDumpDataBuilder(const std::string &node_name) { (void)dump_data_construct_map_.erase(node_name); }
  1655. #endif
  1656. } // namespace mindspore