You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

profiling.cc 33 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "minddata/dataset/engine/perf/profiling.h"
  17. #include <sys/stat.h>
  18. #include <cstdlib>
  19. #include <fstream>
  20. #include <algorithm>
  21. #include "utils/ms_utils.h"
  22. #include "minddata/dataset/util/path.h"
  23. #ifdef ENABLE_GPUQUE
  24. #include "minddata/dataset/core/config_manager.h"
  25. #include "minddata/dataset/core/global_context.h"
  26. #endif
  27. #include "minddata/dataset/engine/perf/monitor.h"
  28. #include "minddata/dataset/engine/perf/device_queue_tracing.h"
  29. #include "minddata/dataset/engine/perf/connector_size.h"
  30. #include "minddata/dataset/engine/perf/cpu_sampler.h"
  31. #include "minddata/dataset/engine/perf/dataset_iterator_tracing.h"
  32. #include "minddata/dataset/engine/execution_tree.h"
  33. #include "minddata/dataset/engine/tree_adapter.h"
  34. #include "minddata/dataset/util/log_adapter.h"
namespace mindspore {
namespace dataset {
// Each pipeline step produces a group of four tracing records (see
// Tracing::Record, RECORDS_PER_STEP). These constants are the offsets of the
// individual records inside one step's group, used by the Get*Time/Size
// accessors below.
constexpr int32_t PUSH_TIME_OFFSET = 0;        // push-to-device time record
constexpr int32_t BATCH_TIME_OFFSET = 1;       // batch-fetch time record
constexpr int32_t PIPELINE_TIME_OFFSET = 2;    // pipeline time record
constexpr int32_t CONNECTOR_DEPTH_OFFSET = 3;  // connector size/capacity record
  41. Status Profiling::Start() {
  42. CHECK_FAIL_RETURN_UNEXPECTED(active_ == false, "Profiling node is already active.");
  43. active_ = true;
  44. return Status::OK();
  45. }
  46. Status Profiling::Stop() {
  47. CHECK_FAIL_RETURN_UNEXPECTED(active_ == true, "Profiling node is already deactivated.");
  48. active_ = false;
  49. return Status::OK();
  50. }
  51. Status Tracing::SaveToFile(const std::string &dir_path, const std::string &rank_id) {
  52. if (value_.empty()) {
  53. return Status::OK();
  54. }
  55. Path path = GetFileName(dir_path, rank_id);
  56. // Remove the file if it exists (from prior profiling usage)
  57. RETURN_IF_NOT_OK(path.Remove());
  58. std::string file_path = path.ToString();
  59. MS_LOG(INFO) << "Start to save profiling data for a tracing node.";
  60. std::ofstream handle(file_path, std::ios::trunc);
  61. if (!handle.is_open()) {
  62. RETURN_STATUS_UNEXPECTED("Profiling file can not be opened.");
  63. }
  64. for (auto value : value_) {
  65. handle << value << "\n";
  66. }
  67. handle.close();
  68. return Status::OK();
  69. }
  70. Status Tracing::ChangeFileMode(const std::string &dir_path, const std::string &rank_id) {
  71. if (value_.empty()) {
  72. return Status::OK();
  73. }
  74. Path path = GetFileName(dir_path, rank_id);
  75. std::string file_path = path.ToString();
  76. if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) {
  77. std::string err_str = "Change file mode failed," + file_path;
  78. return Status(StatusCode::kMDUnexpectedError, err_str);
  79. }
  80. return Status::OK();
  81. }
// Append one tracing record (thread-safe). Every RECORDS_PER_STEP-th record
// completes a step, and its timestamp is kept in ts_ for step<->time mapping.
void Tracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
                     const uint64_t time_stamp) {
  // Format: "type extra-info batch-num value"
  // type: 0: time, 1: connector size
  // extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
  //             if type is 1 - connector capacity
  // batch-num: batch number
  // value: if type is 0 - value is time(ms)
  //        if type is 1 - value is connector size
  // time-stamp: time stamp
  // Examples:
  // 0 0 20 10 xxx- The 20th batch took 10ms to get data from pipeline.
  // 1 64 20 5 xxx- Connector size is 5 when get the 20th batch.Connector capacity is 64.
  if (active_ == false) {
    // Node is stopped: silently drop the record.
    return;
  }
  TracingRecord record = {type, extra_info, batch_num, value, time_stamp};
  // lock_ guards records_, value_ and ts_ against concurrent readers
  // (TimeIntervalForStepRange / GetRecordEntryFieldValue).
  std::lock_guard<std::mutex> guard(lock_);
  (void)records_.emplace_back(record);
  (void)value_.emplace_back(record.ToString());
  // save timestamp per batch
  constexpr int32_t RECORDS_PER_STEP = 4;
  if (records_.size() % RECORDS_PER_STEP == 0) {
    (void)ts_.emplace_back(time_stamp);
  }
}
  108. Status Tracing::TimeIntervalForStepRange(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts) {
  109. std::lock_guard<std::mutex> guard(lock_);
  110. MS_LOG(DEBUG) << "start_step: " << start_step << " end_step: " << end_step;
  111. CHECK_FAIL_RETURN_UNEXPECTED(start_step > 0,
  112. "Expected start_step > 0. Got start_step: " + std::to_string(start_step));
  113. CHECK_FAIL_RETURN_UNEXPECTED(end_step >= start_step,
  114. "Expected end_step >= start_step. Got start_step: " + std::to_string(start_step) +
  115. " end_step: " + std::to_string(end_step));
  116. CHECK_FAIL_RETURN_UNEXPECTED(end_step < ts_.size(),
  117. "Expected end_step < ts_.size(). Got end_step: " + std::to_string(end_step) +
  118. " ts_.size: " + std::to_string(ts_.size()));
  119. // end timestamp of (start_step - 1) step
  120. *start_ts = ts_[start_step - 1];
  121. *end_ts = ts_[end_step];
  122. return Status::OK();
  123. }
// Convert a wall-clock interval [start_ts, end_ts) into the 1-based step range
// [*start_step, *end_step] it covers, by binary-searching the sorted per-step
// timestamps in ts_. Requires at least one completed step (ts_.size() > 1,
// since ts_[0] is the sentinel added by Init()).
Status Tracing::StepIntervalForTimeRange(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step) {
  CHECK_FAIL_RETURN_UNEXPECTED(start_ts < end_ts, "Expected start_ts < end_ts. Got start_ts: " +
                                                    std::to_string(start_ts) + " end_ts: " + std::to_string(end_ts));
  std::lock_guard<std::mutex> guard(lock_);
  CHECK_FAIL_RETURN_UNEXPECTED(ts_.size() > 1, "No tracing data available yet.");
  // find first ts that is not less than start_ts
  auto lower = std::lower_bound(ts_.begin(), ts_.end(), start_ts);
  CHECK_FAIL_RETURN_UNEXPECTED(lower != ts_.end(),
                               "No data available for time >= start_ts. start_ts: " + std::to_string(start_ts));
  // there is no 0th step. If start_ts == 0, then lower == ts_.begin()
  *start_step = std::max(1, static_cast<int32_t>(std::distance(ts_.begin(), lower)));
  // find first ts that is greater than end_ts
  auto upper = std::upper_bound(ts_.begin(), ts_.end(), end_ts);
  if (upper == ts_.end()) {
    // end_ts is beyond the last recorded step; clamp to the last step index.
    *end_step = std::max(1, static_cast<int32_t>(std::distance(ts_.begin(), upper) - 1));
  } else {
    *end_step = std::max(1, static_cast<int32_t>(std::distance(ts_.begin(), upper)));
  }
  return Status::OK();
}
  144. Status Tracing::GetRecordEntryFieldValue(int32_t start_step, int32_t end_step, int32_t record_offset,
  145. const std::string &field, std::vector<int32_t> *result) {
  146. std::lock_guard<std::mutex> guard(lock_);
  147. constexpr int32_t RECORDS_PER_STEP = 4;
  148. auto total_steps = records_.size() / RECORDS_PER_STEP;
  149. MS_LOG(DEBUG) << "start_step: " << start_step << " end_step: " << end_step;
  150. CHECK_FAIL_RETURN_UNEXPECTED(start_step <= total_steps,
  151. "Expected start_step <= total_steps. Got start_step: " + std::to_string(start_step) +
  152. " total_steps: " + std::to_string(total_steps));
  153. CHECK_FAIL_RETURN_UNEXPECTED(end_step <= total_steps,
  154. "Expected end_step <= total_steps. Got end_step: " + std::to_string(end_step) +
  155. " total_steps: " + std::to_string(total_steps));
  156. CHECK_FAIL_RETURN_UNEXPECTED(start_step <= end_step,
  157. "Expected start_step <= end_step. Got start_step: " + std::to_string(start_step) +
  158. " end_step: " + std::to_string(end_step));
  159. for (auto step_num = start_step; step_num <= end_step; step_num++) {
  160. auto idx = (step_num - 1) * RECORDS_PER_STEP + record_offset;
  161. if (field == "value") {
  162. (void)result->emplace_back(records_[idx].value);
  163. } else if (field == "extra_info") {
  164. (void)result->emplace_back(records_[idx].extra_info);
  165. } else {
  166. return {StatusCode::kMDUnexpectedError,
  167. "Received unexpected field: " + field + R"(. Expected: ["value", "extra_info"].)"};
  168. }
  169. }
  170. return Status::OK();
  171. }
// Per-step pipeline time (ms) for steps in [start_step, end_step].
Status Tracing::GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, PIPELINE_TIME_OFFSET, "value", result);
}
// Per-step push-to-device time (ms) for steps in [start_step, end_step].
Status Tracing::GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, PUSH_TIME_OFFSET, "value", result);
}
// Per-step batch time (ms) for steps in [start_step, end_step].
Status Tracing::GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, BATCH_TIME_OFFSET, "value", result);
}
// Per-step connector size ("value" of the connector-depth record).
Status Tracing::GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, CONNECTOR_DEPTH_OFFSET, "value", result);
}
// Per-step connector capacity ("extra_info" of the connector-depth record).
Status Tracing::GetConnectorCapacity(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, CONNECTOR_DEPTH_OFFSET, "extra_info", result);
}
  187. Status Tracing::GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq) {
  188. std::vector<int32_t> sizes;
  189. RETURN_IF_NOT_OK(GetConnectorSize(start_step, end_step, &sizes));
  190. int32_t total = end_step - start_step + 1;
  191. CHECK_FAIL_RETURN_UNEXPECTED(total > 0, "Start step is greater than end step.");
  192. uint32_t count = std::count(sizes.begin(), sizes.end(), 0);
  193. *empty_queue_freq = static_cast<float_t>(count) / static_cast<float_t>(total);
  194. return Status::OK();
  195. }
// Seed ts_ with a sentinel 0 so that step indexing into ts_ is 1-based
// (ts_[k] is the end timestamp of step k; see TimeIntervalForStepRange).
Status Tracing::Init() {
  (void)ts_.emplace_back(0);
  return Status::OK();
}
// Constructor: start in the UnBegun state with no tree attached and both
// profiling modes (autotune / explicit profiling) disabled.
ProfilingManager::ProfilingManager()
    : profiling_state_(ProfilingState::kProfilingStateUnBegun), tree_(nullptr), autotuning_(false), profiling_(false) {}
// Profiling counts as enabled when the profiler tree state reports enabled,
// regardless of whether the given tree has been registered yet.
bool ProfilingManager::IsProfilingEnable(const ExecutionTree *tree) const {
  auto external_state = GetProfilerTreeState(tree);
  return (external_state == kEnabledTreeNotRegistered || external_state == kEnabledTreeRegistered);
}
// Attach an execution tree to the profiler and register the sampling nodes
// that watch it. Exactly one tree may be registered at a time, and the
// profiler must have been initialized (autotune or profiling mode) first.
Status ProfilingManager::RegisterTree(TreeAdapter *tree_adapter) {
  CHECK_FAIL_RETURN_UNEXPECTED(tree_ == nullptr, "Another tree is already registered.");
  CHECK_FAIL_RETURN_UNEXPECTED((autotuning_ || profiling_) == true,
                               "MD Profiler is disabled. Cannot register the tree.");
  // Non-owning pointer: the tree remains owned by the TreeAdapter.
  tree_ = tree_adapter->tree_.get();
  MS_LOG(INFO) << "Registering tree: " + tree_->GetUniqueId();
  perf_monitor_ = std::make_unique<Monitor>(this);
  // Register all sampling nodes here.
  // Tracing node registration is the responsibility of the Consumer
  std::shared_ptr<Sampling> connector_size_sampling = std::make_shared<ConnectorSize>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(connector_size_sampling));
#ifndef ENABLE_ANDROID
  std::shared_ptr<Sampling> cpu_sampler = std::make_shared<CpuSampler>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(cpu_sampler));
#endif
  // Seed the epoch bookkeeping with sentinel entries so that epoch indexing is
  // 1-based and RecordEndOfEpoch
  // can insert a correct timestamp so that we can ignore the samples that were taken
  // during start up of the pipeline.
  (void)epoch_end_ts_.emplace_back(0);
  (void)epoch_end_step_.emplace_back(0);
  return Status::OK();
}
// Launch monitoring thread.
// Spawns the perf monitor as an async task on the registered tree's task
// group; RegisterTree must have been called first (tree_ and perf_monitor_
// are set there).
Status ProfilingManager::LaunchMonitor() {
  RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("Monitor Thread launched", std::ref(*perf_monitor_)));
  return Status::OK();
}
  233. // Profiling node registration
  234. Status ProfilingManager::RegisterTracingNode(std::shared_ptr<Tracing> node) {
  235. // Check if node with the same name has already been registered.
  236. auto exist = tracing_nodes_.find(node->Name());
  237. if (exist != tracing_nodes_.end()) {
  238. return Status(StatusCode::kMDProfilingError, "Profiling node already exist: " + node->Name());
  239. }
  240. // Register the node with its name as key.
  241. RETURN_IF_NOT_OK(node->Init());
  242. tracing_nodes_[node->Name()] = node;
  243. // the user may have already started profiling.
  244. if (profiling_state_ == ProfilingState::kProfilingStateRunning) {
  245. RETURN_IF_NOT_OK(node->Start());
  246. }
  247. return Status::OK();
  248. }
  249. // Profiling node getter
  250. Status ProfilingManager::GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node) {
  251. // Check if node with the same name has already been registered.
  252. auto exist = tracing_nodes_.find(name);
  253. if (exist == tracing_nodes_.end()) {
  254. return Status(StatusCode::kMDProfilingError, "Profiling node does not exist: " + name);
  255. }
  256. // Fetch node.
  257. *node = tracing_nodes_[name];
  258. return Status::OK();
  259. }
  260. // Profiling node registration
  261. Status ProfilingManager::RegisterSamplingNode(std::shared_ptr<Sampling> node) {
  262. // Check if node with the same name has already been registered.
  263. auto exist = sampling_nodes_.find(node->Name());
  264. if (exist != sampling_nodes_.end()) {
  265. return Status(StatusCode::kMDProfilingError, "Profiling node already exist: " + node->Name());
  266. }
  267. // Register the node with its name as key.
  268. RETURN_IF_NOT_OK(node->Init());
  269. sampling_nodes_[node->Name()] = node;
  270. // the user may have already started profiling.
  271. if (profiling_state_ == ProfilingState::kProfilingStateRunning) {
  272. RETURN_IF_NOT_OK(node->Start());
  273. }
  274. return Status::OK();
  275. }
  276. // Profiling node getter
  277. Status ProfilingManager::GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node) {
  278. // Check if node with the same name has already been registered.
  279. auto exist = sampling_nodes_.find(name);
  280. if (exist == sampling_nodes_.end()) {
  281. return Status(StatusCode::kMDProfilingError, "Profiling node does not exist: " + name);
  282. }
  283. // Fetch node.
  284. *node = sampling_nodes_[name];
  285. return Status::OK();
  286. }
  287. Status ProfilingManager::SaveProfilingData(const std::string &dir_path, const std::string &rank_id) {
  288. MS_LOG(INFO) << "Start to save profiling data.";
  289. for (auto node : tracing_nodes_) {
  290. RETURN_IF_NOT_OK(node.second->SaveToFile(dir_path, rank_id));
  291. }
  292. for (auto node : sampling_nodes_) {
  293. RETURN_IF_NOT_OK(node.second->SaveToFile(dir_path, rank_id));
  294. }
  295. MS_LOG(INFO) << "Save profiling data end.";
  296. return Status::OK();
  297. }
  298. Status ProfilingManager::ChangeFileMode(const std::string &dir_path, const std::string &rank_id) {
  299. MS_LOG(INFO) << "Start to change file mode.";
  300. for (auto node : tracing_nodes_) {
  301. RETURN_IF_NOT_OK(node.second->ChangeFileMode(dir_path, rank_id));
  302. }
  303. for (auto node : sampling_nodes_) {
  304. RETURN_IF_NOT_OK(node.second->ChangeFileMode(dir_path, rank_id));
  305. }
  306. MS_LOG(INFO) << "Change file mode end.";
  307. return Status::OK();
  308. }
  309. #ifndef ENABLE_ANDROID
  310. Status ProfilingManager::GetUserCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result) {
  311. uint64_t start_ts, end_ts;
  312. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  313. return GetUserCpuUtilByTime(start_ts, end_ts, result);
  314. }
  315. Status ProfilingManager::GetUserCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result) {
  316. uint64_t start_ts, end_ts;
  317. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  318. return GetUserCpuUtilByTime(start_ts, end_ts, result);
  319. }
  320. Status ProfilingManager::GetUserCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result) {
  321. std::shared_ptr<Sampling> sampling_node;
  322. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  323. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  324. return node->GetSystemUserCpuUtil(start_ts, end_ts, result);
  325. }
  326. Status ProfilingManager::GetSysCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result) {
  327. uint64_t start_ts, end_ts;
  328. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  329. return GetSysCpuUtilByTime(start_ts, end_ts, result);
  330. }
  331. Status ProfilingManager::GetSysCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result) {
  332. uint64_t start_ts, end_ts;
  333. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  334. return GetSysCpuUtilByTime(start_ts, end_ts, result);
  335. }
  336. Status ProfilingManager::GetSysCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result) {
  337. std::shared_ptr<Sampling> sampling_node;
  338. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  339. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  340. return node->GetSystemSysCpuUtil(start_ts, end_ts, result);
  341. }
  342. Status ProfilingManager::GetUserCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result) {
  343. uint64_t start_ts, end_ts;
  344. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  345. return GetUserCpuUtilByTime(op_id, start_ts, end_ts, result);
  346. }
  347. Status ProfilingManager::GetUserCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step,
  348. std::vector<uint16_t> *result) {
  349. uint64_t start_ts, end_ts;
  350. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  351. return GetUserCpuUtilByTime(op_id, start_ts, end_ts, result);
  352. }
  353. Status ProfilingManager::GetUserCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts,
  354. std::vector<uint16_t> *result) {
  355. std::shared_ptr<Sampling> sampling_node;
  356. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  357. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  358. return node->GetOpUserCpuUtil(op_id, start_ts, end_ts, result);
  359. }
  360. Status ProfilingManager::GetSysCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result) {
  361. uint64_t start_ts, end_ts;
  362. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  363. return GetSysCpuUtilByTime(op_id, start_ts, end_ts, result);
  364. }
  365. Status ProfilingManager::GetSysCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step,
  366. std::vector<uint16_t> *result) {
  367. uint64_t start_ts, end_ts;
  368. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  369. return GetSysCpuUtilByTime(op_id, start_ts, end_ts, result);
  370. }
  371. Status ProfilingManager::GetSysCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts,
  372. std::vector<uint16_t> *result) {
  373. std::shared_ptr<Sampling> sampling_node;
  374. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  375. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  376. return node->GetOpSysCpuUtil(op_id, start_ts, end_ts, result);
  377. }
  378. #endif
  379. Status ProfilingManager::EpochToTimeInterval(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts) {
  380. if (epoch_num <= 0 || epoch_num >= epoch_end_ts_.size()) {
  381. std::string err = "Epoch: " + std::to_string(epoch_num) + " is invalid.";
  382. MS_LOG(INFO) << err;
  383. return {StatusCode::kMDUnexpectedError, err};
  384. }
  385. *start_ts = epoch_end_ts_[epoch_num - 1];
  386. *end_ts = epoch_end_ts_[epoch_num];
  387. return Status::OK();
  388. }
  389. Status ProfilingManager::EpochToStepInterval(int32_t epoch_num, uint32_t *start_step, uint32_t *end_step) {
  390. if (epoch_num <= 0 || epoch_num >= epoch_end_step_.size()) {
  391. std::string err = "Epoch: " + std::to_string(epoch_num) + " is invalid.";
  392. return {StatusCode::kMDUnexpectedError, err};
  393. }
  394. *start_step = epoch_end_step_[epoch_num - 1] + 1;
  395. *end_step = epoch_end_step_[epoch_num];
  396. return Status::OK();
  397. }
  398. Status ProfilingManager::StepToTimeInterval(int32_t start_step, int32_t end_step, uint64_t *start_ts,
  399. uint64_t *end_ts) {
  400. std::shared_ptr<Tracing> node;
  401. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  402. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  403. return node->TimeIntervalForStepRange(start_step, end_step, start_ts, end_ts);
  404. } else {
  405. return {StatusCode::kMDUnexpectedError,
  406. "Cannot find appropriate tracing node to convert step range to time interval."};
  407. }
  408. }
  409. Status ProfilingManager::TimeToStepInterval(uint64_t start_ts, uint64_t end_ts, int32_t *start_step,
  410. int32_t *end_step) {
  411. std::shared_ptr<Tracing> node;
  412. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  413. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  414. return node->StepIntervalForTimeRange(start_ts, end_ts, start_step, end_step);
  415. } else {
  416. return {StatusCode::kMDUnexpectedError,
  417. "Cannot find appropriate tracing node to convert step range to time interval."};
  418. }
  419. }
  420. Status ProfilingManager::GetConnectorSizeByEpoch(int32_t op_id, int32_t epoch_num, std::vector<int32_t> *result) {
  421. uint64_t start_ts, end_ts;
  422. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  423. return GetConnectorSizeByTime(op_id, start_ts, end_ts, result);
  424. }
  425. Status ProfilingManager::GetConnectorSizeByStep(int32_t op_id, int32_t start_step, int32_t end_step,
  426. std::vector<int32_t> *result) {
  427. uint64_t start_ts, end_ts;
  428. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  429. return GetConnectorSizeByTime(op_id, start_ts, end_ts, result);
  430. }
  431. Status ProfilingManager::GetConnectorSizeByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts,
  432. std::vector<int32_t> *result) {
  433. std::shared_ptr<Sampling> node;
  434. RETURN_IF_NOT_OK(GetSamplingNode(kConnectorSizeSamplingName, &node));
  435. auto connector_node = std::dynamic_pointer_cast<ConnectorSize>(node);
  436. return connector_node->GetOpConnectorSize(op_id, start_ts, end_ts, result);
  437. }
  438. Status ProfilingManager::GetPipelineTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  439. uint32_t start_step, end_step;
  440. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  441. return GetPipelineTimeByStep(start_step, end_step, result);
  442. }
  443. Status ProfilingManager::GetPipelineTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  444. std::shared_ptr<Tracing> node;
  445. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  446. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  447. return node->GetPipelineTime(start_step, end_step, result);
  448. } else {
  449. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  450. }
  451. }
  452. Status ProfilingManager::GetPipelineTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  453. int32_t start_step, end_step;
  454. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  455. return GetPipelineTimeByStep(start_step, end_step, result);
  456. }
  457. Status ProfilingManager::GetPushTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  458. uint32_t start_step, end_step;
  459. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  460. return GetPushTimeByStep(start_step, end_step, result);
  461. }
  462. Status ProfilingManager::GetPushTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  463. std::shared_ptr<Tracing> node;
  464. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  465. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  466. return node->GetPushTime(start_step, end_step, result);
  467. } else {
  468. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  469. }
  470. }
  471. Status ProfilingManager::GetPushTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  472. int32_t start_step, end_step;
  473. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  474. return GetPushTimeByStep(start_step, end_step, result);
  475. }
  476. Status ProfilingManager::GetBatchTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  477. uint32_t start_step, end_step;
  478. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  479. return GetBatchTimeByStep(start_step, end_step, result);
  480. }
  481. Status ProfilingManager::GetBatchTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  482. std::shared_ptr<Tracing> node;
  483. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  484. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  485. return node->GetBatchTime(start_step, end_step, result);
  486. } else {
  487. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  488. }
  489. }
  490. Status ProfilingManager::GetBatchTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  491. int32_t start_step, end_step;
  492. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  493. return GetBatchTimeByStep(start_step, end_step, result);
  494. }
  495. Status ProfilingManager::GetConnectorSizeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  496. uint32_t start_step, end_step;
  497. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  498. return GetConnectorSizeByStep(start_step, end_step, result);
  499. }
  500. Status ProfilingManager::GetConnectorSizeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  501. std::shared_ptr<Tracing> node;
  502. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  503. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  504. return node->GetConnectorSize(start_step, end_step, result);
  505. } else {
  506. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  507. }
  508. }
  509. Status ProfilingManager::GetConnectorSizeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  510. int32_t start_step, end_step;
  511. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  512. return GetConnectorSizeByStep(start_step, end_step, result);
  513. }
  514. Status ProfilingManager::GetEmptyQueueFrequencyByEpoch(int32_t epoch_num, float_t *result) {
  515. uint32_t start_step, end_step;
  516. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  517. return GetEmptyQueueFrequencyByStep(start_step, end_step, result);
  518. }
  519. Status ProfilingManager::GetEmptyQueueFrequencyByStep(int32_t start_step, int32_t end_step, float_t *result) {
  520. std::shared_ptr<Tracing> node;
  521. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  522. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  523. return node->GetEmptyQueueFrequency(start_step, end_step, result);
  524. } else {
  525. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  526. }
  527. }
  528. Status ProfilingManager::GetEmptyQueueFrequencyByTime(uint64_t start_ts, uint64_t end_ts, float_t *result) {
  529. int32_t start_step, end_step;
  530. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  531. return GetEmptyQueueFrequencyByStep(start_step, end_step, result);
  532. }
  533. Status ProfilingManager::GetConnectorCapacityByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  534. uint32_t start_step, end_step;
  535. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  536. return GetConnectorCapacityByStep(start_step, end_step, result);
  537. }
  538. Status ProfilingManager::GetConnectorCapacityByStep(int32_t start_step, int32_t end_step,
  539. std::vector<int32_t> *result) {
  540. std::shared_ptr<Tracing> node;
  541. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  542. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  543. return node->GetConnectorCapacity(start_step, end_step, result);
  544. } else {
  545. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  546. }
  547. }
  548. Status ProfilingManager::GetConnectorCapacityByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  549. int32_t start_step, end_step;
  550. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  551. return GetConnectorCapacityByStep(start_step, end_step, result);
  552. }
  553. void ProfilingManager::RecordEndOfEpoch(uint32_t step_num) {
  554. if (profiling_state_ != ProfilingState::kProfilingStateRunning) {
  555. return;
  556. }
  557. MS_LOG(INFO) << "Recording end of epoch. step_num: " << step_num;
  558. (void)epoch_end_ts_.emplace_back(ProfilingTime::GetCurMilliSecond());
  559. (void)epoch_end_step_.emplace_back(step_num);
  560. }
  561. Status ProfilingManager::Reset() {
  562. tracing_nodes_.clear();
  563. sampling_nodes_.clear();
  564. epoch_end_ts_.clear();
  565. epoch_end_step_.clear();
  566. perf_monitor_.reset();
  567. tree_ = nullptr;
  568. profiling_state_ = ProfilingState::kProfilingStateUnBegun;
  569. autotuning_ = false;
  570. profiling_ = false;
  571. return Status::OK();
  572. }
  573. Status ProfilingManager::Init(const bool for_autotune) {
  574. // Reinitialization should only be done in case of UT with sequential pipelines and should not be used externally.
  575. // Reinitialization with parallel data pipelines can have unexpected consequences.
  576. CHECK_FAIL_RETURN_UNEXPECTED(!autotuning_, "Stop MD Autotune before initializing the MD Profiler.");
  577. CHECK_FAIL_RETURN_UNEXPECTED(!profiling_, "Stop MD Profiler before initializing it.");
  578. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ != ProfilingState::kProfilingStateRunning,
  579. "Stop MD Profiler before reinitializing it.");
  580. Reset();
  581. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ == ProfilingState::kProfilingStateUnBegun,
  582. "MD Profiler is in an unexpected state.");
  583. if (for_autotune) {
  584. autotuning_ = true;
  585. MS_LOG(INFO) << "MD profiler is initialized successfully for autotuning.";
  586. } else {
  587. profiling_ = true;
  588. MS_LOG(INFO) << "MD profiler is initialized successfully for profiling.";
  589. }
  590. return Status::OK();
  591. }
  592. Status ProfilingManager::Start() {
  593. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ != ProfilingState::kProfilingStateRunning,
  594. "MD ProfilingManager is already running.");
  595. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ != ProfilingState::kProfilingStateFinished,
  596. "MD ProfilingManager is already finished.");
  597. profiling_state_ = ProfilingState::kProfilingStateRunning;
  598. for (const auto &node : tracing_nodes_) {
  599. RETURN_IF_NOT_OK(node.second->Start());
  600. }
  601. for (const auto &node : sampling_nodes_) {
  602. RETURN_IF_NOT_OK(node.second->Start());
  603. }
  604. MS_LOG(INFO) << "MD profiler is started.";
  605. return Status::OK();
  606. }
  607. Status ProfilingManager::Stop() {
  608. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ != ProfilingState::kProfilingStateUnBegun,
  609. "MD ProfilingManager has not started yet.");
  610. // It's OK if we are in kProfilingStateFinished state. We allow user to call Stop twice.
  611. if (profiling_state_ == ProfilingState::kProfilingStateFinished) {
  612. MS_LOG(WARNING) << "MD ProfilingManager had already stopped.";
  613. return Status::OK();
  614. }
  615. for (const auto &node : tracing_nodes_) {
  616. RETURN_IF_NOT_OK(node.second->Stop());
  617. }
  618. for (const auto &node : sampling_nodes_) {
  619. RETURN_IF_NOT_OK(node.second->Stop());
  620. }
  621. profiling_state_ = ProfilingState::kProfilingStateFinished;
  622. if (autotuning_) {
  623. autotuning_ = false;
  624. MS_LOG(INFO) << "MD Autotune is stopped.";
  625. }
  626. if (profiling_) {
  627. profiling_ = false;
  628. MS_LOG(INFO) << "MD Profiler is stopped.";
  629. }
  630. return Status::OK();
  631. }
  632. Status ProfilingManager::Save(const std::string &profile_data_path) {
  633. // Validate input profile data path
  634. CHECK_FAIL_RETURN_UNEXPECTED(!profile_data_path.empty(), "Invalid parameter, Profiling directory is not set.");
  635. CHECK_FAIL_RETURN_UNEXPECTED(profile_data_path.size() < PATH_MAX, "Invalid file, Profiling directory is invalid.");
  636. // profiling file: <profile_data_path>/filename_rank_id.suffix
  637. char real_path[PATH_MAX] = {0};
  638. #if defined(_WIN32) || defined(_WIN64)
  639. if (_fullpath(real_path, common::SafeCStr(profile_data_path), PATH_MAX) == nullptr) {
  640. RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
  641. }
  642. #else
  643. if (realpath(common::SafeCStr(profile_data_path), real_path) == nullptr) {
  644. RETURN_STATUS_UNEXPECTED("Invalid file, can not get realpath of Profiling directory.");
  645. }
  646. #endif
  647. std::string rank_id;
  648. #ifdef ENABLE_GPUQUE
  649. std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  650. int32_t rank_id_int = cfg->rank_id();
  651. // If DEVICE_ID is not set, default value is 0
  652. if (rank_id_int < 0) {
  653. rank_id = common::GetEnv("DEVICE_ID");
  654. } else {
  655. rank_id = std::to_string(rank_id_int);
  656. }
  657. #else
  658. rank_id = common::GetEnv("RANK_ID");
  659. #endif
  660. // If RANK_ID is not set, default value is 0
  661. if (rank_id.empty()) {
  662. rank_id = "0";
  663. }
  664. // Output all profiling data upon request.
  665. RETURN_IF_NOT_OK(SaveProfilingData(std::string(profile_data_path), rank_id));
  666. RETURN_IF_NOT_OK(ChangeFileMode(std::string(profile_data_path), rank_id));
  667. return Status::OK();
  668. }
  669. ProfilingManager::ProfilingRegistrationState ProfilingManager::GetProfilerTreeState(const ExecutionTree *tree) const {
  670. auto enabled = (profiling_ || autotuning_);
  671. if (!enabled) return kNotEnabled;
  672. if (tree_ == nullptr) {
  673. return kEnabledTreeNotRegistered;
  674. } else {
  675. return tree_ == tree ? kEnabledTreeRegistered : kEnabledDifferentTreeRegistered;
  676. }
  677. }
  678. uint64_t ProfilingTime::GetCurMilliSecond() {
  679. // because cpplint does not allow using namespace
  680. using std::chrono::duration_cast;
  681. using std::chrono::milliseconds;
  682. using std::chrono::steady_clock;
  683. return static_cast<uint64_t>(duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count());
  684. }
  685. } // namespace dataset
  686. } // namespace mindspore