You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

profiling.cc 37 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848
  1. /**
  2. * Copyright 2020-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "minddata/dataset/engine/perf/profiling.h"
  17. #include <sys/stat.h>
  18. #include <cstdlib>
  19. #include <fstream>
  20. #include <algorithm>
  21. #include "utils/ms_utils.h"
  22. #include "minddata/dataset/util/path.h"
  23. #ifdef ENABLE_GPUQUE
  24. #include "minddata/dataset/core/config_manager.h"
  25. #include "minddata/dataset/core/global_context.h"
  26. #endif
  27. #include "minddata/dataset/engine/perf/monitor.h"
  28. #include "minddata/dataset/engine/perf/connector_size.h"
  29. #include "minddata/dataset/engine/perf/cpu_sampler.h"
  30. #include "minddata/dataset/engine/execution_tree.h"
  31. #include "minddata/dataset/engine/tree_adapter.h"
  32. #include "minddata/dataset/util/log_adapter.h"
  33. namespace mindspore {
  34. namespace dataset {
// Offsets of the four tracing records emitted per step; used by the
// GetRecordEntryFieldValue-based getters to pick one record out of each
// group of RECORDS_PER_STEP records.
constexpr int32_t PUSH_TIME_OFFSET = 0;        // record used by GetPushTime
constexpr int32_t BATCH_TIME_OFFSET = 1;       // record used by GetBatchTime
constexpr int32_t PIPELINE_TIME_OFFSET = 2;    // record used by GetPipelineTime
constexpr int32_t CONNECTOR_DEPTH_OFFSET = 3;  // record used by GetConnectorSize/Capacity
  39. Status Profiling::Start() {
  40. CHECK_FAIL_RETURN_UNEXPECTED(active_ == false, "Profiling node is already active.");
  41. active_ = true;
  42. return Status::OK();
  43. }
  44. Status Profiling::Stop() {
  45. CHECK_FAIL_RETURN_UNEXPECTED(active_ == true, "Profiling node is already deactivated.");
  46. active_ = false;
  47. return Status::OK();
  48. }
  49. Status Tracing::SaveToFile(const std::string &dir_path, const std::string &rank_id) {
  50. if (value_.empty()) {
  51. return Status::OK();
  52. }
  53. Path path = GetFileName(dir_path, rank_id);
  54. // Remove the file if it exists (from prior profiling usage)
  55. RETURN_IF_NOT_OK(path.Remove());
  56. std::string file_path = path.ToString();
  57. MS_LOG(INFO) << "Start to save profiling data for a tracing node.";
  58. std::ofstream handle(file_path, std::ios::trunc);
  59. if (!handle.is_open()) {
  60. RETURN_STATUS_UNEXPECTED("Profiling file can not be opened.");
  61. }
  62. for (const auto &value : value_) {
  63. handle << value << "\n";
  64. }
  65. handle.close();
  66. return Status::OK();
  67. }
  68. Status Tracing::ChangeFileMode(const std::string &dir_path, const std::string &rank_id) {
  69. if (value_.empty()) {
  70. return Status::OK();
  71. }
  72. Path path = GetFileName(dir_path, rank_id);
  73. std::string file_path = path.ToString();
  74. if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) {
  75. std::string err_str = "Change file mode failed," + file_path;
  76. return Status(StatusCode::kMDUnexpectedError, err_str);
  77. }
  78. return Status::OK();
  79. }
// Append one tracing record (thread-safe; dropped when the node is inactive).
// Format: "type extra-info batch-num value time-stamp"
//   type: 0: time, 1: connector size
//   extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
//               if type is 1 - connector capacity
//   batch-num: batch number
//   value: if type is 0 - value is time(ms)
//          if type is 1 - value is connector size
//   time-stamp: time stamp
// Examples:
//   0 0 20 10 xxx - The 20th batch took 10ms to get data from pipeline.
//   1 64 20 5 xxx - Connector size is 5 when get the 20th batch. Connector capacity is 64.
void Tracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
                     const uint64_t time_stamp) {
  if (!active_) {
    return;
  }
  TracingRecord record = {type, extra_info, batch_num, value, time_stamp};
  std::lock_guard<std::mutex> guard(lock_);
  (void)records_.emplace_back(record);
  (void)value_.emplace_back(record.ToString());
  // save timestamp per batch: each step emits RECORDS_PER_STEP records, so the
  // timestamp of every 4th record marks the end of a step.
  const constexpr int32_t RECORDS_PER_STEP = 4;
  if (records_.size() % RECORDS_PER_STEP == 0) {
    (void)ts_.emplace_back(time_stamp);
  }
}
  106. Status Tracing::TimeIntervalForStepRange(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts) {
  107. std::lock_guard<std::mutex> guard(lock_);
  108. MS_LOG(DEBUG) << "start_step: " << start_step << " end_step: " << end_step;
  109. CHECK_FAIL_RETURN_UNEXPECTED(start_step > 0,
  110. "Expected start_step > 0. Got start_step: " + std::to_string(start_step));
  111. CHECK_FAIL_RETURN_UNEXPECTED(end_step >= start_step,
  112. "Expected end_step >= start_step. Got start_step: " + std::to_string(start_step) +
  113. " end_step: " + std::to_string(end_step));
  114. CHECK_FAIL_RETURN_UNEXPECTED(end_step < ts_.size(),
  115. "Expected end_step < ts_.size(). Got end_step: " + std::to_string(end_step) +
  116. " ts_.size: " + std::to_string(ts_.size()));
  117. // end timestamp of (start_step - 1) step
  118. *start_ts = ts_[start_step - 1];
  119. *end_ts = ts_[end_step];
  120. return Status::OK();
  121. }
// Convert a timestamp interval [start_ts, end_ts] into an inclusive step range
// [*start_step, *end_step] by binary-searching the per-step end timestamps.
// @param start_ts  Interval start; must be strictly less than end_ts.
// @param end_ts    Interval end.
// @param start_step Out: first step whose end timestamp is >= start_ts (min 1).
// @param end_step   Out: last step covered by the interval (min 1).
// @return Status error when the interval is empty/inverted or no data exists.
Status Tracing::StepIntervalForTimeRange(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step) {
  CHECK_FAIL_RETURN_UNEXPECTED(start_ts < end_ts, "Expected start_ts < end_ts. Got start_ts: " +
                                                    std::to_string(start_ts) + " end_ts: " + std::to_string(end_ts));
  std::lock_guard<std::mutex> guard(lock_);
  // ts_ always holds a leading sentinel (see Init), so > 1 means real data.
  CHECK_FAIL_RETURN_UNEXPECTED(ts_.size() > 1, "No tracing data available yet.");
  // find first ts that is not less than start_ts
  auto lower = std::lower_bound(ts_.begin(), ts_.end(), start_ts);
  CHECK_FAIL_RETURN_UNEXPECTED(lower != ts_.end(),
                               "No data available for time >= start_ts. start_ts: " + std::to_string(start_ts));
  // there is no 0th step. If start_ts == 0, then lower == ts_.begin()
  *start_step = std::max(1, static_cast<int32_t>(std::distance(ts_.begin(), lower)));
  // find first ts that is greater than end_ts
  auto upper = std::upper_bound(ts_.begin(), ts_.end(), end_ts);
  if (upper == ts_.end()) {
    // interval extends past the last recorded step: clamp to the last step.
    *end_step = std::max(1, static_cast<int32_t>(std::distance(ts_.begin(), upper) - 1));
  } else {
    *end_step = std::max(1, static_cast<int32_t>(std::distance(ts_.begin(), upper)));
  }
  return Status::OK();
}
  142. Status Tracing::GetRecordEntryFieldValue(int32_t start_step, int32_t end_step, int32_t record_offset,
  143. const std::string &field, std::vector<int32_t> *result) {
  144. std::lock_guard<std::mutex> guard(lock_);
  145. const constexpr int32_t RECORDS_PER_STEP = 4;
  146. auto total_steps = records_.size() / RECORDS_PER_STEP;
  147. MS_LOG(DEBUG) << "start_step: " << start_step << " end_step: " << end_step;
  148. CHECK_FAIL_RETURN_UNEXPECTED(start_step <= total_steps,
  149. "Expected start_step <= total_steps. Got start_step: " + std::to_string(start_step) +
  150. " total_steps: " + std::to_string(total_steps));
  151. CHECK_FAIL_RETURN_UNEXPECTED(end_step <= total_steps,
  152. "Expected end_step <= total_steps. Got end_step: " + std::to_string(end_step) +
  153. " total_steps: " + std::to_string(total_steps));
  154. CHECK_FAIL_RETURN_UNEXPECTED(start_step <= end_step,
  155. "Expected start_step <= end_step. Got start_step: " + std::to_string(start_step) +
  156. " end_step: " + std::to_string(end_step));
  157. for (auto step_num = start_step; step_num <= end_step; step_num++) {
  158. auto idx = (step_num - 1) * RECORDS_PER_STEP + record_offset;
  159. if (field == "value") {
  160. (void)result->emplace_back(records_[idx].value);
  161. } else if (field == "extra_info") {
  162. (void)result->emplace_back(records_[idx].extra_info);
  163. } else {
  164. return {StatusCode::kMDUnexpectedError,
  165. "Received unexpected field: " + field + R"(. Expected: ["value", "extra_info"].)"};
  166. }
  167. }
  168. return Status::OK();
  169. }
// Per-step pipeline time (ms) over [start_step, end_step].
Status Tracing::GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, PIPELINE_TIME_OFFSET, "value", result);
}
// Per-step push time (ms) over [start_step, end_step].
Status Tracing::GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, PUSH_TIME_OFFSET, "value", result);
}
// Per-step batch time (ms) over [start_step, end_step].
Status Tracing::GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, BATCH_TIME_OFFSET, "value", result);
}
// Per-step connector size over [start_step, end_step].
Status Tracing::GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, CONNECTOR_DEPTH_OFFSET, "value", result);
}
// Per-step connector capacity (stored in the record's extra_info field).
Status Tracing::GetConnectorCapacity(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntryFieldValue(start_step, end_step, CONNECTOR_DEPTH_OFFSET, "extra_info", result);
}
  185. Status Tracing::GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq) {
  186. std::vector<int32_t> sizes;
  187. RETURN_IF_NOT_OK(GetConnectorSize(start_step, end_step, &sizes));
  188. int32_t total = end_step - start_step + 1;
  189. CHECK_FAIL_RETURN_UNEXPECTED(total > 0, "Start step is greater than end step.");
  190. uint32_t count = std::count(sizes.begin(), sizes.end(), 0);
  191. *empty_queue_freq = static_cast<float_t>(count) / static_cast<float_t>(total);
  192. return Status::OK();
  193. }
  194. Status Tracing::Init() {
  195. (void)ts_.emplace_back(0);
  196. return Status::OK();
  197. }
  198. size_t Tracing::GetNumberSteps() { return ts_.size(); }
  199. void Tracing::Clear() {
  200. value_.clear();
  201. records_.clear();
  202. ts_.clear();
  203. }
// Constructor: profiling starts in the "not begun" state with no tree
// registered and both autotune and profiling collection disabled.
ProfilingManager::ProfilingManager()
    : profiling_state_(ProfilingState::kProfilingStateUnBegun), tree_(nullptr), autotuning_(false), profiling_(false) {}
  207. bool ProfilingManager::IsProfilingEnable(const ExecutionTree *tree) const {
  208. auto external_state = GetProfilerTreeState(tree);
  209. return (external_state == kEnabledTreeNotRegistered || external_state == kEnabledTreeRegistered);
  210. }
// Bind this manager to an execution tree and register the sampling nodes.
// Fails when a tree is already registered or the profiler is disabled.
// @param tree_adapter Adapter owning the execution tree to profile.
Status ProfilingManager::RegisterTree(TreeAdapter *tree_adapter) {
  CHECK_FAIL_RETURN_UNEXPECTED(tree_ == nullptr, "Another tree is already registered.");
  CHECK_FAIL_RETURN_UNEXPECTED((autotuning_ || profiling_) == true,
                               "MD Profiler is disabled. Cannot register the tree.");
  // Non-owning pointer; the adapter keeps ownership of the tree.
  tree_ = tree_adapter->tree_.get();
  MS_LOG(INFO) << "Registering tree: " + tree_->GetUniqueId();
  perf_monitor_ = std::make_unique<Monitor>(this);
  // Register all sampling nodes here.
  // Tracing node registration is the responsibility of the Consumer
  std::shared_ptr<Sampling> connector_size_sampling = std::make_shared<ConnectorSize>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(connector_size_sampling));
#ifndef ENABLE_ANDROID
  std::shared_ptr<Sampling> cpu_sampler = std::make_shared<CpuSampler>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(cpu_sampler));
#endif
  // can insert a correct timestamp so that we can ignore the samples that were taken
  // during start up of the pipeline.
  (void)epoch_end_ts_.emplace_back(0);
  (void)epoch_end_step_.emplace_back(0);
  return Status::OK();
}
  232. // Launch monitoring thread.
  233. Status ProfilingManager::LaunchMonitor() {
  234. RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("Monitor Thread launched", std::ref(*perf_monitor_)));
  235. return Status::OK();
  236. }
  237. // Profiling node registration
  238. Status ProfilingManager::RegisterTracingNode(const std::shared_ptr<Tracing> &node) {
  239. // Check if node with the same name has already been registered.
  240. auto exist = tracing_nodes_.find(node->Name());
  241. if (exist != tracing_nodes_.end()) {
  242. return Status(StatusCode::kMDProfilingError, "Profiling node already exist: " + node->Name());
  243. }
  244. // Register the node with its name as key.
  245. RETURN_IF_NOT_OK(node->Init());
  246. tracing_nodes_[node->Name()] = node;
  247. // the user may have already started profiling.
  248. if (profiling_state_ == ProfilingState::kProfilingStateRunning) {
  249. RETURN_IF_NOT_OK(node->Start());
  250. }
  251. return Status::OK();
  252. }
  253. // Profiling node getter
  254. Status ProfilingManager::GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node) {
  255. // Check if node with the same name has already been registered.
  256. auto exist = tracing_nodes_.find(name);
  257. if (exist == tracing_nodes_.end()) {
  258. return Status(StatusCode::kMDProfilingError, "Profiling node does not exist: " + name);
  259. }
  260. // Fetch node.
  261. *node = tracing_nodes_[name];
  262. return Status::OK();
  263. }
  264. // Profiling node registration
  265. Status ProfilingManager::RegisterSamplingNode(const std::shared_ptr<Sampling> &node) {
  266. // Check if node with the same name has already been registered.
  267. auto exist = sampling_nodes_.find(node->Name());
  268. if (exist != sampling_nodes_.end()) {
  269. return Status(StatusCode::kMDProfilingError, "Profiling node already exist: " + node->Name());
  270. }
  271. // Register the node with its name as key.
  272. RETURN_IF_NOT_OK(node->Init());
  273. sampling_nodes_[node->Name()] = node;
  274. // the user may have already started profiling.
  275. if (profiling_state_ == ProfilingState::kProfilingStateRunning) {
  276. RETURN_IF_NOT_OK(node->Start());
  277. }
  278. return Status::OK();
  279. }
  280. // Profiling node getter
  281. Status ProfilingManager::GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node) {
  282. // Check if node with the same name has already been registered.
  283. auto exist = sampling_nodes_.find(name);
  284. if (exist == sampling_nodes_.end()) {
  285. return Status(StatusCode::kMDProfilingError, "Profiling node does not exist: " + name);
  286. }
  287. // Fetch node.
  288. *node = sampling_nodes_[name];
  289. return Status::OK();
  290. }
  291. Status ProfilingManager::SaveProfilingData(const std::string &dir_path, const std::string &rank_id) {
  292. MS_LOG(INFO) << "Start to save profiling data.";
  293. for (const auto &node : tracing_nodes_) {
  294. RETURN_IF_NOT_OK(node.second->SaveToFile(dir_path, rank_id));
  295. }
  296. for (const auto &node : sampling_nodes_) {
  297. RETURN_IF_NOT_OK(node.second->SaveToFile(dir_path, rank_id));
  298. }
  299. MS_LOG(INFO) << "Save profiling data end.";
  300. return Status::OK();
  301. }
  302. Status ProfilingManager::ChangeFileMode(const std::string &dir_path, const std::string &rank_id) {
  303. MS_LOG(INFO) << "Start to change file mode.";
  304. for (const auto &node : tracing_nodes_) {
  305. RETURN_IF_NOT_OK(node.second->ChangeFileMode(dir_path, rank_id));
  306. }
  307. for (const auto &node : sampling_nodes_) {
  308. RETURN_IF_NOT_OK(node.second->ChangeFileMode(dir_path, rank_id));
  309. }
  310. MS_LOG(INFO) << "Change file mode end.";
  311. return Status::OK();
  312. }
  313. #ifndef ENABLE_ANDROID
  314. Status ProfilingManager::GetUserCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result) {
  315. uint64_t start_ts = 0, end_ts = 0;
  316. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  317. return GetUserCpuUtilByTime(start_ts, end_ts, result);
  318. }
  319. Status ProfilingManager::GetUserCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result) {
  320. uint64_t start_ts = 0, end_ts = 0;
  321. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  322. return GetUserCpuUtilByTime(start_ts, end_ts, result);
  323. }
  324. Status ProfilingManager::GetUserCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result) {
  325. std::shared_ptr<Sampling> sampling_node;
  326. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  327. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  328. return node->GetSystemUserCpuUtil(start_ts, end_ts, result);
  329. }
  330. Status ProfilingManager::GetSysCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result) {
  331. uint64_t start_ts = 0, end_ts = 0;
  332. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  333. return GetSysCpuUtilByTime(start_ts, end_ts, result);
  334. }
  335. Status ProfilingManager::GetSysCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result) {
  336. uint64_t start_ts = 0, end_ts = 0;
  337. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  338. return GetSysCpuUtilByTime(start_ts, end_ts, result);
  339. }
  340. Status ProfilingManager::GetSysCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result) {
  341. std::shared_ptr<Sampling> sampling_node;
  342. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  343. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  344. return node->GetSystemSysCpuUtil(start_ts, end_ts, result);
  345. }
  346. Status ProfilingManager::GetUserCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result) {
  347. uint64_t start_ts = 0, end_ts = 0;
  348. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  349. return GetUserCpuUtilByTime(op_id, start_ts, end_ts, result);
  350. }
  351. Status ProfilingManager::GetUserCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step,
  352. std::vector<uint16_t> *result) {
  353. uint64_t start_ts = 0, end_ts = 0;
  354. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  355. return GetUserCpuUtilByTime(op_id, start_ts, end_ts, result);
  356. }
  357. Status ProfilingManager::GetUserCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts,
  358. std::vector<uint16_t> *result) {
  359. std::shared_ptr<Sampling> sampling_node;
  360. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  361. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  362. return node->GetOpUserCpuUtil(op_id, start_ts, end_ts, result);
  363. }
  364. Status ProfilingManager::GetSysCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result) {
  365. uint64_t start_ts = 0, end_ts = 0;
  366. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  367. return GetSysCpuUtilByTime(op_id, start_ts, end_ts, result);
  368. }
  369. Status ProfilingManager::GetSysCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step,
  370. std::vector<uint16_t> *result) {
  371. uint64_t start_ts = 0, end_ts = 0;
  372. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  373. return GetSysCpuUtilByTime(op_id, start_ts, end_ts, result);
  374. }
  375. Status ProfilingManager::GetSysCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts,
  376. std::vector<uint16_t> *result) {
  377. std::shared_ptr<Sampling> sampling_node;
  378. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  379. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  380. return node->GetOpSysCpuUtil(op_id, start_ts, end_ts, result);
  381. }
  382. Status ProfilingManager::GetMainProcessMemoryInfoByEpoch(ProcessMemoryMetric metric, int32_t epoch_num,
  383. std::vector<float> *result) {
  384. uint64_t start_ts = 0, end_ts = 0;
  385. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  386. return GetMainProcessMemoryInfoByTime(metric, start_ts, end_ts, result);
  387. }
  388. Status ProfilingManager::GetMainProcessMemoryInfoByStep(ProcessMemoryMetric metric, int32_t start_step,
  389. int32_t end_step, std::vector<float> *result) {
  390. uint64_t start_ts = 0, end_ts = 0;
  391. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  392. return GetMainProcessMemoryInfoByTime(metric, start_ts, end_ts, result);
  393. }
  394. Status ProfilingManager::GetMainProcessMemoryInfoByTime(ProcessMemoryMetric metric, uint64_t start_ts, uint64_t end_ts,
  395. std::vector<float> *result) {
  396. std::shared_ptr<Sampling> sampling_node;
  397. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  398. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  399. return node->GetProcessMemoryInfo(metric, start_ts, end_ts, result);
  400. }
  401. Status ProfilingManager::GetSystemMemoryInfoByEpoch(SystemMemoryMetric metric, int32_t epoch_num,
  402. std::vector<float> *result) {
  403. uint64_t start_ts = 0, end_ts = 0;
  404. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  405. return GetSystemMemoryInfoByTime(metric, start_ts, end_ts, result);
  406. }
  407. Status ProfilingManager::GetSystemMemoryInfoByStep(SystemMemoryMetric metric, int32_t start_step, int32_t end_step,
  408. std::vector<float> *result) {
  409. uint64_t start_ts = 0, end_ts = 0;
  410. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  411. return GetSystemMemoryInfoByTime(metric, start_ts, end_ts, result);
  412. }
  413. Status ProfilingManager::GetSystemMemoryInfoByTime(SystemMemoryMetric metric, uint64_t start_ts, uint64_t end_ts,
  414. std::vector<float> *result) {
  415. std::shared_ptr<Sampling> sampling_node;
  416. RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  417. auto node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  418. return node->GetSystemMemoryInfo(metric, start_ts, end_ts, result);
  419. }
  420. #endif
  421. Status ProfilingManager::EpochToTimeInterval(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts) {
  422. if (epoch_num <= 0 || epoch_num >= epoch_end_ts_.size()) {
  423. std::string err = "Epoch: " + std::to_string(epoch_num) + " is invalid.";
  424. MS_LOG(INFO) << err;
  425. return {StatusCode::kMDUnexpectedError, err};
  426. }
  427. *start_ts = epoch_end_ts_[epoch_num - 1];
  428. *end_ts = epoch_end_ts_[epoch_num];
  429. return Status::OK();
  430. }
  431. Status ProfilingManager::EpochToStepInterval(int32_t epoch_num, uint32_t *start_step, uint32_t *end_step) {
  432. if (epoch_num <= 0 || epoch_num >= epoch_end_step_.size()) {
  433. std::string err = "Epoch: " + std::to_string(epoch_num) + " is invalid.";
  434. return {StatusCode::kMDUnexpectedError, err};
  435. }
  436. *start_step = epoch_end_step_[epoch_num - 1] + 1;
  437. *end_step = epoch_end_step_[epoch_num];
  438. return Status::OK();
  439. }
  440. Status ProfilingManager::StepToTimeInterval(int32_t start_step, int32_t end_step, uint64_t *start_ts,
  441. uint64_t *end_ts) {
  442. std::shared_ptr<Tracing> node;
  443. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  444. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  445. return node->TimeIntervalForStepRange(start_step, end_step, start_ts, end_ts);
  446. } else {
  447. return {StatusCode::kMDUnexpectedError,
  448. "Cannot find appropriate tracing node to convert step range to time interval."};
  449. }
  450. }
  451. Status ProfilingManager::TimeToStepInterval(uint64_t start_ts, uint64_t end_ts, int32_t *start_step,
  452. int32_t *end_step) {
  453. std::shared_ptr<Tracing> node;
  454. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  455. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  456. return node->StepIntervalForTimeRange(start_ts, end_ts, start_step, end_step);
  457. } else {
  458. return {StatusCode::kMDUnexpectedError,
  459. "Cannot find appropriate tracing node to convert time interval to step range."};
  460. }
  461. }
  462. Status ProfilingManager::GetConnectorSizeByEpoch(int32_t op_id, int32_t epoch_num, std::vector<int32_t> *result) {
  463. uint64_t start_ts = 0, end_ts = 0;
  464. RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  465. return GetConnectorSizeByTime(op_id, start_ts, end_ts, result);
  466. }
  467. Status ProfilingManager::GetConnectorSizeByStep(int32_t op_id, int32_t start_step, int32_t end_step,
  468. std::vector<int32_t> *result) {
  469. uint64_t start_ts = 0, end_ts = 0;
  470. RETURN_IF_NOT_OK(StepToTimeInterval(start_step, end_step, &start_ts, &end_ts));
  471. return GetConnectorSizeByTime(op_id, start_ts, end_ts, result);
  472. }
  473. Status ProfilingManager::GetConnectorSizeByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts,
  474. std::vector<int32_t> *result) {
  475. std::shared_ptr<Sampling> node;
  476. RETURN_IF_NOT_OK(GetSamplingNode(kConnectorSizeSamplingName, &node));
  477. auto connector_node = std::dynamic_pointer_cast<ConnectorSize>(node);
  478. return connector_node->GetOpConnectorSize(op_id, start_ts, end_ts, result);
  479. }
// Per-step pipeline time for one epoch.
Status ProfilingManager::GetPipelineTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  return GetPipelineTimeByStep(start_step, end_step, result);
}
// Per-step pipeline time for an inclusive step range, read from whichever
// tracing node is registered (device-queue first, then dataset-iterator).
Status ProfilingManager::GetPipelineTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetPipelineTime(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
}
// Per-step pipeline time for a timestamp interval.
Status ProfilingManager::GetPipelineTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  int32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  return GetPipelineTimeByStep(start_step, end_step, result);
}
// Per-step push time for one epoch.
Status ProfilingManager::GetPushTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  return GetPushTimeByStep(start_step, end_step, result);
}
// Per-step push time for an inclusive step range, read from whichever
// tracing node is registered (device-queue first, then dataset-iterator).
Status ProfilingManager::GetPushTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetPushTime(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
}
// Per-step push time for a timestamp interval.
Status ProfilingManager::GetPushTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  int32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  return GetPushTimeByStep(start_step, end_step, result);
}
// Per-step batch time for one epoch.
Status ProfilingManager::GetBatchTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  return GetBatchTimeByStep(start_step, end_step, result);
}
// Per-step batch time for an inclusive step range, read from whichever
// tracing node is registered (device-queue first, then dataset-iterator).
Status ProfilingManager::GetBatchTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetBatchTime(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
}
// Per-step batch time for a timestamp interval.
Status ProfilingManager::GetBatchTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  int32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  return GetBatchTimeByStep(start_step, end_step, result);
}
// Per-step connector size for one epoch.
Status ProfilingManager::GetConnectorSizeByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  return GetConnectorSizeByStep(start_step, end_step, result);
}
// Per-step connector size for an inclusive step range, read from whichever
// tracing node is registered (device-queue first, then dataset-iterator).
Status ProfilingManager::GetConnectorSizeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetConnectorSize(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
}
// Per-step connector size for a timestamp interval.
Status ProfilingManager::GetConnectorSizeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  int32_t start_step = 0, end_step = 0;
  RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  return GetConnectorSizeByStep(start_step, end_step, result);
}
  556. Status ProfilingManager::GetEmptyQueueFrequencyByEpoch(int32_t epoch_num, float_t *result) {
  557. uint32_t start_step = 0, end_step = 0;
  558. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  559. return GetEmptyQueueFrequencyByStep(start_step, end_step, result);
  560. }
  561. Status ProfilingManager::GetEmptyQueueFrequencyByStep(int32_t start_step, int32_t end_step, float_t *result) {
  562. std::shared_ptr<Tracing> node;
  563. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  564. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  565. return node->GetEmptyQueueFrequency(start_step, end_step, result);
  566. } else {
  567. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  568. }
  569. }
  570. Status ProfilingManager::GetEmptyQueueFrequencyByTime(uint64_t start_ts, uint64_t end_ts, float_t *result) {
  571. int32_t start_step = 0, end_step = 0;
  572. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  573. return GetEmptyQueueFrequencyByStep(start_step, end_step, result);
  574. }
  575. Status ProfilingManager::GetConnectorCapacityByEpoch(int32_t epoch_num, std::vector<int32_t> *result) {
  576. uint32_t start_step = 0, end_step = 0;
  577. RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  578. return GetConnectorCapacityByStep(start_step, end_step, result);
  579. }
  580. Status ProfilingManager::GetConnectorCapacityByStep(int32_t start_step, int32_t end_step,
  581. std::vector<int32_t> *result) {
  582. std::shared_ptr<Tracing> node;
  583. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  584. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  585. return node->GetConnectorCapacity(start_step, end_step, result);
  586. } else {
  587. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  588. }
  589. }
  590. Status ProfilingManager::GetConnectorCapacityByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result) {
  591. int32_t start_step = 0, end_step = 0;
  592. RETURN_IF_NOT_OK(TimeToStepInterval(start_ts, end_ts, &start_step, &end_step));
  593. return GetConnectorCapacityByStep(start_step, end_step, result);
  594. }
  595. Status ProfilingManager::GetNumberOfProfiledSteps(int32_t *steps) {
  596. std::shared_ptr<Tracing> node;
  597. if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
  598. GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
  599. *steps = node->GetNumberSteps();
  600. return Status::OK();
  601. } else {
  602. return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  603. }
  604. }
  605. void ProfilingManager::RecordEndOfEpoch(uint32_t step_num) {
  606. if (profiling_state_ != ProfilingState::kProfilingStateRunning) {
  607. return;
  608. }
  609. MS_LOG(INFO) << "Recording end of epoch. step_num: " << step_num;
  610. (void)epoch_end_ts_.emplace_back(ProfilingTime::GetCurMilliSecond());
  611. (void)epoch_end_step_.emplace_back(step_num);
  612. }
  613. Status ProfilingManager::Reset() {
  614. for (const auto &node : tracing_nodes_) {
  615. node.second->Clear();
  616. }
  617. for (const auto &node : sampling_nodes_) {
  618. node.second->Clear();
  619. }
  620. epoch_end_ts_.clear();
  621. epoch_end_step_.clear();
  622. profiling_state_ = ProfilingState::kProfilingStateUnBegun;
  623. autotuning_ = false;
  624. profiling_ = false;
  625. return Status::OK();
  626. }
  627. Status ProfilingManager::Init(const bool for_autotune) {
  628. // Reinitialization should only be done in case of UT with sequential pipelines and should not be used externally.
  629. // Reinitialization with parallel data pipelines can have unexpected consequences.
  630. CHECK_FAIL_RETURN_UNEXPECTED(!autotuning_, "Stop MD Autotune before initializing the MD Profiler.");
  631. CHECK_FAIL_RETURN_UNEXPECTED(!profiling_, "Stop MD Profiler before initializing it.");
  632. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ != ProfilingState::kProfilingStateRunning,
  633. "Stop MD Profiler before reinitializing it.");
  634. Reset();
  635. tracing_nodes_.clear();
  636. sampling_nodes_.clear();
  637. tree_ = nullptr;
  638. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ == ProfilingState::kProfilingStateUnBegun,
  639. "MD Profiler is in an unexpected state.");
  640. if (for_autotune) {
  641. autotuning_ = true;
  642. MS_LOG(INFO) << "MD profiler is initialized successfully for autotuning.";
  643. } else {
  644. profiling_ = true;
  645. MS_LOG(INFO) << "MD profiler is initialized successfully for profiling.";
  646. }
  647. return Status::OK();
  648. }
  649. Status ProfilingManager::Start() {
  650. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ != ProfilingState::kProfilingStateRunning,
  651. "MD ProfilingManager is already running.");
  652. if (profiling_state_ == ProfilingState::kProfilingStateFinished) {
  653. // This scenario (start, stop, and then start again) only happens in profiling, not autotune.
  654. MS_LOG(INFO) << "MD ProfilingManager had already stopped. Resetting...";
  655. Reset();
  656. for (const auto &node : sampling_nodes_) {
  657. RETURN_IF_NOT_OK(node.second->Init());
  658. }
  659. for (const auto &node : tracing_nodes_) {
  660. RETURN_IF_NOT_OK(node.second->Init());
  661. }
  662. profiling_ = true;
  663. MS_LOG(INFO) << "MD profiler is reset successfully for profiling.";
  664. }
  665. profiling_state_ = ProfilingState::kProfilingStateRunning;
  666. for (const auto &node : tracing_nodes_) {
  667. RETURN_IF_NOT_OK(node.second->Start());
  668. }
  669. for (const auto &node : sampling_nodes_) {
  670. RETURN_IF_NOT_OK(node.second->Start());
  671. }
  672. MS_LOG(INFO) << "MD profiler is started.";
  673. return Status::OK();
  674. }
  675. Status ProfilingManager::Stop() {
  676. CHECK_FAIL_RETURN_UNEXPECTED(profiling_state_ != ProfilingState::kProfilingStateUnBegun,
  677. "MD ProfilingManager has not started yet.");
  678. // It's OK if we are in kProfilingStateFinished state. We allow user to call Stop twice.
  679. if (profiling_state_ == ProfilingState::kProfilingStateFinished) {
  680. MS_LOG(WARNING) << "MD ProfilingManager had already stopped.";
  681. return Status::OK();
  682. }
  683. for (const auto &node : tracing_nodes_) {
  684. RETURN_IF_NOT_OK(node.second->Stop());
  685. }
  686. for (const auto &node : sampling_nodes_) {
  687. RETURN_IF_NOT_OK(node.second->Stop());
  688. }
  689. profiling_state_ = ProfilingState::kProfilingStateFinished;
  690. if (autotuning_) {
  691. autotuning_ = false;
  692. MS_LOG(INFO) << "MD Autotune is stopped.";
  693. }
  694. if (profiling_) {
  695. profiling_ = false;
  696. MS_LOG(INFO) << "MD Profiler is stopped.";
  697. }
  698. return Status::OK();
  699. }
  700. Status ProfilingManager::Save(const std::string &profile_data_path) {
  701. // Validate input profile data path
  702. CHECK_FAIL_RETURN_UNEXPECTED(!profile_data_path.empty(), "Invalid parameter, Profiling directory is not set.");
  703. CHECK_FAIL_RETURN_UNEXPECTED(profile_data_path.size() < PATH_MAX, "Invalid file, Profiling directory is invalid.");
  704. // profiling file: <profile_data_path>/filename_rank_id.suffix
  705. char real_path[PATH_MAX] = {0};
  706. #if defined(_WIN32) || defined(_WIN64)
  707. if (_fullpath(real_path, common::SafeCStr(profile_data_path), PATH_MAX) == nullptr) {
  708. RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
  709. }
  710. #else
  711. if (realpath(common::SafeCStr(profile_data_path), real_path) == nullptr) {
  712. RETURN_STATUS_UNEXPECTED("Invalid file, can not get realpath of Profiling directory.");
  713. }
  714. #endif
  715. std::string rank_id;
  716. #ifdef ENABLE_GPUQUE
  717. std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  718. int32_t rank_id_int = cfg->rank_id();
  719. // If DEVICE_ID is not set, default value is 0
  720. if (rank_id_int < 0) {
  721. rank_id = common::GetEnv("DEVICE_ID");
  722. } else {
  723. rank_id = std::to_string(rank_id_int);
  724. }
  725. #else
  726. rank_id = common::GetEnv("RANK_ID");
  727. #endif
  728. // If RANK_ID is not set, default value is 0
  729. if (rank_id.empty()) {
  730. rank_id = "0";
  731. }
  732. // Output all profiling data upon request.
  733. RETURN_IF_NOT_OK(SaveProfilingData(std::string(profile_data_path), rank_id));
  734. RETURN_IF_NOT_OK(ChangeFileMode(std::string(profile_data_path), rank_id));
  735. return Status::OK();
  736. }
  737. ProfilingManager::ProfilingRegistrationState ProfilingManager::GetProfilerTreeState(const ExecutionTree *tree) const {
  738. auto enabled = (profiling_ || autotuning_);
  739. if (!enabled) return kNotEnabled;
  740. if (tree_ == nullptr) {
  741. return kEnabledTreeNotRegistered;
  742. } else {
  743. return tree_ == tree ? kEnabledTreeRegistered : kEnabledDifferentTreeRegistered;
  744. }
  745. }
  746. uint64_t ProfilingTime::GetCurMilliSecond() {
  747. // because cpplint does not allow using namespace
  748. using std::chrono::duration_cast;
  749. using std::chrono::milliseconds;
  750. using std::chrono::steady_clock;
  751. return static_cast<uint64_t>(duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count());
  752. }
  753. } // namespace dataset
  754. } // namespace mindspore