You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

profiling.h 31 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620
  1. /**
  2. * Copyright 2020-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_
  17. #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_
  18. #include <atomic>
  19. #include <chrono>
  20. #include <memory>
  21. #include <mutex>
  22. #include <string>
  23. #include <unordered_map>
  24. #include <vector>
  25. #include <nlohmann/json.hpp>
  26. #include "minddata/dataset/util/path.h"
  27. #include "minddata/dataset/util/status.h"
  28. #include "minddata/dataset/engine/perf/monitor.h"
  29. namespace mindspore {
  30. namespace dataset {
  // Forward declarations of classes that are defined elsewhere (listed alphabetically).
  class CpuSampler;
  class ExecutionTree;
  class Monitor;
  class TreeAdapter;
  class TreeConsumer;
  // Name constants for the tracing and sampling profilers.
  // NOTE(review): presumably these are the keys used to look nodes up via
  // ProfilingManager::GetTracingNode()/GetSamplingNode() - confirm against profiling.cc.
  constexpr char kDeviceQueueTracingName[] = "Device_Queue_Tracing";
  constexpr char kDatasetIteratorTracingName[] = "Dataset_Iterator_Tracing";
  constexpr char kConnectorSizeSamplingName[] = "Connector_Size_Sampling";
  constexpr char kCpuSamplerName[] = "Cpu_Sampler";
  // Process memory metrics - shared by profiling and cpu_sampler.
  enum ProcessMemoryMetric {
    kPSS = 0,  // proportional set size
    kRSS = 1,  // resident set size
    kVSS = 2   // virtual set size
  };

  // System memory metrics - shared by profiling and cpu_sampler.
  enum SystemMemoryMetric {
    kMemoryAvailable = 0,
    kMemoryTotal = 1,
    kMemoryUsed = 2
  };
  44. // Profiling is a class of basic unit of profiling action
  45. // This base class encapsulate the serialization output logic
  46. class Profiling : public std::enable_shared_from_this<Profiling> {
  47. public:
  48. // Constructor
  49. Profiling() : active_(false) {}
  50. // Destructor
  51. virtual ~Profiling() = default;
  52. virtual Status Init() = 0;
  53. // Default serialization file generator
  54. virtual Status SaveToFile(const std::string &dir_path, const std::string &rank_id) = 0;
  55. // Profiling name
  56. virtual std::string Name() const = 0;
  57. virtual Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id) = 0;
  58. // Start collecting data
  59. Status Start();
  60. // Stop collecting data
  61. Status Stop();
  62. // Clear all collected data
  63. virtual void Clear() = 0;
  64. protected:
  65. bool active_; // show current state of ProfilingManager (running, or paused)
  66. std::mutex lock_;
  67. virtual Path GetFileName(const std::string &dir_path, const std::string &rank_id) = 0;
  68. };
  69. // Sampling is a class of profiling which generate samples periodically.
  70. class Sampling : public Profiling {
  71. public:
  72. // Sampling action function. This function will be invoked by performance monitor thread.
  73. virtual Status Sample() = 0;
  74. ~Sampling() override = default;
  75. };
  // A single tracing entry. Its five fields mirror, in order, the arguments of Tracing::Record().
  // NOTE(review): `type`/`extra_info` presumably carry ProfilingType / ProfilingTimeSubType
  // values - confirm against the callers of Tracing::Record().
  typedef struct TracingRecord_s {
    int32_t type;
    int32_t extra_info;
    int32_t batch_num;
    int32_t value;
    uint64_t ts;  // time stamp of the record

    // Render the record as its five fields, in declaration order, separated by single spaces.
    // The method only reads the fields, so it is const and can be called on const records
    // and through const references.
    std::string ToString() const {
      return std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " +
             std::to_string(value) + " " + std::to_string(ts);
    }
  } TracingRecord;
  87. // Tracing is class of profiling which record samples upon request.
  88. class Tracing : public Profiling {
  89. public:
  90. // Tracing has minimal interface to provide flexible on data recording.
  91. // It only includes some common routines.
  92. Status SaveToFile(const std::string &dir_path, const std::string &rank_id) override;
  93. Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id) override;
  94. Status Init() override;
  95. Status GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  96. Status GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  97. Status GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  98. Status GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  99. Status GetConnectorCapacity(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  100. Status GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq);
  101. void Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
  102. const uint64_t time_stamp);
  103. Status TimeIntervalForStepRange(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts);
  104. Status StepIntervalForTimeRange(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step);
  105. size_t GetNumberSteps();
  106. // Clear all collected data
  107. void Clear() override;
  108. protected:
  109. Tracing() = default;
  110. std::vector<std::string> value_;
  111. std::vector<TracingRecord> records_;
  112. std::vector<uint64_t> ts_; // End time of each step or batch
  113. Status GetRecordEntryFieldValue(int32_t start_step, int32_t end_step, int32_t record_offset, const std::string &field,
  114. std::vector<int32_t> *result);
  115. };
  116. // ProfilingManager is a class manages all profiling infrastructure
  117. // It serves the following purposes:
  118. // 1) Fetch profiling configs from global contexts
  119. // 2) Setup all profiling node based on config
  120. // 3) Provide access of profiling nodes for profiling actions
  121. // 4) Manage profiling data serialization process
  122. class ProfilingManager {
  123. friend Monitor;
  124. public:
  125. ProfilingManager();
  126. ~ProfilingManager() = default;
  127. /// Register the given tree to be profiled.
  128. /// This method should be called once, calling it for another tree without resetting the ProfilingManager would fail.
  129. /// \param tree_adapter pointer the adapter that owns the ExecutionTree
  130. /// \return Status the status code returned
  131. Status RegisterTree(TreeAdapter *tree_adapter);
  132. /// Reset the ProfilingManager. This method is sued when we want to profile another tree in the same process.
  133. /// \return Status the status code returned
  134. Status Reset();
  135. // Save profile data to file
  136. // @param dir_path_ The path to the directory where the profiling data will be saved.
  137. // @return Status The status code returned
  138. Status SaveProfilingData(const std::string &dir_path, const std::string &rank_id);
  139. // Sampling node getter
  140. // @param name - The name of the requested node
  141. // @param node - Pointer to the shared pointer for the Sampling node
  142. // @return Status The status code returned
  143. Status GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node);
  144. // Tracing node getter
  145. // @param name - The name of the requested node
  146. // @param node - Pointer to the shared pointer for the Tracing node
  147. // @return Status The status code returned
  148. Status GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node);
  149. // return true if enabled_ is set to true, namely if Init() has been called successfully
  150. // @param tree - Execution tree pointer
  151. bool IsProfilingEnable(const ExecutionTree *tree = nullptr) const;
  152. // Record end of epoch information
  153. // @param step_num - The number of steps
  154. void RecordEndOfEpoch(uint32_t step_num);
  155. const std::unordered_map<std::string, std::shared_ptr<Sampling>> &GetSamplingNodes() const { return sampling_nodes_; }
  156. // Launch monitoring thread.
  157. Status LaunchMonitor();
  158. // @return Status The status code returned
  159. Status ChangeFileMode(const std::string &dir_path, const std::string &rank_id);
  160. #ifndef ENABLE_ANDROID
  161. /// \brief API to get User CPU utilization for the system
  162. /// \param [in] epoch_num The epoch number for which results are requested
  163. /// \param [out] result A vector with the sampled User CPU Utilization for the entire system
  164. /// \return Status object with the error code
  165. Status GetUserCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result);
  166. /// \brief API to get User CPU utilization for the system
  167. /// \param [in] start_step The step interval start range
  168. /// \param [in] end_step The step interval end range
  169. /// \param [out] result A vector with the sampled User CPU Utilization for the entire system
  170. /// \return Status object with the error code
  171. Status GetUserCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result);
  172. /// \brief API to get User CPU utilization for the system
  173. /// \param [in] start_ts The time interval start range in ms
  174. /// \param [in] end_ts The time interval end range in ms
  175. /// \param [out] result A vector with the sampled User CPU Utilization for the entire system
  176. /// \return Status object with the error code
  177. Status GetUserCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result);
  178. /// \brief API to get System CPU utilization for the system
  179. /// \param [in] epoch_num The epoch number for which results are requested
  180. /// \param [out] result A vector with the sampled System CPU Utilization for the entire system
  181. /// \return Status object with the error code
  182. Status GetSysCpuUtilByEpoch(int32_t epoch_num, std::vector<uint8_t> *result);
  183. /// \brief API to get System CPU utilization for the system
  184. /// \param [in] start_step The step interval start range
  185. /// \param [in] end_step The step interval end range
  186. /// \param [out] result A vector with the sampled System CPU Utilization for the entire system
  187. /// \return Status object with the error code
  188. Status GetSysCpuUtilByStep(int32_t start_step, int32_t end_step, std::vector<uint8_t> *result);
  189. /// \brief API to get System CPU utilization for the system
  190. /// \param [in] start_ts The time interval start range in ms
  191. /// \param [in] end_ts The time interval end range in ms
  192. /// \param [out] result A vector with the sampled System CPU Utilization for the entire system
  193. /// \return Status object with the error code
  194. Status GetSysCpuUtilByTime(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result);
  195. /// \brief API to get User CPU Utilization of an MD operator
  196. /// \param [in] op_id The id of the operator
  197. /// \param [in] epoch_num The epoch number for which results are requested
  198. /// \param [out] result A vector with the sampled User CPU Utilization of the operator.
  199. /// \return Status object with the error code
  200. Status GetUserCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result);
  201. /// \brief API to get User CPU Utilization of an MD operator
  202. /// \param [in] op_id The id of the operator
  203. /// \param [in] start_step The step interval start range
  204. /// \param [in] end_step The step interval end range
  205. /// \param [out] result A vector with the sampled User CPU Utilization of the operator.
  206. /// \return Status object with the error code
  207. Status GetUserCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<uint16_t> *result);
  208. /// \brief API to get User CPU Utilization of an MD operator
  209. /// \param [in] op_id The id of the operator
  210. /// \param [in] start_ts The time interval start range in ms
  211. /// \param [in] end_ts The time interval end range in ms
  212. /// \param [out] result A vector with the sampled User CPU Utilization of the operator.
  213. /// \return Status object with the error code
  214. Status GetUserCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result);
  215. /// \brief API to get System CPU Utilization of an MD operator
  216. /// \param [in] op_id The id of the operator
  217. /// \param [in] epoch_num The epoch number for which results are requested
  218. /// \param [out] result A vector with the sampled System CPU Utilization of the operator.
  219. /// \return Status object with the error code
  220. Status GetSysCpuUtilByEpoch(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result);
  221. /// \brief API to get System CPU Utilization of an MD operator
  222. /// \param [in] op_id The id of the operator
  223. /// \param [in] start_step The step interval start range
  224. /// \param [in] end_step The step interval end range
  225. /// \param [out] result A vector with the sampled System CPU Utilization of the operator.
  226. /// \return Status object with the error code
  227. Status GetSysCpuUtilByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<uint16_t> *result);
  228. /// \brief API to get System CPU Utilization of an MD operator
  229. /// \param [in] op_id The id of the operator
  230. /// \param [in] start_ts The time interval start range in ms
  231. /// \param [in] end_ts The time interval end range in ms
  232. /// \param [out] result A vector with the sampled System CPU Utilization of the operator.
  233. /// \return Status object with the error code
  234. Status GetSysCpuUtilByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result);
  235. /// \brief API to get information on main process memory usage
  236. /// \param [in] metric The requested memory set usage. One of these values:
  237. /// - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage
  238. /// - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of
  239. /// shared libraries
  240. /// - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries)
  241. /// \param [in] epoch_num The epoch number for which results are requested
  242. /// \param [out] result The desired value in MB
  243. /// \return Status object with the error code
  244. Status GetMainProcessMemoryInfoByEpoch(ProcessMemoryMetric metric, int32_t epoch_num, std::vector<float> *result);
  245. /// \brief API to get information on main process memory usage
  246. /// \param [in] metric The requested memory set usage. One of these values:
  247. /// - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage
  248. /// - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of
  249. /// shared libraries
  250. /// - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries)
  251. /// \param [in] end_ts The time interval end range in ms
  252. /// \param [in] start_step The step interval start range
  253. /// \param [in] end_step The step interval end range
  254. /// \param [out] result The desired value in MB
  255. /// \return Status object with the error code
  256. Status GetMainProcessMemoryInfoByStep(ProcessMemoryMetric metric, int32_t start_step, int32_t end_step,
  257. std::vector<float> *result);
  258. /// \brief API to get information on main process memory usage
  259. /// \param [in] metric The requested memory set usage. One of these values:
  260. /// - ProcessMemoryMetric::kVSS - virtual set size, virtual memory usage
  261. /// - ProcessMemoryMetric::kPSS - proportional set size, physical memory usage with proportional allocation of
  262. /// shared libraries
  263. /// - ProcessMemoryMetric::kRSS - resident set size, physical memory usage (includes shared libraries)
  264. /// \param [in] start_ts The time interval start range in ms
  265. /// \param [in] end_ts The time interval end range in ms
  266. /// \param [out] result The desired value in MB
  267. /// \return Status object with the error code
  268. Status GetMainProcessMemoryInfoByTime(ProcessMemoryMetric metric, uint64_t start_ts, uint64_t end_ts,
  269. std::vector<float> *result);
  270. /// \brief API to get information on system memory usage
  271. /// \param [in] metric The requested memory metric. One of these values:
  272. /// - SystemMemoryMetric::kMemoryAvailable
  273. /// - SystemMemoryMetric::kMemoryTotal
  274. /// - SystemMemoryMetric::kMemoryUsed
  275. /// \param [in] epoch_num The epoch number for which results are requested
  276. /// \param [out] result The desired value in MB
  277. /// \return Status object with the error code
  278. Status GetSystemMemoryInfoByEpoch(SystemMemoryMetric metric, int32_t epoch_num, std::vector<float> *result);
  279. /// \brief API to get information on system memory usage
  280. /// \param [in] metric The requested memory metric. One of these values:
  281. /// - SystemMemoryMetric::kMemoryAvailable
  282. /// - SystemMemoryMetric::kMemoryTotal
  283. /// - SystemMemoryMetric::kMemoryUsed
  284. /// \param [in] start_step The step interval start range
  285. /// \param [in] end_step The step interval end range
  286. /// \param [out] result The desired value in MB
  287. /// \return Status object with the error code
  288. Status GetSystemMemoryInfoByStep(SystemMemoryMetric metric, int32_t start_step, int32_t end_step,
  289. std::vector<float> *result);
  290. /// \brief API to get information on system memory usage
  291. /// \param [in] metric The requested memory metric. One of these values:
  292. /// - SystemMemoryMetric::kMemoryAvailable
  293. /// - SystemMemoryMetric::kMemoryTotal
  294. /// - SystemMemoryMetric::kMemoryUsed
  295. /// \param [in] start_ts The time interval start range in ms
  296. /// \param [in] end_ts The time interval end range in ms
  297. /// \param [out] result The desired value in MB
  298. /// \return Status object with the error code
  299. Status GetSystemMemoryInfoByTime(SystemMemoryMetric metric, uint64_t start_ts, uint64_t end_ts,
  300. std::vector<float> *result);
  301. #endif
  302. /// \brief API to get the connector size of an MD operator
  303. /// \param [in] op_id The id of the operator
  304. /// \param [in] epoch_num The epoch number for which results are requested
  305. /// \param [out] result A vector with the sampled connector sizes of the operator
  306. /// \return Status object with the error code
  307. Status GetConnectorSizeByEpoch(int32_t op_id, int32_t epoch_num, std::vector<int32_t> *result);
  308. /// \brief API to get the connector size of an MD operator
  309. /// \param [in] op_id The id of the operator
  310. /// \param [in] start_step The step interval start range
  311. /// \param [in] end_step The step interval end range
  312. /// \param [out] result A vector with the sampled connector sizes of the operator
  313. /// \return Status object with the error code
  314. Status GetConnectorSizeByStep(int32_t op_id, int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  315. /// \brief API to get the connector size of an MD operator
  316. /// \param [in] op_id The id of the operator
  317. /// \param [in] start_ts The time interval start range in ms
  318. /// \param [in] end_ts The time interval end range in ms
  319. /// \param [out] result A vector with the sampled connector sizes of the operator
  320. /// \return Status object with the error code
  321. Status GetConnectorSizeByTime(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
  322. /// \brief API to get the connector size of DatasetIterator or DeviceQueueOp
  323. /// \param [in] epoch_num The epoch number for which results are requested
  324. /// \param [out] result A vector with connector size at each step
  325. /// \return Status object with the error code
  326. Status GetConnectorSizeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
  327. /// \brief API to get the connector size of DatasetIterator or DeviceQueueOp
  328. /// \param [in] start_step The step interval start range
  329. /// \param [in] end_step The step interval end range
  330. /// \param [out] result A vector with connector size at each step
  331. /// \return Status object with the error code
  332. Status GetConnectorSizeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  333. /// \brief API to get the connector size of DatasetIterator or DeviceQueueOp
  334. /// \param [in] start_ts The time interval start range in ms
  335. /// \param [in] end_ts The time interval end range in ms
  336. /// \param [out] result A vector with connector size at each step
  337. /// \return Status object with the error code
  338. Status GetConnectorSizeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
  339. /// \brief API to get the connector capacity of DatasetIterator or DeviceQueueOp
  340. /// \param [in] epoch_num The epoch number for which results are requested
  341. /// \param [out] result A vector with connector capacity at each step
  342. /// \return Status object with the error code
  343. Status GetConnectorCapacityByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
  344. /// \brief API to get the connector capacity of DatasetIterator or DeviceQueueOp
  345. /// \param [in] start_step The step interval start range
  346. /// \param [in] end_step The step interval end range
  347. /// \param [out] result A vector with connector capacity at each step
  348. /// \return Status object with the error code
  349. Status GetConnectorCapacityByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  350. /// \brief API to get the connector capacity of DatasetIterator or DeviceQueueOp
  351. /// \param [in] start_ts The time interval start range in ms
  352. /// \param [in] end_ts The time interval end range in ms
  353. /// \param [out] result A vector with connector capacity for steps in the given time range
  354. /// \return Status object with the error code
  355. Status GetConnectorCapacityByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
  356. /// \brief API to get the pipeline time of batches
  357. /// \param [in] epoch_num The epoch number for which results are requested
  358. /// \param [out] result A vector with the pipeline time for each step
  359. /// \return Status object with the error code
  360. Status GetPipelineTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
  361. /// \brief API to get the pipeline time of batches
  362. /// \param [in] start_step The step interval start range
  363. /// \param [in] end_step The step interval end range
  364. /// \param [out] result A vector with the pipeline time for each step
  365. /// \return Status object with the error code
  366. Status GetPipelineTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  367. /// \brief API to get the pipeline time of batches
  368. /// \param [in] start_ts The time interval start range in ms
  369. /// \param [in] end_ts The time interval end range in ms
  370. /// \param [out] result A vector with the pipeline time for steps in the given time range
  371. /// \return Status object with the error code
  372. Status GetPipelineTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
  373. /// \brief API to get the push time of batches
  374. /// \param [in] epoch_num The epoch number for which results are requested
  375. /// \param [out] result A vector with the push time for each each step
  376. /// \return Status object with the error code
  377. Status GetPushTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
  378. /// \brief API to get the push time of batches
  379. /// \param [in] start_step The step interval start range
  380. /// \param [in] end_step The step interval end range
  381. /// \param [out] result A vector with the push time for each each step
  382. /// \return Status object with the error code
  383. Status GetPushTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  384. /// \brief API to get the push time of batches
  385. /// \param [in] start_ts The time interval start range in ms
  386. /// \param [in] end_ts The time interval end range in ms
  387. /// \param [out] result A vector with the push time for steps in the given time range
  388. /// \return Status object with the error code
  389. Status GetPushTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
  390. /// \brief API to get the batch time of batches
  391. /// \param [in] epoch_num The epoch number for which results are requested
  392. /// \param [out] result A vector with the batch time for each step
  393. /// \return Status object with the error code
  394. Status GetBatchTimeByEpoch(int32_t epoch_num, std::vector<int32_t> *result);
  395. /// \brief API to get the batch time of batches
  396. /// \param [in] start_step The step interval start range
  397. /// \param [in] end_step The step interval end range
  398. /// \param [out] result A vector with the batch time for each step
  399. /// \return Status object with the error code
  400. Status GetBatchTimeByStep(int32_t start_step, int32_t end_step, std::vector<int32_t> *result);
  401. /// \brief API to get the batch time of batches
  402. /// \param [in] start_ts The time interval start range in ms
  403. /// \param [in] end_ts The time interval end range in ms
  404. /// \param [out] result A vector with the batch time for steps in the given time range
  405. /// \return Status object with the error code
  406. Status GetBatchTimeByTime(uint64_t start_ts, uint64_t end_ts, std::vector<int32_t> *result);
  407. /// \brief API to get fraction of steps that DatasetIterator or DeviceQueueOp connector was empty
  408. /// \param [in] epoch_num The epoch number for which results are requested
  409. /// \param [out] result The empty queue frequency
  410. /// \return Status object with the error code
  411. Status GetEmptyQueueFrequencyByEpoch(int32_t epoch_num, float_t *result);
  412. /// \brief API to get fraction of steps that DatasetIterator or DeviceQueueOp connector was empty
  413. /// \param [in] start_step The step interval start range
  414. /// \param [in] end_step The step interval end range
  415. /// \param [out] result The empty queue frequency
  416. /// \return Status object with the error code
  417. Status GetEmptyQueueFrequencyByStep(int32_t start_step, int32_t end_step, float_t *result);
  418. /// \brief API to get fraction of steps that DatasetIterator or DeviceQueueOp connector was empty
  419. /// \param [in] start_ts The time interval start range in ms
  420. /// \param [in] end_ts The time interval end range in ms
  421. /// \param [out] result The empty queue frequency
  422. /// \return Status object with the error code
  423. Status GetEmptyQueueFrequencyByTime(uint64_t start_ts, uint64_t end_ts, float_t *result);
  424. // Register profile node to tree
  425. // @param node - Profiling node
  426. // @return Status The status code returned
  427. Status RegisterTracingNode(const std::shared_ptr<Tracing> &node);
  428. /// \brief API to initialize profiling manager
  429. /// \param for_autotune flag to indicate if Profiler is initialized for autotuning or profiling purposes
  430. /// \return Status object with the error code
  431. Status Init(bool for_autotune = false);
  432. /// \brief API to signal the profiling nodes to start collecting data
  433. /// \return Status object with the error code
  434. Status Start();
  435. /// \brief API to signal profiling nodes to stop collecting data
  436. /// \return Status object with the error code
  437. Status Stop();
  438. /// \brief API to save to file all the collected data between Start and Stop calls
  439. /// \return Status object with the error code
  440. Status Save(const std::string &profile_data_path);
  441. /// Get number of epochs that have been already profiled
  442. /// \return number of epochs
  443. int32_t GetNumOfProfiledEpochs() { return static_cast<int32_t>(epoch_end_step_.size()) - 1; }
  444. // Get number of steps taken in pipeline
  445. /// \return number of steps
  446. Status GetNumberOfProfiledSteps(int32_t *size);
  447. /// Determine if the Profiler is being used for autotuning.
  448. /// \return boolean
  449. bool IsAutotuning() const { return autotuning_; }
  450. /// Determine if the Profiler is being used for profiling.
  451. /// \return boolean
  452. bool IsProfiling() const { return profiling_; }
  453. // Registration state for the profiler
  454. enum ProfilingRegistrationState {
  455. kNotEnabled,
  456. kEnabledTreeNotRegistered,
  457. kEnabledTreeRegistered,
  458. kEnabledDifferentTreeRegistered,
  459. };
  460. /// \brief Getter for the profiling and tree registration state
  461. /// \param tree Execution Tree pointer
  462. /// \return ProfilingRegistrationState
  463. ProfilingRegistrationState GetProfilerTreeState(const ExecutionTree *tree) const;
  464. protected:
  465. std::unique_ptr<Monitor> perf_monitor_;
  466. // State flags for profiling
  467. enum ProfilingState {
  468. kProfilingStateUnBegun,
  469. kProfilingStateRunning,
  470. kProfilingStateFinished,
  471. };
  472. ProfilingState profiling_state_; // show current state of ProfilingManager (running, or paused)
  473. std::unordered_map<std::string, std::shared_ptr<Tracing>> tracing_nodes_;
  474. std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_;
  475. ExecutionTree *tree_; // ExecutionTree pointer
  476. std::vector<uint64_t> epoch_end_ts_; // End of epoch timestamp
  477. std::vector<uint32_t> epoch_end_step_; // End of epoch step number
  478. std::atomic<bool> autotuning_; // flag to indicate if ProfilingManager is being used for auto-tuning the pipeline
  479. std::atomic<bool> profiling_; // flag to indicate if ProfilingManager is being used for profiling the pipeline
  480. // Register profile node to tree
  481. // @param node - Profiling node
  482. // @return Status The status code returned
  483. Status RegisterSamplingNode(const std::shared_ptr<Sampling> &node);
  484. /// \brief Helper to convert a given epoch number to a step interval
  485. /// \param [in] epoch_num The epoch number to be converted
  486. /// \param [out] start_step The corresponding start step for the given epoch
  487. /// \param [out] end_step The corresponding end step for the given epoch
  488. /// \return Status object with the error code
  489. Status EpochToStepInterval(int32_t epoch_num, uint32_t *start_step, uint32_t *end_step);
  490. /// \brief Helper to convert a given epoch number to a time interval
  491. /// \param [in] epoch_num The epoch number to be converted
  492. /// \param [out] start_ts The corresponding starting timestamp in ms for the given epoch
  493. /// \param [out] end_ts The corresponding ending timestamp in ms for the given epoch
  494. /// \return Status object with the error code
  495. Status EpochToTimeInterval(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts);
  496. /// \brief Helper to convert step interval to a time interval
  497. /// \param [in] start_step The step interval start range
  498. /// \param [in] end_step The step interval end range
  499. /// \param [out] start_ts The corresponding starting timestamp in ms for the given step interval
  500. /// \param [out] end_ts The corresponding ending timestamp in ms for the given step interval
  501. /// \return Status object with the error code
  502. Status StepToTimeInterval(int32_t start_step, int32_t end_step, uint64_t *start_ts, uint64_t *end_ts);
  503. /// \brief Helper to convert time interval to a step interval
  504. /// \param [in] start_ts The time interval start range in ms
  505. /// \param [in] end_ts The time interval end range in ms
  506. /// \param [out] start_step The corresponding start step for the given time interval
  507. /// \param [out] end_step The corresponding end step for the given time interval
  508. /// \return Status object with the error code
  509. Status TimeToStepInterval(uint64_t start_ts, uint64_t end_ts, int32_t *start_step, int32_t *end_step);
  510. };
  // Top-level category of a profiling record.
  enum ProfilingType {
    TIME = 0,
    CONNECTOR_DEPTH = 1
  };

  // Sub-category used for time records.
  enum ProfilingTimeSubType {
    PIPELINE_TIME = 0,
    TDT_PUSH_TIME = 1,
    BATCH_TIME = 2,
    INVALID_TIME = 3
  };
  // Time helper for the profiling code; exposes a single static function.
  class ProfilingTime {
   public:
    // Returns a time stamp as an unsigned 64-bit value.
    // NOTE(review): going by the name this is the current time in milliseconds; the clock
    // source is defined in the implementation file - confirm there.
    static uint64_t GetCurMilliSecond();
  };
  522. } // namespace dataset
  523. } // namespace mindspore
  524. #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_PROFILING_H_