You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

profiling.cc 8.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "minddata/dataset/engine/perf/profiling.h"
  17. #include <cstdlib>
  18. #include <fstream>
  19. #include "utils/ms_utils.h"
  20. #include "minddata/dataset/util/path.h"
  21. #ifdef ENABLE_GPUQUE
  22. #include "minddata/dataset/core/config_manager.h"
  23. #include "minddata/dataset/core/global_context.h"
  24. #endif
  25. #include "minddata/dataset/engine/perf/monitor.h"
  26. #include "minddata/dataset/engine/perf/device_queue_tracing.h"
  27. #include "minddata/dataset/engine/perf/connector_size.h"
  28. #include "minddata/dataset/engine/perf/connector_throughput.h"
  29. #include "minddata/dataset/engine/perf/cpu_sampling.h"
  30. #include "minddata/dataset/engine/perf/dataset_iterator_tracing.h"
  31. #include "minddata/dataset/util/log_adapter.h"
  32. namespace mindspore {
  33. namespace dataset {
  34. Status Tracing::SaveToFile() {
  35. if (value_.empty()) {
  36. return Status::OK();
  37. }
  38. std::ofstream handle(file_path_, std::ios::trunc);
  39. if (!handle.is_open()) {
  40. RETURN_STATUS_UNEXPECTED("Profiling file can not be opened.");
  41. }
  42. for (auto value : value_) {
  43. handle << value << "\n";
  44. }
  45. handle.close();
  46. return Status::OK();
  47. }
  48. Status Sampling::ReadJson(nlohmann::json *output) {
  49. RETURN_UNEXPECTED_IF_NULL(output);
  50. Path path = Path(file_path_);
  51. if (path.Exists()) {
  52. MS_LOG(DEBUG) << file_path_ << " exists";
  53. try {
  54. std::ifstream file(file_path_);
  55. file >> (*output);
  56. } catch (const std::exception &err) {
  57. RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + file_path_ +
  58. ", please delete it and try again!");
  59. }
  60. } else {
  61. (*output)["sampling_interval"] = GlobalContext::config_manager()->monitor_sampling_interval();
  62. }
  63. return Status::OK();
  64. }
  65. // Constructor
  66. ProfilingManager::ProfilingManager(ExecutionTree *tree) : tree_(tree), enabled_(true) {
  67. perf_monitor_ = std::make_unique<Monitor>(tree_);
  68. }
  69. bool ProfilingManager::IsProfilingEnable() const { return common::GetEnv("PROFILING_MODE") == "true" && enabled_; }
  70. Status ProfilingManager::Initialize() {
  71. // Register nodes based on config
  72. std::string dir = common::GetEnv("MINDDATA_PROFILING_DIR");
  73. if (dir.empty()) {
  74. RETURN_STATUS_UNEXPECTED("Profiling dir is not set.");
  75. }
  76. char real_path[PATH_MAX] = {0};
  77. if (dir.size() >= PATH_MAX) {
  78. RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
  79. }
  80. #if defined(_WIN32) || defined(_WIN64)
  81. if (_fullpath(real_path, common::SafeCStr(dir), PATH_MAX) == nullptr) {
  82. RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
  83. }
  84. #else
  85. if (realpath(common::SafeCStr(dir), real_path) == nullptr) {
  86. RETURN_STATUS_UNEXPECTED("Profiling dir is invalid.");
  87. }
  88. #endif
  89. dir_path_ = real_path;
  90. #ifdef ENABLE_GPUQUE
  91. std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  92. int32_t rank_id = cfg->rank_id();
  93. // If DEVICE_ID is not set, default value is 0
  94. if (rank_id < 0) {
  95. device_id_ = common::GetEnv("DEVICE_ID");
  96. // If DEVICE_ID is not set, default value is 0
  97. if (device_id_.empty()) {
  98. device_id_ = "0";
  99. }
  100. } else {
  101. device_id_ = std::to_string(rank_id);
  102. }
  103. #else
  104. device_id_ = common::GetEnv("RANK_ID");
  105. // If RANK_ID is not set, default value is 0
  106. if (device_id_.empty()) {
  107. device_id_ = "0";
  108. }
  109. #endif
  110. // Register all profiling node.
  111. // device_queue node is used for graph mode
  112. std::shared_ptr<Tracing> device_queue_tracing = std::make_shared<DeviceQueueTracing>();
  113. RETURN_IF_NOT_OK(RegisterTracingNode(device_queue_tracing));
  114. // dataset_iterator node is used for graph mode
  115. std::shared_ptr<Tracing> dataset_iterator_tracing = std::make_shared<DatasetIteratorTracing>();
  116. RETURN_IF_NOT_OK(RegisterTracingNode(dataset_iterator_tracing));
  117. std::shared_ptr<Sampling> connector_size_sampling = std::make_shared<ConnectorSize>(tree_);
  118. RETURN_IF_NOT_OK(RegisterSamplingNode(connector_size_sampling));
  119. std::shared_ptr<Sampling> connector_thr_sampling = std::make_shared<ConnectorThroughput>(tree_);
  120. RETURN_IF_NOT_OK(RegisterSamplingNode(connector_thr_sampling));
  121. #ifndef ENABLE_ANDROID
  122. std::shared_ptr<Sampling> cpu_sampling = std::make_shared<CpuSampling>(tree_);
  123. RETURN_IF_NOT_OK(RegisterSamplingNode(cpu_sampling));
  124. #endif
  125. return Status::OK();
  126. }
  127. // Launch monitoring thread.
  128. Status ProfilingManager::LaunchMonitor() {
  129. RETURN_IF_NOT_OK(tree_->AllTasks()->CreateAsyncTask("Monitor Thread launched", std::ref(*perf_monitor_)));
  130. return Status::OK();
  131. }
  132. // Profiling node registration
  133. Status ProfilingManager::RegisterTracingNode(std::shared_ptr<Tracing> node) {
  134. // Check if node with the same name has already been registered.
  135. auto exist = tracing_nodes_.find(node->Name());
  136. if (exist != tracing_nodes_.end()) {
  137. return Status(StatusCode::kMDProfilingError, "Profiling node already exist: " + node->Name());
  138. }
  139. // Register the node with its name as key.
  140. RETURN_IF_NOT_OK(node->Init(dir_path_, device_id_));
  141. tracing_nodes_[node->Name()] = node;
  142. return Status::OK();
  143. }
  144. // Profiling node getter
  145. Status ProfilingManager::GetTracingNode(const std::string &name, std::shared_ptr<Tracing> *node) {
  146. // Check if node with the same name has already been registered.
  147. auto exist = tracing_nodes_.find(name);
  148. if (exist == tracing_nodes_.end()) {
  149. return Status(StatusCode::kMDProfilingError, "Profiling node does not exist: " + name);
  150. }
  151. // Fetch node.
  152. *node = tracing_nodes_[name];
  153. return Status::OK();
  154. }
  155. // Profiling node registration
  156. Status ProfilingManager::RegisterSamplingNode(std::shared_ptr<Sampling> node) {
  157. // Check if node with the same name has already been registered.
  158. auto exist = sampling_nodes_.find(node->Name());
  159. if (exist != sampling_nodes_.end()) {
  160. return Status(StatusCode::kMDProfilingError, "Profiling node already exist: " + node->Name());
  161. }
  162. // Register the node with its name as key.
  163. RETURN_IF_NOT_OK(node->Init(dir_path_, device_id_));
  164. sampling_nodes_[node->Name()] = node;
  165. return Status::OK();
  166. }
  167. // Profiling node getter
  168. Status ProfilingManager::GetSamplingNode(const std::string &name, std::shared_ptr<Sampling> *node) {
  169. // Check if node with the same name has already been registered.
  170. auto exist = sampling_nodes_.find(name);
  171. if (exist == sampling_nodes_.end()) {
  172. return Status(StatusCode::kMDProfilingError, "Profiling node does not exist: " + name);
  173. }
  174. // Fetch node.
  175. *node = sampling_nodes_[name];
  176. return Status::OK();
  177. }
  178. Status ProfilingManager::SaveProfilingData() {
  179. if (!IsProfilingEnable()) {
  180. return Status::OK();
  181. }
  182. MS_LOG(INFO) << "Start to save profiling data.";
  183. for (auto node : tracing_nodes_) {
  184. RETURN_IF_NOT_OK(node.second->SaveToFile());
  185. }
  186. for (auto node : sampling_nodes_) {
  187. RETURN_IF_NOT_OK(node.second->SaveToFile());
  188. }
  189. MS_LOG(INFO) << "Save profiling data end.";
  190. return Status::OK();
  191. }
  192. Status ProfilingManager::Analyze() {
  193. if (!IsProfilingEnable()) {
  194. return Status::OK();
  195. }
  196. MS_LOG(INFO) << "Start to analyze profiling data.";
  197. for (auto node : sampling_nodes_) {
  198. RETURN_IF_NOT_OK(node.second->Analyze());
  199. }
  200. return Status::OK();
  201. }
  202. Status ProfilingManager::ChangeFileMode() {
  203. if (!IsProfilingEnable()) {
  204. return Status::OK();
  205. }
  206. MS_LOG(INFO) << "Start to change file mode.";
  207. for (auto node : tracing_nodes_) {
  208. RETURN_IF_NOT_OK(node.second->ChangeFileMode());
  209. }
  210. for (auto node : sampling_nodes_) {
  211. RETURN_IF_NOT_OK(node.second->ChangeFileMode());
  212. }
  213. MS_LOG(INFO) << "Change file mode end.";
  214. return Status::OK();
  215. }
  216. uint64_t ProfilingTime::GetCurMilliSecond() {
  217. // because cpplint does not allow using namespace
  218. using std::chrono::duration_cast;
  219. using std::chrono::milliseconds;
  220. using std::chrono::steady_clock;
  221. return static_cast<uint64_t>(duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count());
  222. }
  223. } // namespace dataset
  224. } // namespace mindspore