You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

profiling_manager.cc 6.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/ascend/profiling/profiling_manager.h"
  17. #include <stdlib.h>
  18. #include <vector>
  19. #include "securec/include/securec.h"
  20. #include "./prof_mgr_core.h"
  21. #include "device/ascend/profiling/plugin_impl.h"
  22. #include "device/ascend/profiling/profiling_engine_impl.h"
  23. #include "utils/log_adapter.h"
  24. #include "utils/context/ms_context.h"
  25. #include "common/utils.h"
  26. #include "utils/convert_utils.h"
  27. #include "runtime/base.h"
  28. namespace mindspore {
  29. namespace device {
  30. namespace ascend {
  31. ProfilingManager &ProfilingManager::GetInstance() {
  32. static ProfilingManager inst;
  33. return inst;
  34. }
  35. ProfilingManager::ProfilingManager() : device_id_(0), prof_handle_(nullptr) {
  36. engine_0_ = std::make_shared<ProfilingEngineImpl>();
  37. }
  38. uint64_t ProfilingManager::GetJobId() const {
  39. const char *job_id = std::getenv("JOB_ID");
  40. return ((job_id != nullptr) ? std::strtoul(job_id, nullptr, 10) : 0);
  41. }
  42. bool ProfilingManager::ReportProfilingData(const map<uint32_t, string> &op_taskId_map) const {
  43. if (!IsProfiling()) {
  44. MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
  45. return false;
  46. }
  47. if (op_taskId_map.empty()) {
  48. MS_LOG(WARNING) << "op_taskId_map is empty.";
  49. return false;
  50. }
  51. auto reporter = PluginImpl::GetPluginReporter();
  52. if (reporter == nullptr) {
  53. MS_LOG(ERROR) << "No profiling data report!";
  54. return false;
  55. }
  56. MS_LOG(INFO) << "DistributeTask: op tasId map size = " << op_taskId_map.size();
  57. Msprof::Engine::ReporterData reporter_data = {};
  58. for (const auto &iter : op_taskId_map) {
  59. auto data = iter.second + ' ' + std::to_string(iter.first) + ';';
  60. reporter_data.deviceId = UintToInt(device_id_);
  61. reporter_data.data = (unsigned char *)(const_cast<char *>(data.c_str()));
  62. reporter_data.dataLen = data.size();
  63. auto ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "framework", sizeof("framework"));
  64. if (ret != 0) {
  65. MS_LOG(ERROR) << "memcpy_s error, errorno(" << ret << ")";
  66. return false;
  67. }
  68. ret = reporter->Report(&reporter_data);
  69. if (ret != 0) {
  70. MS_LOG(ERROR) << "reporter data fail, errorno(" << ret << ")";
  71. return false;
  72. }
  73. }
  74. return true;
  75. }
  // Splits str on every occurrence of delim and returns the pieces in order.
  // Empty pieces are kept: an empty input yields a single empty string, and adjacent,
  // leading or trailing delimiters each produce an empty element
  // (for example "a:" -> {"a", ""} and ":" -> {"", ""}).
  static std::vector<std::string> Split(const std::string &str, const char delim) {
    std::vector<std::string> pieces;
    std::string::size_type start = 0;
    for (;;) {
      const std::string::size_type hit = str.find(delim, start);
      if (hit == std::string::npos) {
        // Everything after the last delimiter (possibly nothing) is the final piece.
        pieces.emplace_back(str.substr(start));
        break;
      }
      pieces.emplace_back(str.substr(start, hit - start));
      start = hit + 1;
    }
    return pieces;
  }
  93. bool ProfilingManager::StartupProfiling(uint32_t device_id) {
  94. auto is_profiling = IsProfiling();
  95. if (!is_profiling) {
  96. MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
  97. return true;
  98. }
  99. device_id_ = device_id;
  100. // register Framework to profiling
  101. int result = Msprof::Engine::RegisterEngine("Framework", engine_0_.get());
  102. if (result != 0) {
  103. MS_LOG(ERROR) << "Register profiling Engine failed.";
  104. return false;
  105. }
  106. auto context = MsContext::GetInstance();
  107. MS_EXCEPTION_IF_NULL(context);
  108. const string prof_options_str = context->profiling_options();
  109. std::vector<string> opts = Split(prof_options_str, ':');
  110. if (opts.empty()) {
  111. MS_LOG(WARNING) << "Profiling is enabled, but profiling option is not set!";
  112. return true;
  113. }
  114. // current one docker only use one device`
  115. nlohmann::json p_device;
  116. // JOBID
  117. auto job_id = GetJobId();
  118. p_device["jobID"] = std::to_string(job_id);
  119. // device_id
  120. p_device["deviceID"] = std::to_string(device_id);
  121. // features:'training_trace', 'task_trace' etc
  122. nlohmann::json features;
  123. for (std::vector<string>::size_type i = 0; i < opts.size(); i++) {
  124. nlohmann::json f;
  125. f["name"] = opts[i];
  126. features[i] = f;
  127. }
  128. p_device["features"] = features;
  129. // only one device, but sProfMgrStartUp API require for device list
  130. nlohmann::json devices;
  131. devices[0] = p_device;
  132. nlohmann::json startCfg;
  133. startCfg["startCfg"] = devices;
  134. if (!ProfStartUp(NOT_NULL(&startCfg))) {
  135. MS_LOG(ERROR) << "ProfMgrStartUp failed.";
  136. return false;
  137. }
  138. return true;
  139. }
  140. bool ProfilingManager::ProfStartUp(NotNull<nlohmann::json *> startCfg) {
  141. // convert json to string
  142. std::stringstream ss;
  143. ss << *startCfg;
  144. std::string cfg = ss.str();
  145. MS_LOG(INFO) << "profiling config " << cfg;
  146. auto ret = rtProfilerStart();
  147. if (ret != RT_ERROR_NONE) {
  148. MS_LOG(INFO) << "Call rtProfilerStart failed, ret:" << ret;
  149. return false;
  150. }
  151. // call profiling startup API
  152. ProfMgrCfg prof_cfg = {cfg};
  153. prof_handle_ = ProfMgrStartUp(&prof_cfg);
  154. if (prof_handle_ == nullptr) {
  155. MS_LOG(ERROR) << "Startup profiling failed.";
  156. return false;
  157. }
  158. return true;
  159. }
  160. bool ProfilingManager::StopProfiling() {
  161. MS_LOG(INFO) << "StopProfiling";
  162. if (!IsProfiling()) {
  163. MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
  164. return true;
  165. }
  166. Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter();
  167. if (reporter != nullptr) {
  168. MS_LOG(INFO) << "report data end, ret = " << reporter->Flush();
  169. }
  170. auto rt_ret = rtProfilerStop();
  171. if (rt_ret != RT_ERROR_NONE) {
  172. MS_LOG(ERROR) << "Call rtProfilerStop failed";
  173. return false;
  174. }
  175. if (prof_handle_ != nullptr) {
  176. int result = ProfMgrStop(prof_handle_);
  177. if (result != 0) {
  178. MS_LOG(ERROR) << "ProfMgr stop return fail:" << result << ".";
  179. prof_handle_ = nullptr;
  180. return false;
  181. }
  182. prof_handle_ = nullptr;
  183. }
  184. return true;
  185. }
  186. } // namespace ascend
  187. } // namespace device
  188. } // namespace mindspore