You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

profiling_manager.cc 5.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "device/ascend/profiling/profiling_manager.h"
  17. #include <stdlib.h>
  18. #include <vector>
  19. #include <nlohmann/json.hpp>
  20. #include "securec/include/securec.h"
  21. #include "./prof_mgr_core.h"
  22. #include "device/ascend/profiling/plugin_impl.h"
  23. #include "device/ascend/profiling/profiling_engine_impl.h"
  24. #include "utils/log_adapter.h"
  25. #include "utils/context/ms_context.h"
  26. #include "common/utils.h"
  27. #include "utils/convert_utils.h"
  28. using std::vector;
  29. using Json = nlohmann::json;
  30. namespace mindspore {
  31. namespace device {
  32. namespace ascend {
  33. ProfilingManager &ProfilingManager::GetInstance() {
  34. static ProfilingManager inst;
  35. return inst;
  36. }
  37. ProfilingManager::ProfilingManager() : device_id_(0), prof_handle_(nullptr) {
  38. engine_0_ = std::make_shared<ProfilingEngineImpl>();
  39. }
  40. uint64_t ProfilingManager::GetJobId() const {
  41. const char *job_id = std::getenv("JOB_ID");
  42. return ((job_id != nullptr) ? std::strtoul(job_id, nullptr, 10) : 0);
  43. }
  44. bool ProfilingManager::ReportProfilingData(const map<uint32_t, string> &op_taskId_map) const {
  45. if (!IsProfiling()) {
  46. MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
  47. return false;
  48. }
  49. if (op_taskId_map.empty()) {
  50. MS_LOG(WARNING) << "op_taskId_map is empty.";
  51. return false;
  52. }
  53. auto reporter = PluginImpl::GetPluginReporter();
  54. if (reporter == nullptr) {
  55. MS_LOG(ERROR) << "No profiling data report!";
  56. return false;
  57. }
  58. MS_LOG(INFO) << "DistributeTask: op tasId map size = " << op_taskId_map.size();
  59. Msprof::Engine::ReporterData reporter_data = {};
  60. for (const auto &iter : op_taskId_map) {
  61. auto data = iter.second + ' ' + std::to_string(iter.first) + ';';
  62. reporter_data.deviceId = UintToInt(device_id_);
  63. reporter_data.data = (unsigned char *)(const_cast<char *>(data.c_str()));
  64. reporter_data.dataLen = data.size();
  65. auto ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "framework", sizeof("framework"));
  66. if (ret != 0) {
  67. MS_LOG(ERROR) << "memcpy_s error, errorno(" << ret << ")";
  68. return false;
  69. }
  70. ret = reporter->Report(&reporter_data);
  71. if (ret != 0) {
  72. MS_LOG(ERROR) << "reporter data fail, errorno(" << ret << ")";
  73. return false;
  74. }
  75. }
  76. return true;
  77. }
  // Splits str at every occurrence of delim and returns the pieces in their original order.
  //
  // Empty pieces are preserved: an empty input yields a single empty string, and a leading,
  // trailing or doubled delimiter yields an empty string at that position. The result therefore
  // always holds (number of delimiters in str) + 1 elements and is never empty.
  static std::vector<std::string> Split(const std::string &str, const char delim) {
    std::vector<std::string> pieces;
    std::string::size_type piece_begin = 0;
    while (true) {
      const std::string::size_type piece_end = str.find(delim, piece_begin);
      if (piece_end == std::string::npos) {
        // Final piece: whatever follows the last delimiter (possibly nothing).
        pieces.push_back(str.substr(piece_begin));
        return pieces;
      }
      pieces.push_back(str.substr(piece_begin, piece_end - piece_begin));
      piece_begin = piece_end + 1;
    }
  }
  95. bool ProfilingManager::StartupProfiling(uint32_t device_id) {
  96. auto is_profiling = IsProfiling();
  97. if (!is_profiling) {
  98. MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
  99. return true;
  100. }
  101. device_id_ = device_id;
  102. // exp: export PROFILING_MODE=true
  103. // export PROFILING_OPTIONS=training_trace
  104. const char *prof_options_str = std::getenv("PROFILING_OPTIONS");
  105. // register Framework to profiling
  106. int result = Msprof::Engine::RegisterEngine("Framework", engine_0_.get());
  107. if (result != 0) {
  108. MS_LOG(ERROR) << "Register profiling Engine failed.";
  109. return false;
  110. }
  111. if (prof_options_str != nullptr) {
  112. const string prof_options_str_tmp = prof_options_str;
  113. vector<string> opts = Split(prof_options_str_tmp, ':');
  114. if (!opts.empty()) {
  115. // current one docker only use one device`
  116. Json p_device;
  117. // device_id
  118. p_device["deviceID"] = std::to_string(device_id);
  119. // features:'training_trace', 'task_trace' etc
  120. Json features;
  121. for (vector<string>::size_type i = 0; i < opts.size(); i++) {
  122. Json f;
  123. f["name"] = opts[i];
  124. features[i] = f;
  125. }
  126. p_device["features"] = features;
  127. // only one device, but sProfMgrStartUp API require for device list
  128. Json devices;
  129. devices[0] = p_device;
  130. Json startCfg;
  131. startCfg["startCfg"] = devices;
  132. // convert json to string
  133. std::stringstream ss;
  134. ss << startCfg;
  135. std::string cfg = ss.str();
  136. MS_LOG(INFO) << "profiling config " << cfg;
  137. // call profiling startup API
  138. ProfMgrCfg prof_cfg = {cfg};
  139. prof_handle_ = ProfMgrStartUp(&prof_cfg);
  140. if (prof_handle_ == nullptr) {
  141. MS_LOG(ERROR) << "Startup profiling failed.";
  142. return false;
  143. }
  144. }
  145. }
  146. return true;
  147. }
  148. bool ProfilingManager::StopProfiling() const {
  149. MS_LOG(INFO) << "StopProfiling";
  150. if (!IsProfiling()) {
  151. MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
  152. return true;
  153. }
  154. Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter();
  155. if (reporter != nullptr) {
  156. MS_LOG(INFO) << "report data end, ret = " << reporter->Flush();
  157. }
  158. if (prof_handle_ != nullptr) {
  159. int result = ProfMgrStop(prof_handle_);
  160. if (result != 0) {
  161. MS_LOG(ERROR) << "ProfMgr stop return fail:" << result << ".";
  162. return false;
  163. }
  164. }
  165. return true;
  166. }
  167. } // namespace ascend
  168. } // namespace device
  169. } // namespace mindspore