You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

gpu_profiling.cc 32 kB

5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "profiler/device/gpu/gpu_profiling.h"
  17. #include <cxxabi.h>
  18. #include <chrono>
  19. #include <cmath>
  20. #include <ctime>
  21. #include "profiler/device/gpu/cupti_interface.h"
  22. #include "profiler/device/gpu/gpu_data_saver.h"
  23. #include "pybind_api/api_register.h"
  24. #include "utils/log_adapter.h"
  25. #include "utils/utils.h"
  26. #include "utils/profile.h"
  27. #include "utils/ms_context.h"
  28. namespace mindspore {
  29. namespace profiler {
  30. namespace gpu {
  // Size in bytes of every activity buffer allocated for CUPTI.
  constexpr size_t BUF_SIZE = 32 * 1024;
  // Alignment in bytes requested for every activity buffer.
  constexpr size_t ALIGN_SIZE = 8;
  // Logs an ERROR when a CUPTI call does not return CUPTI_SUCCESS.
  // `expression` is evaluated exactly once, so a failing call is never re-issued just to
  // fetch its error string. The do/while(0) wrapper makes the macro a single statement
  // (safe inside an un-braced if/else); callers terminate it with ';'.
  #define CHECK_CUPTI_RET_WITH_ERROR(expression, message)                                        \
    do {                                                                                         \
      const CUptiResult cupti_ret_code_ = (expression);                                          \
      if (cupti_ret_code_ != CUPTI_SUCCESS) {                                                    \
        const char *errstr = nullptr;                                                            \
        CuptiGetResultString(cupti_ret_code_, &errstr);                                          \
        MS_LOG(ERROR) << "CUPTI Error:" << (errstr != nullptr ? errstr : "unknown")              \
                      << " function:" << (message)                                               \
                      << ". You may not have access to the NVIDIA GPU performance counters on "  \
                      << "the target device. Please use the root account to run profiling or "   \
                      << "configure permissions. If there is still the problem, please refer to the" \
                      << " GPU performance tuning document on the official website of mindinsight."; \
      }                                                                                          \
    } while (0)
  // Same check as above, but reports the failure through MS_LOG(EXCEPTION).
  #define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message)                                       \
    do {                                                                                         \
      const CUptiResult cupti_ret_code_ = (expression);                                          \
      if (cupti_ret_code_ != CUPTI_SUCCESS) {                                                    \
        const char *errstr = nullptr;                                                            \
        CuptiGetResultString(cupti_ret_code_, &errstr);                                          \
        MS_LOG(EXCEPTION) << "CUPTI Error:" << (errstr != nullptr ? errstr : "unknown")          \
                          << " function:" << (message);                                          \
      }                                                                                          \
    } while (0)
  // Evaluates a CUDA runtime call once and logs an ERROR (without aborting) when the
  // result is not cudaSuccess.
  #define CHECK_CUDA_RET_WITH_ERROR(expression, message)                                       \
    do {                                                                                       \
      const cudaError_t status = (expression);                                                 \
      if (status == cudaSuccess) {                                                             \
        break;                                                                                 \
      }                                                                                        \
      MS_LOG(ERROR) << "CUDA Error: " << (message) << " | Error Number: " << status << " "     \
                    << cudaGetErrorString(status);                                             \
    } while (0)
  // Logs an ERROR naming the pointer expression and returns from the enclosing
  // void function when `ptr` is null.
  #define PROFILER_ERROR_IF_NULLPTR(ptr)                             \
    do {                                                             \
      if ((ptr) != nullptr) {                                        \
        break;                                                       \
      }                                                              \
      MS_LOG(ERROR) << ": The pointer[" << #ptr << "] is null.";     \
      return;                                                        \
    } while (0)
  64. std::shared_ptr<GPUProfiler> GPUProfiler::profiler_inst_ = std::make_shared<GPUProfiler>();
  // Returns the calling thread's pthread handle, narrowed to 32 bits.
  int32_t GetThreadID() {
    const auto self = pthread_self();
    return static_cast<uint32_t>(self);
  }
  69. uint32_t GetStreamID(const CUcontext context, const void *stream) {
  70. uint32_t stream_id = 0;
  71. if (stream != nullptr) {
  72. CHECK_CUPTI_RET_WITH_ERROR(CuptiGetStreamId(context, (CUstream)stream, &stream_id), "CuptiGetStreamId");
  73. if (CuptiGetStreamId(context, (CUstream)stream, &stream_id) != CUPTI_SUCCESS) {
  74. MS_LOG(ERROR) << "Training process unexpectedly stopped, profiling data cannot be write to file"
  75. << "To obtain the profiling data, do not interrupt the training process.";
  76. }
  77. }
  78. return stream_id;
  79. }
  80. uint64_t GetCUPTITimeStamp() {
  81. uint64_t time_stamp = 0l;
  82. CHECK_CUPTI_RET_WITH_ERROR(CuptiGetTimestamp(&time_stamp), "CuptiGetTimestamp");
  83. return time_stamp;
  84. }
  // Returns the host system_clock time since its epoch, in nanoseconds.
  uint64_t GetHostTimeStamp() {
    using std::chrono::nanoseconds;
    const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
    return std::chrono::duration_cast<nanoseconds>(since_epoch).count();
  }
  // Demangles a C++ symbol name.
  // Returns the demangled form when abi::__cxa_demangle succeeds, otherwise `name` unchanged.
  // A null `name` yields an empty string.
  std::string GetKernelFunc(const char *name) {
    if (name == nullptr) {
      return std::string();
    }
    // __cxa_demangle returns a malloc'd buffer that the caller owns.
    char *demangledName = abi::__cxa_demangle(name, nullptr, nullptr, nullptr);
    if (demangledName == nullptr) {
      return name;
    }
    std::string result(demangledName);
    free(demangledName);
    return result;
  }
  99. bool IsMemcpyAsyncEvent(CUpti_CallbackId cb_id) {
  100. switch (cb_id) {
  101. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
  102. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
  103. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
  104. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
  105. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
  106. case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
  107. case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
  108. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
  109. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
  110. return true;
  111. default:
  112. return false;
  113. }
  114. return false;
  115. }
  116. bool IsMemcpySyncEvent(CUpti_CallbackId cb_id) {
  117. switch (cb_id) {
  118. case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
  119. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
  120. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
  121. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
  122. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
  123. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
  124. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
  125. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
  126. case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
  127. case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
  128. case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
  129. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
  130. case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
  131. return true;
  132. default:
  133. return false;
  134. }
  135. return false;
  136. }
  137. void CUPTIApiExit(const std::shared_ptr<GPUProfiler> &gpu_profiler_inst, CUpti_CallbackId cb_id,
  138. const CUpti_CallbackData *cb_data) {
  139. uint64_t start_timestamp = *cb_data->correlationData;
  140. uint64_t end_timestamp = GetCUPTITimeStamp();
  141. switch (cb_id) {
  142. case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
  143. case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
  144. case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
  145. gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp);
  146. break;
  147. case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc:
  148. case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
  149. gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp);
  150. break;
  151. case CUPTI_DRIVER_TRACE_CBID_cuEventCreate:
  152. case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2:
  153. case CUPTI_DRIVER_TRACE_CBID_cuEventRecord:
  154. case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize:
  155. case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime:
  156. // In some cases, the callback of cuctxsetcurrent is only exist
  157. // without entry, so this callback is ignored
  158. case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent:
  159. break;
  160. default:
  161. gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp);
  162. break;
  163. }
  164. if (IsMemcpyAsyncEvent(cb_id) || IsMemcpySyncEvent(cb_id)) {
  165. gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp);
  166. }
  167. }
  168. void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id,
  169. const CUpti_CallbackData *cb_data) {
  170. if (domain != CUPTI_CB_DOMAIN_DRIVER_API) {
  171. return;
  172. }
  173. auto gpu_profiler_inst = GPUProfiler::GetInstance();
  174. PROFILER_ERROR_IF_NULLPTR(gpu_profiler_inst);
  175. if (!gpu_profiler_inst->GetEnableFlag()) {
  176. return;
  177. }
  178. PROFILER_ERROR_IF_NULLPTR(cb_data);
  179. if (cb_data->context == nullptr) {
  180. MS_LOG(DEBUG) << "Callback data context is null , correlation Id:" << cb_data->correlationId
  181. << " callback id:" << cb_id;
  182. return;
  183. }
  184. if (cb_data->callbackSite == CUPTI_API_ENTER) {
  185. *cb_data->correlationData = GetCUPTITimeStamp();
  186. } else if (cb_data->callbackSite == CUPTI_API_EXIT) {
  187. CUPTIApiExit(gpu_profiler_inst, cb_id, cb_data);
  188. }
  189. }
  // Strips a leading "void " return type from a demangled kernel name.
  // Only a prefix is removed: a "void " occurring later in the name (for example inside a
  // function-pointer parameter type) is left alone, so the name is never truncated.
  std::string GetKernelFuncName(std::string kernel_name) {
    const std::string return_type_prefix("void ");
    if (kernel_name.compare(0, return_type_prefix.length(), return_type_prefix) == 0) {
      kernel_name.erase(0, return_type_prefix.length());
    }
    return kernel_name;
  }
  201. std::shared_ptr<GPUProfiler> &GPUProfiler::GetInstance() {
  202. MS_EXCEPTION_IF_NULL(profiler_inst_);
  203. return profiler_inst_;
  204. }
  205. void GPUProfiler::SyncEnable(const bool enable_flag) {
  206. MS_LOG(INFO) << "GPU Profiler synchronous enable flag:" << enable_flag;
  207. sync_enable_flag_ = enable_flag;
  208. }
  209. void GPUProfiler::StepProfilingEnable(const bool enable_flag) {
  210. MS_LOG(INFO) << "GPU Profiler enable flag:" << enable_flag;
  211. CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
  212. enable_flag_ = enable_flag;
  213. }
  214. void GPUProfiler::FixOpNameByCorrelationId(Event *event) {
  215. PROFILER_ERROR_IF_NULLPTR(event);
  216. if (event->api_type != CUPTIApiType::kActivity) {
  217. return;
  218. }
  219. auto iter = op_name_map_.find(event->correlation_id);
  220. if (iter != op_name_map_.end()) {
  221. event->op_name = std::move(iter->second);
  222. }
  223. }
  224. void GPUProfiler::AddEvent(Event &&event) {
  225. // protect callback concurrency for driver api and activity
  226. std::unique_lock<std::mutex> lock(event_mutex_);
  227. switch (event.api_type) {
  228. case CUPTIApiType::kCallback: {
  229. if (cupti_callback_events_count_ < max_cupti_callback_events_) {
  230. events_.emplace_back(std::move(event));
  231. cupti_callback_events_count_++;
  232. } else {
  233. cupti_callback_events_drop_count_++;
  234. }
  235. break;
  236. }
  237. case CUPTIApiType::kActivity: {
  238. if (cupti_activity_events_count_ < max_cupti_activity_events_) {
  239. events_.emplace_back(std::move(event));
  240. cupti_activity_events_count_++;
  241. } else {
  242. cupti_activity_events_drop_count_++;
  243. }
  244. break;
  245. }
  246. default:
  247. break;
  248. }
  249. }
  250. void GPUProfiler::EventLog(const Event &event) {
  251. MS_LOG(DEBUG) << "GPUProfiler"
  252. << ",\"kernel_name:" << event.kernel_name << "\",kernel_type:" << event.kernel_type
  253. << ",api_type:" << static_cast<int>(event.api_type) << ",start_time_stamp:" << event.start_time_stamp
  254. << ",end_time_stamp:" << event.end_time_stamp << ",cost:,"
  255. << (event.end_time_stamp - event.start_time_stamp) / kTimeUnit << ",op_name:" << event.op_name
  256. << ",device_id:" << event.device_id << ",correlation_id:" << event.correlation_id
  257. << ",thread_id:" << event.thread_id << ",context_id:" << event.context_id
  258. << ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id;
  259. }
  260. void GPUProfiler::ProcessEvents() {
  261. for (Event &event : events_) {
  262. if (event.op_name.empty()) {
  263. FixOpNameByCorrelationId(&event);
  264. }
  265. EventLog(event);
  266. if (event.op_name.empty() || event.cb_id == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) {
  267. continue;
  268. }
  269. auto iter = op_info_map_.find(event.op_name);
  270. if (iter != op_info_map_.end()) {
  271. switch (event.api_type) {
  272. case CUPTIApiType::kCallback: {
  273. iter->second.op_kernel_api_count += 1;
  274. // The time unit from ns to us
  275. iter->second.cupti_api_call_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
  276. break;
  277. }
  278. case CUPTIApiType::kActivity: {
  279. iter->second.op_kernel_count += 1;
  280. // The time unit from ns to us
  281. iter->second.cupti_activity_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
  282. break;
  283. }
  284. default:
  285. break;
  286. }
  287. }
  288. }
  289. }
  290. void GPUProfiler::OpsParser() {
  291. MS_LOG(INFO) << "Count the number of events size:" << events_.size()
  292. << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_;
  293. if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) {
  294. MS_LOG(WARNING)
  295. << "The total number of events exceeded the profiler's processing capacity, some events were discarded."
  296. << " activity api events:" << cupti_activity_events_drop_count_
  297. << " callback api events:" << cupti_callback_events_drop_count_;
  298. }
  299. if (events_.size() == 0) {
  300. return;
  301. }
  302. ProcessEvents();
  303. MS_LOG(DEBUG) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|"
  304. ",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|"
  305. ",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time"
  306. << std::endl;
  307. std::vector<std::pair<std::string, OpInfo>> order_vec(op_info_map_.begin(), op_info_map_.end());
  308. auto cmp_func = [](const std::pair<std::string, OpInfo> &a, const std::pair<std::string, OpInfo> &b) {
  309. return a.second.cupti_activity_time > b.second.cupti_activity_time;
  310. };
  311. std::sort(order_vec.begin(), order_vec.end(), cmp_func);
  312. for (auto iter = order_vec.begin(); iter != order_vec.end(); iter++) {
  313. if (iter->second.op_count == 0) {
  314. MS_LOG(ERROR) << "The num of operations can not be 0.";
  315. return;
  316. }
  317. MS_LOG(DEBUG) << "GPU_profiler"
  318. << "," << iter->first << "," << iter->second.op_count << "," << iter->second.op_kernel_count << ","
  319. << iter->second.op_kernel_api_count << ","
  320. << "|," << iter->second.cupti_activity_time << "," << iter->second.cupti_api_call_time << ","
  321. << iter->second.op_host_cost_time << ","
  322. << "|," << round(iter->second.cupti_activity_time / iter->second.op_count) << ","
  323. << round(iter->second.cupti_api_call_time / iter->second.op_count) << ","
  324. << round(iter->second.op_host_cost_time / iter->second.op_count) << std::endl;
  325. }
  326. }
  327. void GPUProfiler::EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata,
  328. const std::string &typestring, uint64_t startTimestamp, uint64_t endTimestamp) {
  329. Event event;
  330. uint32_t device_id = -1;
  331. CuptiGetDeviceId(cbdata->context, &device_id);
  332. event.kernel_name = cbdata->symbolName ? GetKernelFunc(cbdata->symbolName) : cbdata->functionName;
  333. event.kernel_name = GetKernelFuncName(event.kernel_name);
  334. event.kernel_type = typestring;
  335. event.api_type = CUPTIApiType::kCallback;
  336. event.start_time_stamp = startTimestamp;
  337. event.end_time_stamp = endTimestamp;
  338. event.op_name = op_name_;
  339. event.device_id = device_id;
  340. event.correlation_id = cbdata->correlationId;
  341. event.thread_id = GetThreadID();
  342. event.context_id = cbdata->contextUid;
  343. event.stream_id = GetStreamID(cbdata->context, stream_);
  344. event.cb_id = cbid;
  345. op_name_map_[event.correlation_id] = event.op_name;
  346. AddEvent(std::move(event));
  347. }
  348. void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
  349. void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
  350. void GPUProfiler::Init(const std::string &profileDataPath = "") {
  351. MS_LOG(INFO) << "Initialize GPU Profiling";
  352. if (subscriber_ != nullptr) {
  353. StopCUPTI();
  354. MS_LOG(EXCEPTION)
  355. << "Repeated initialization, Please check whether you have created the Profiler object multiple times";
  356. }
  357. CHECK_CUPTI_RET_WITH_EXCEPT(CuptiSubscribe(&subscriber_, (CUpti_CallbackFunc)CUPTICallBackFunc, this),
  358. "CuptiSubscribe");
  359. CHECK_CUPTI_RET_WITH_EXCEPT(CuptiEnableDomain(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API), "CuptiEnableDomain");
  360. activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY);
  361. activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
  362. activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_KERNEL);
  363. for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
  364. ++it) {
  365. CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityEnable(*it), "CuptiActivityEnable");
  366. }
  367. CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityRegisterCallbacks(ActivityAllocBuffer, ActivityProcessBuffer),
  368. "CuptiActivityRegisterCallbacks");
  369. base_time_.gpu_start_time = GetCUPTITimeStamp();
  370. base_time_.host_start_time = GetHostTimeStamp();
  371. base_time_.host_start_monotonic_raw_time = GetHostMonoTimeStamp();
  372. profile_data_path_ = profileDataPath;
  373. MS_LOG(INFO) << "GPU start time(ns):" << base_time_.gpu_start_time
  374. << " Host start time(ns):" << base_time_.host_start_time << " profile data path: " << profile_data_path_;
  375. is_init_ = true;
  376. }
  377. void GPUProfiler::SetRunTimeData(const std::string &op_name, void *stream) {
  378. auto iter = op_info_map_.find(op_name);
  379. if (iter != op_info_map_.end()) {
  380. iter->second.op_count += 1;
  381. } else {
  382. OpInfo op_info;
  383. op_info.op_name = op_name;
  384. op_info.stream = stream;
  385. op_info.op_count = 1;
  386. op_info_map_[op_name] = op_info;
  387. }
  388. op_name_ = op_name;
  389. stream_ = stream;
  390. }
  391. void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
  392. if (sync_enable_flag_) {
  393. CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_start_), "cudaEventCreate op event start failed");
  394. CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_stop_), "cudaEventCreate op event stop failed");
  395. CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_start_, (CUstream)stream_),
  396. "cudaEventRecord op event start failed");
  397. op_host_time_start_ = GetHostTimeStamp();
  398. op_cupti_time_start_ = GetCUPTITimeStamp();
  399. } else {
  400. op_host_time_start_ = GetHostTimeStamp();
  401. op_cupti_time_start_ = GetCUPTITimeStamp();
  402. }
  403. SetRunTimeData(op_name, stream);
  404. if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
  405. RecordOneStepStartEndInfo(op_name);
  406. }
  407. }
  408. void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) {
  409. auto launch_end_time = GetTime();
  410. double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit;
  411. SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time));
  412. }
  413. void GPUProfiler::OpDataProducerEnd() {
  414. float op_time_elapsed = 0;
  415. if (sync_enable_flag_) {
  416. CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_stop_, (CUstream)stream_),
  417. "cudaEventRecord op event stop failed");
  418. CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_start_), "cudaEventSynchronize op event start failed");
  419. CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_stop_), "cudaEventSynchronize op event stop failed");
  420. CHECK_CUDA_RET_WITH_ERROR(cudaEventElapsedTime(&op_time_elapsed, op_event_start_, op_event_stop_),
  421. "cudaEventElapsedTime failed");
  422. CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_start_), "cudaEventDestroy op event start failed");
  423. CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
  424. op_time_elapsed = op_time_elapsed * kTimeUnit;
  425. op_host_time_stop_ = GetHostTimeStamp();
  426. SingleOpLaunchTimeProcess(op_time_elapsed);
  427. } else {
  428. op_host_time_stop_ = GetHostTimeStamp();
  429. op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
  430. SingleOpLaunchTimeProcess(op_time_elapsed);
  431. }
  432. MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
  433. Profiler::SetRunTimeData(op_name_, op_time_elapsed);
  434. Profiler::SetRunTimeData(op_name_, op_cupti_time_start_, op_time_elapsed);
  435. }
  436. void GPUProfiler::StopCUPTI() {
  437. if (subscriber_ != nullptr) {
  438. CHECK_CUPTI_RET_WITH_ERROR(CuptiUnsubscribe(subscriber_), "CuptiUnsubscribe");
  439. CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
  440. for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
  441. ++it) {
  442. CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityDisable(*it), "CuptiActivityDisable");
  443. }
  444. subscriber_ = nullptr;
  445. }
  446. }
  447. void GPUProfiler::Stop() {
  448. MS_LOG(INFO) << "Stop GPU Profiling";
  449. StopCUPTI();
  450. OpsParser();
  451. SaveProfileData();
  452. ClearInst();
  453. }
  454. void GPUProfiler::SaveExtraProfileData() {
  455. for (auto op : profiling_op_) {
  456. op.second->SaveProfilingData();
  457. }
  458. MS_LOG(INFO) << "Save extra profiling data end.";
  459. }
  460. void GPUProfiler::SaveProfileData() {
  461. if (profile_data_path_.empty()) {
  462. MS_LOG(WARNING) << "Profile data path is empty, skip save profile data.";
  463. } else {
  464. GpuDataSaver dataSaver(step_trace_op_name_, all_step_start_end_info_);
  465. dataSaver.ParseOpInfo(op_info_map_);
  466. dataSaver.ParseEvent(events_);
  467. dataSaver.WriteFile(profile_data_path_, base_time_);
  468. SaveExtraProfileData();
  469. }
  470. }
  471. void GPUProfiler::ClearInst() {
  472. op_info_map_.clear();
  473. op_name_map_.clear();
  474. events_.clear();
  475. activities_enable_.clear();
  476. enable_flag_ = false;
  477. sync_enable_flag_ = true;
  478. cupti_callback_events_count_ = 0l;
  479. cupti_callback_events_drop_count_ = 0l;
  480. cupti_activity_events_count_ = 0l;
  481. cupti_activity_events_drop_count_ = 0l;
  482. }
  483. void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
  484. auto gpu_profiler_inst = GPUProfiler::GetInstance();
  485. if (gpu_profiler_inst == nullptr) {
  486. MS_LOG(ERROR) << "GPU profiler instance is nullptr";
  487. return;
  488. }
  489. gpu_profiler_inst->AllocBuffer(buffer, size, maxNumRecords);
  490. }
  491. void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
  492. PROFILER_ERROR_IF_NULLPTR(buffer);
  493. auto gpu_profiler_inst = GPUProfiler::GetInstance();
  494. if (gpu_profiler_inst == nullptr) {
  495. MS_LOG(ERROR) << "GPU profiler instance is nullptr";
  496. return;
  497. }
  498. gpu_profiler_inst->ProcessBuffer(ctx, streamId, buffer, size, validSize);
  499. }
  500. void ProcessActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record,
  501. CUpti_ActivityMemcpy *cupti_activity_memcpy) {
  502. switch (cupti_activity_memcpy->copyKind) {
  503. case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
  504. profilingData->activity_type = ActivityType::kMemcpyH2D;
  505. profilingData->kernel_name = "MemcpyH2D";
  506. break;
  507. case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
  508. profilingData->activity_type = ActivityType::kMemcpyD2H;
  509. profilingData->kernel_name = "MemcpyD2H";
  510. break;
  511. case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
  512. profilingData->activity_type = ActivityType::kMemcpyH2A;
  513. profilingData->kernel_name = "MemcpyH2A";
  514. break;
  515. case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
  516. profilingData->activity_type = ActivityType::kMemcpyA2H;
  517. profilingData->kernel_name = "MemcpyA2H";
  518. break;
  519. case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
  520. profilingData->activity_type = ActivityType::kMemcpyA2D;
  521. profilingData->kernel_name = "MemcpyA2D";
  522. break;
  523. case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
  524. profilingData->activity_type = ActivityType::kMemcpyD2A;
  525. profilingData->kernel_name = "MemcpyD2A";
  526. break;
  527. case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
  528. profilingData->activity_type = ActivityType::kMemcpyD2D;
  529. profilingData->kernel_name = "MemcpyD2D";
  530. break;
  531. case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
  532. profilingData->activity_type = ActivityType::kMemcpyH2H;
  533. profilingData->kernel_name = "MemcpyH2H";
  534. break;
  535. case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
  536. profilingData->activity_type = ActivityType::kMemcpyP2P;
  537. profilingData->kernel_name = "MemcpyP2P";
  538. break;
  539. default:
  540. profilingData->activity_type = ActivityType::kMemcpyUnknown;
  541. profilingData->kernel_name = "MemcpyUnknown";
  542. break;
  543. }
  544. }
  545. void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) {
  546. CUpti_ActivityMemcpy *cupti_activity_memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record);
  547. ProcessActivityMemcpyRecord(profilingData, record, cupti_activity_memcpy);
  548. profilingData->kernel_type = "cuMemcpy";
  549. profilingData->api_type = CUPTIApiType::kActivity;
  550. profilingData->start_time_stamp = cupti_activity_memcpy->start;
  551. profilingData->end_time_stamp = cupti_activity_memcpy->end;
  552. profilingData->device_id = cupti_activity_memcpy->deviceId;
  553. profilingData->context_id = cupti_activity_memcpy->contextId;
  554. profilingData->stream_id = cupti_activity_memcpy->streamId;
  555. profilingData->correlation_id = cupti_activity_memcpy->correlationId;
  556. profilingData->memcpy_info.bytes = cupti_activity_memcpy->bytes;
  557. profilingData->memcpy_info.src_kind = cupti_activity_memcpy->srcKind;
  558. profilingData->memcpy_info.dst_kind = cupti_activity_memcpy->dstKind;
  559. }
  560. void HandleActivityMemcpy2Record(Event *profilingData, CUpti_Activity *record) {
  561. CUpti_ActivityMemcpy2 *memcpyP2P = reinterpret_cast<CUpti_ActivityMemcpy2 *>(record);
  562. profilingData->activity_type = ActivityType::kMemcpyP2P;
  563. profilingData->kernel_name = "MemcpyP2P";
  564. profilingData->kernel_type = "cuMemcpy";
  565. profilingData->api_type = CUPTIApiType::kActivity;
  566. profilingData->start_time_stamp = memcpyP2P->start;
  567. profilingData->end_time_stamp = memcpyP2P->end;
  568. profilingData->device_id = memcpyP2P->deviceId;
  569. profilingData->context_id = memcpyP2P->contextId;
  570. profilingData->stream_id = memcpyP2P->streamId;
  571. profilingData->correlation_id = memcpyP2P->correlationId;
  572. profilingData->memcpy_info.bytes = memcpyP2P->bytes;
  573. profilingData->memcpy_info.src_kind = memcpyP2P->srcKind;
  574. profilingData->memcpy_info.dst_kind = memcpyP2P->dstKind;
  575. }
  576. void HandleActivityMemsetRecord(Event *profilingData, CUpti_Activity *record) {
  577. CUpti_ActivityMemset *cupti_activity_memset = reinterpret_cast<CUpti_ActivityMemset *>(record);
  578. profilingData->activity_type = ActivityType::kMemset;
  579. profilingData->kernel_name = "MemorySet";
  580. profilingData->api_type = CUPTIApiType::kActivity;
  581. profilingData->start_time_stamp = cupti_activity_memset->start;
  582. profilingData->end_time_stamp = cupti_activity_memset->end;
  583. profilingData->device_id = cupti_activity_memset->deviceId;
  584. profilingData->context_id = cupti_activity_memset->contextId;
  585. profilingData->stream_id = cupti_activity_memset->streamId;
  586. profilingData->correlation_id = cupti_activity_memset->correlationId;
  587. profilingData->memcpy_info.bytes = cupti_activity_memset->bytes;
  588. }
  589. void HandleActivityKernelRecord(Event *profilingData, CUpti_Activity *record) {
  590. CUpti_ActivityKernel4 *kernel = reinterpret_cast<CUpti_ActivityKernel4 *>(record);
  591. profilingData->activity_type = ActivityType::kKernel;
  592. profilingData->api_type = CUPTIApiType::kActivity;
  593. profilingData->kernel_name = GetKernelFunc(kernel->name);
  594. profilingData->kernel_name = GetKernelFuncName(profilingData->kernel_name);
  595. profilingData->kernel_type = "cuLaunchKernel";
  596. profilingData->start_time_stamp = kernel->start;
  597. profilingData->end_time_stamp = kernel->end;
  598. profilingData->device_id = kernel->deviceId;
  599. profilingData->context_id = kernel->contextId;
  600. profilingData->stream_id = kernel->streamId;
  601. profilingData->correlation_id = kernel->correlationId;
  602. profilingData->kernel_info.registers_per_thread = kernel->registersPerThread;
  603. profilingData->kernel_info.static_shared_memory = kernel->staticSharedMemory;
  604. profilingData->kernel_info.dynamic_shared_memory = kernel->dynamicSharedMemory;
  605. profilingData->kernel_info.block_x = kernel->blockX;
  606. profilingData->kernel_info.block_y = kernel->blockY;
  607. profilingData->kernel_info.block_z = kernel->blockZ;
  608. profilingData->kernel_info.grid_x = kernel->gridX;
  609. profilingData->kernel_info.grid_y = kernel->gridY;
  610. profilingData->kernel_info.grid_z = kernel->gridZ;
  611. }
  612. void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) {
  613. PROFILER_ERROR_IF_NULLPTR(record);
  614. Event profilingData;
  615. profilingData.cb_id = 0;
  616. switch (record->kind) {
  617. case CUPTI_ACTIVITY_KIND_MEMCPY: {
  618. HandleActivityMemcpyRecord(&profilingData, record);
  619. break;
  620. }
  621. case CUPTI_ACTIVITY_KIND_MEMCPY2: {
  622. HandleActivityMemcpy2Record(&profilingData, record);
  623. break;
  624. }
  625. case CUPTI_ACTIVITY_KIND_MEMSET: {
  626. HandleActivityMemsetRecord(&profilingData, record);
  627. break;
  628. }
  629. case CUPTI_ACTIVITY_KIND_KERNEL:
  630. case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
  631. HandleActivityKernelRecord(&profilingData, record);
  632. break;
  633. }
  634. default:
  635. MS_LOG(WARNING) << "Unknown activity type!";
  636. return;
  637. }
  638. AddEvent(std::move(profilingData));
  639. }
  640. void GPUProfiler::SetStepTraceOpName(ProfilingTraceInfo trace_op_name) { step_trace_op_name_ = trace_op_name; }
  641. void GPUProfiler::RegisterProfilingOp(std::shared_ptr<ProfilingOp> node) {
  642. PROFILER_ERROR_IF_NULLPTR(node);
  643. if (profiling_op_.find(node->Name()) != profiling_op_.end()) {
  644. return;
  645. }
  646. node->Init();
  647. profiling_op_[node->Name()] = node;
  648. }
  649. void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
  650. PROFILER_ERROR_IF_NULLPTR(size);
  651. PROFILER_ERROR_IF_NULLPTR(maxNumRecords);
  652. int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
  653. if (stat) {
  654. MS_LOG(ERROR) << "Out of memory, activity buffer alloc failed.";
  655. return;
  656. }
  657. MS_LOG(DEBUG) << "Alloc activity buffer, buffer size: " << BUF_SIZE;
  658. *size = BUF_SIZE;
  659. *maxNumRecords = 0;
  660. }
  661. void CUPTIAPI GPUProfiler::ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size,
  662. size_t validSize) {
  663. if (!enable_flag_) {
  664. MS_LOG(DEBUG) << "Profiler is not enable, skip to process activity record.";
  665. free(buffer);
  666. return;
  667. }
  668. CUptiResult status;
  669. CUpti_Activity *record = NULL;
  670. MS_LOG(DEBUG) << "Process activity buffer, valid size:" << validSize << ",Stream ID:" << streamId;
  671. if (validSize > 0) {
  672. do {
  673. status = CuptiActivityGetNextRecord(buffer, validSize, &record);
  674. if (status == CUPTI_SUCCESS) {
  675. HandleActivityRecord(record);
  676. } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
  677. break;
  678. } else {
  679. CHECK_CUPTI_RET_WITH_ERROR(status, "CuptiActivityGetNextRecord");
  680. }
  681. } while (1);
  682. // report any records dropped from the queue
  683. size_t dropped;
  684. CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped),
  685. "CuptiActivityGetNumDroppedRecords");
  686. if (dropped != 0) {
  687. MS_LOG(INFO) << "Dropped " << (unsigned int)dropped << " activity records\n";
  688. }
  689. }
  690. free(buffer);
  691. }
  692. REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) {
  693. (void)py::class_<GPUProfiler, std::shared_ptr<GPUProfiler>>(*m, "GPUProfiler")
  694. .def_static("get_instance", &GPUProfiler::GetInstance, "GPUProfiler get_instance.")
  695. .def("init", &GPUProfiler::Init, py::arg("profile_data_path"), "init")
  696. .def("stop", &GPUProfiler::Stop, "stop")
  697. .def("step_profiling_enable", &GPUProfiler::StepProfilingEnable, py::arg("enable_flag"),
  698. "enable or disable step profiling")
  699. .def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"),
  700. "enable or disable synchronization profiling");
  701. }));
  702. } // namespace gpu
  703. } // namespace profiler
  704. } // namespace mindspore