You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

gpu_profiling.cc 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
#include <cxxabi.h>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include "profiler/device/gpu/gpu_profiling.h"
#include "profiler/device/gpu/cupti_interface.h"
#include "profiler/device/gpu/data_saver.h"
#include "utils/log_adapter.h"
#include "pybind_api/api_register.h"
  24. namespace mindspore {
  25. namespace profiler {
  26. namespace gpu {
  27. #define BUF_SIZE (32 * 1024)
  28. #define ALIGN_SIZE (8)
  29. #define CHECK_CUPTI_RET_WITH_ERROR(expression, message) \
  30. if (expression != CUPTI_SUCCESS) { \
  31. const char *errstr; \
  32. CuptiGetResultString(expression, &errstr); \
  33. MS_LOG(ERROR) << "CUPTI Error:" << errstr << " function:" << message; \
  34. }
  35. #define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message) \
  36. if (expression != CUPTI_SUCCESS) { \
  37. const char *errstr; \
  38. CuptiGetResultString(expression, &errstr); \
  39. MS_LOG(EXCEPTION) << "CUPTI Error:" << errstr << " function:" << message; \
  40. }
  41. #define CHECK_CUDA_RET_WITH_ERROR(expression, message) \
  42. { \
  43. cudaError_t status = (expression); \
  44. if (status != cudaSuccess) { \
  45. MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \
  46. << cudaGetErrorString(status); \
  47. } \
  48. }
  49. #define PROFILER_ERROR_IF_NULLPTR(ptr) \
  50. do { \
  51. if ((ptr) == nullptr) { \
  52. MS_LOG(ERROR) << ": The pointer[" << #ptr << "] is null."; \
  53. return; \
  54. } \
  55. } while (0)
  56. std::shared_ptr<GPUProfiler> GPUProfiler::profiler_inst_ = nullptr;
  57. int32_t GetThreadID() {
  58. uint32_t thread_id = 0;
  59. thread_id = static_cast<uint32_t>(pthread_self());
  60. return thread_id;
  61. }
  62. uint32_t GetStreamID(const CUcontext context, const void *stream) {
  63. uint32_t stream_id = 0;
  64. if (stream != nullptr) {
  65. CHECK_CUPTI_RET_WITH_ERROR(CuptiGetStreamId(context, (CUstream)stream, &stream_id), "CuptiGetStreamId");
  66. }
  67. return stream_id;
  68. }
  69. uint64_t GetCUPTITimeStamp() {
  70. uint64_t time_stamp = 0l;
  71. CHECK_CUPTI_RET_WITH_ERROR(CuptiGetTimestamp(&time_stamp), "CuptiGetTimestamp");
  72. return time_stamp;
  73. }
  74. uint64_t GetHostTimeStamp() {
  75. auto cur_sys_clock = std::chrono::system_clock::now();
  76. uint64_t cur_time_stamp =
  77. std::chrono::duration_cast<std::chrono::nanoseconds>(cur_sys_clock.time_since_epoch()).count();
  78. return cur_time_stamp;
  79. }
  80. std::string GetKernelFunc(const char *name) {
  81. char *demangledName = abi::__cxa_demangle(name, nullptr, nullptr, nullptr);
  82. if (demangledName != nullptr) {
  83. return demangledName;
  84. } else {
  85. return name;
  86. }
  87. }
// CUPTI driver-API callback: timestamps every traced driver call.
// On API entry the current GPU timestamp is stashed in the per-call
// correlationData slot; on the matching API exit it is read back and the
// (start, end) pair is recorded as an Event, bucketed by callback id.
void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id,
                       const CUpti_CallbackData *cb_data) {
  // Init() subscribes only CUPTI_CB_DOMAIN_DRIVER_API; anything else is noise.
  if (domain != CUPTI_CB_DOMAIN_DRIVER_API) {
    return;
  }
  auto gpu_profiler_inst = GPUProfiler::GetInstance();
  PROFILER_ERROR_IF_NULLPTR(gpu_profiler_inst);
  if (!gpu_profiler_inst->GetEnableFlag()) {
    return;
  }
  PROFILER_ERROR_IF_NULLPTR(cb_data);
  if (cb_data->context == nullptr) {
    MS_LOG(DEBUG) << "Callback data context is null , correlation Id:" << cb_data->correlationId
                  << " callback id:" << cb_id;
    return;
  }
  uint64_t start_timestamp;
  uint64_t end_timestamp;
  if (cb_data->callbackSite == CUPTI_API_ENTER) {
    // Stash the entry timestamp; CUPTI carries correlationData to the EXIT call.
    *cb_data->correlationData = GetCUPTITimeStamp();
  } else if (cb_data->callbackSite == CUPTI_API_EXIT) {
    start_timestamp = *cb_data->correlationData;
    end_timestamp = GetCUPTITimeStamp();
    // Bucket the driver call into a coarse kernel_type string for reporting.
    switch (cb_id) {
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
        gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
        gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc:
      case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
        gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp);
        break;
      // Event-management calls are deliberately not recorded.
      case CUPTI_DRIVER_TRACE_CBID_cuEventCreate:
      case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuEventRecord:
      case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize:
      case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime:
      // In some cases, the callback of cuctxsetcurrent is only exist
      // without entry, so this callback is ignored
      case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent:
        break;
      default:
        gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp);
        break;
    }
  }
}
  160. std::shared_ptr<GPUProfiler> GPUProfiler::GetInstance() {
  161. if (profiler_inst_ == nullptr) {
  162. profiler_inst_ = std::shared_ptr<GPUProfiler>(new (std::nothrow) GPUProfiler());
  163. }
  164. return profiler_inst_;
  165. }
  166. void GPUProfiler::SyncEnable(const bool enable_flag) {
  167. MS_LOG(INFO) << "GPU Profiler synchronous enable flag:" << enable_flag;
  168. sync_enable_flag_ = enable_flag;
  169. }
// Enable/disable profiling for subsequent steps. Pending CUPTI activity
// records are flushed first so they are processed under the previous state.
void GPUProfiler::StepProfilingEnable(const bool enable_flag) {
  MS_LOG(INFO) << "GPU Profiler enable flag:" << enable_flag;
  CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
  enable_flag_ = enable_flag;
}
  175. void GPUProfiler::FixOpNameByCorrelationId(Event *event) {
  176. PROFILER_ERROR_IF_NULLPTR(event);
  177. if (event->api_type != CUPTIApiType::kActivity) {
  178. return;
  179. }
  180. auto iter = op_name_map_.find(event->correlation_id);
  181. if (iter != op_name_map_.end()) {
  182. event->op_name = std::move(iter->second);
  183. }
  184. }
  185. void GPUProfiler::AddEvent(Event &&event) {
  186. // protect callback concurrency for driver api and activity
  187. std::unique_lock<std::mutex> lock(event_mutex_);
  188. switch (event.api_type) {
  189. case CUPTIApiType::kCallback: {
  190. if (cupti_callback_events_count_ < max_cupti_callback_events_) {
  191. events_.emplace_back(std::move(event));
  192. cupti_callback_events_count_++;
  193. } else {
  194. cupti_callback_events_drop_count_++;
  195. }
  196. break;
  197. }
  198. case CUPTIApiType::kActivity: {
  199. if (cupti_activity_events_count_ < max_cupti_activity_events_) {
  200. events_.emplace_back(std::move(event));
  201. cupti_activity_events_count_++;
  202. } else {
  203. cupti_activity_events_drop_count_++;
  204. }
  205. break;
  206. }
  207. default:
  208. break;
  209. }
  210. }
// Dump one event as a single DEBUG log line; the cost field is converted
// from nanoseconds to microseconds via kTimeUnit.
void GPUProfiler::EventLog(const Event &event) {
  MS_LOG(DEBUG) << "GPUProfiler"
                << ",\"kernel_name:" << event.kernel_name << "\",kernel_type:" << event.kernel_type
                << ",api_type:" << static_cast<int>(event.api_type) << ",start_time_stamp:" << event.start_time_stamp
                << ",end_time_stamp:" << event.end_time_stamp << ",cost:,"
                << (event.end_time_stamp - event.start_time_stamp) / kTimeUnit << ",op_name:" << event.op_name
                << ",device_id:" << event.device_id << ",correlation_id:" << event.correlation_id
                << ",thread_id:" << event.thread_id << ",context_id:" << event.context_id
                << ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id;
}
  221. void fillActivityInfo(OpInfo *opInfo, const Event &event) {
  222. if (event.api_type != CUPTIApiType::kActivity) {
  223. return;
  224. }
  225. switch (event.activity_type) {
  226. case ActivityType::kKernel:
  227. opInfo->kernel_info.registers_per_thread = event.kernel_info.registers_per_thread;
  228. opInfo->kernel_info.static_shared_memory = event.kernel_info.static_shared_memory;
  229. opInfo->kernel_info.dynamic_shared_memory = event.kernel_info.dynamic_shared_memory;
  230. opInfo->kernel_info.block_x = event.kernel_info.block_x;
  231. opInfo->kernel_info.block_y = event.kernel_info.block_y;
  232. opInfo->kernel_info.block_z = event.kernel_info.block_z;
  233. opInfo->kernel_info.grid_x = event.kernel_info.grid_x;
  234. opInfo->kernel_info.grid_y = event.kernel_info.grid_y;
  235. opInfo->kernel_info.grid_z = event.kernel_info.grid_z;
  236. break;
  237. case ActivityType::kMemcpyH2D:
  238. case ActivityType::kMemcpyD2H:
  239. case ActivityType::kMemcpyH2A:
  240. case ActivityType::kMemcpyA2H:
  241. case ActivityType::kMemcpyA2D:
  242. case ActivityType::kMemcpyD2A:
  243. case ActivityType::kMemcpyP2P:
  244. case ActivityType::kMemcpyH2H:
  245. case ActivityType::kMemset:
  246. case ActivityType::kMemcpyUnknown:
  247. opInfo->memcpy_info.bytes = event.memcpy_info.bytes;
  248. default:
  249. break;
  250. }
  251. }
  252. void GPUProfiler::OpsParser() {
  253. MS_LOG(INFO) << "Count the number of events size:" << events_.size()
  254. << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_;
  255. if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) {
  256. MS_LOG(WARNING)
  257. << "The total number of events exceeded the profiler's processing capacity, Some events were discarded."
  258. << " callback api events:" << cupti_activity_events_drop_count_
  259. << " activity api events:" << cupti_callback_events_drop_count_;
  260. }
  261. if (events_.size() == 0) {
  262. return;
  263. }
  264. for (Event &event : events_) {
  265. if (event.op_name.empty()) {
  266. FixOpNameByCorrelationId(&event);
  267. }
  268. EventLog(event);
  269. if (event.op_name.empty() || event.cb_id == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) {
  270. continue;
  271. }
  272. auto iter = op_info_map_.find(event.op_name);
  273. if (iter != op_info_map_.end()) {
  274. switch (event.api_type) {
  275. case CUPTIApiType::kCallback: {
  276. iter->second.op_kernel_api_count += 1;
  277. // The time unit from ns to us
  278. iter->second.cupti_api_call_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
  279. break;
  280. }
  281. case CUPTIApiType::kActivity: {
  282. iter->second.op_kernel_count += 1;
  283. // The time unit from ns to us
  284. iter->second.cupti_activity_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
  285. fillActivityInfo(&iter->second, event);
  286. break;
  287. }
  288. default:
  289. break;
  290. }
  291. }
  292. }
  293. MS_LOG(DEBUG) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|"
  294. ",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|"
  295. ",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time,|"
  296. ",mem_bytes,registers_per_thread,static_shared_memory,dynamic_shared_memory"
  297. ",block_x,block_y,block_z,grid_x,grid_y,grid_z"
  298. << std::endl;
  299. std::vector<std::pair<std::string, OpInfo>> order_vec(op_info_map_.begin(), op_info_map_.end());
  300. auto cmp_func = [](const std::pair<std::string, OpInfo> &a, const std::pair<std::string, OpInfo> &b) {
  301. return a.second.cupti_activity_time > b.second.cupti_activity_time;
  302. };
  303. std::sort(order_vec.begin(), order_vec.end(), cmp_func);
  304. for (auto iter = order_vec.begin(); iter != order_vec.end(); iter++) {
  305. MS_LOG(DEBUG) << "GPU_profiler"
  306. << "," << iter->first << "," << iter->second.op_count << "," << iter->second.op_kernel_count << ","
  307. << iter->second.op_kernel_api_count << ","
  308. << "|," << iter->second.cupti_activity_time << "," << iter->second.cupti_api_call_time << ","
  309. << iter->second.op_host_cost_time << ","
  310. << "|," << round(iter->second.cupti_activity_time / iter->second.op_count) << ","
  311. << round(iter->second.cupti_api_call_time / iter->second.op_count) << ","
  312. << round(iter->second.op_host_cost_time / iter->second.op_count) << ","
  313. << "|," << iter->second.memcpy_info.bytes << "," << iter->second.kernel_info.registers_per_thread
  314. << "," << iter->second.kernel_info.static_shared_memory << ","
  315. << iter->second.kernel_info.dynamic_shared_memory << "," << iter->second.kernel_info.block_x << ","
  316. << iter->second.kernel_info.block_y << "," << iter->second.kernel_info.block_z << ","
  317. << iter->second.kernel_info.grid_x << "," << iter->second.kernel_info.grid_y << ","
  318. << iter->second.kernel_info.grid_z << std::endl;
  319. }
  320. }
// Build a callback-side Event from CUPTI callback data and queue it.
// Also records correlation_id -> op_name so later activity records can be
// attributed to the same op (see FixOpNameByCorrelationId).
void GPUProfiler::EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata,
                                     const std::string &typestring, uint64_t startTimestamp, uint64_t endTimestamp) {
  Event event;
  // -1 wraps to UINT32_MAX as an "unknown device" sentinel; note the return
  // status of CuptiGetDeviceId is not checked here.
  uint32_t device_id = -1;
  CuptiGetDeviceId(cbdata->context, &device_id);
  // Prefer the demangled kernel symbol; fall back to the API function name.
  event.kernel_name = cbdata->symbolName ? GetKernelFunc(cbdata->symbolName) : cbdata->functionName;
  event.kernel_type = typestring;
  event.api_type = CUPTIApiType::kCallback;
  event.start_time_stamp = startTimestamp;
  event.end_time_stamp = endTimestamp;
  // op_name_/stream_ are the "current op" recorded by SetRunTimeData.
  event.op_name = op_name_;
  event.device_id = device_id;
  event.correlation_id = cbdata->correlationId;
  event.thread_id = GetThreadID();
  event.context_id = cbdata->contextUid;
  event.stream_id = GetStreamID(cbdata->context, stream_);
  event.cb_id = cbid;
  // Must run before the move below consumes `event`.
  op_name_map_[event.correlation_id] = event.op_name;
  AddEvent(std::move(event));
}
// CUPTI activity-buffer callbacks, implemented further below.
void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
// Subscribe to CUPTI driver-API callbacks, enable activity tracing for
// memcpy/kernel records, register the buffer callbacks, and capture the
// GPU/host base timestamps used to align the two clocks.
// NOTE(review): a default argument on an out-of-line member definition is
// unusual — confirm the in-class declaration does not also supply one
// (that would be a redefinition error).
void GPUProfiler::Init(const std::string &profileDataPath = "") {
  MS_LOG(INFO) << "Initialize GPU Profiling";
  CHECK_CUPTI_RET_WITH_EXCEPT(CuptiSubscribe(&subscriber_, (CUpti_CallbackFunc)CUPTICallBackFunc, this),
                              "CuptiSubscribe");
  CHECK_CUPTI_RET_WITH_EXCEPT(CuptiEnableDomain(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API), "CuptiEnableDomain");
  // Activity kinds to trace; disabled again in StopCUPTI().
  activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY);
  activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
  activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_KERNEL);
  for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
       ++it) {
    CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityEnable(*it), "CuptiActivityEnable");
  }
  CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityRegisterCallbacks(ActivityAllocBuffer, ActivityProcessBuffer),
                              "CuptiActivityRegisterCallbacks");
  base_time_.gpu_start_time = GetCUPTITimeStamp();
  base_time_.host_start_time = GetHostTimeStamp();
  profile_data_path_ = profileDataPath;
  MS_LOG(INFO) << "GPU start time(ns):" << base_time_.gpu_start_time
               << " Host start time(ns):" << base_time_.host_start_time << " profile data path: " << profile_data_path_;
}
  363. void GPUProfiler::SetRunTimeData(const std::string &op_name, void *stream) {
  364. auto iter = op_info_map_.find(op_name);
  365. if (iter != op_info_map_.end()) {
  366. iter->second.op_count += 1;
  367. } else {
  368. OpInfo op_info;
  369. op_info.op_name = op_name;
  370. op_info.stream = stream;
  371. op_info.op_count = 1;
  372. op_info_map_[op_name] = op_info;
  373. }
  374. op_name_ = op_name;
  375. stream_ = stream;
  376. }
// Accumulate host-side cost for an op. `time_elapsed` is already in
// microseconds — the ms/ns -> us conversion happens in OpDataProducerEnd()
// before this is called (the previous comment wrongly implied conversion
// happened here).
void GPUProfiler::SetRunTimeData(const std::string &op_name, const float time_elapsed) {
  auto iter = op_info_map_.find(op_name);
  if (iter != op_info_map_.end()) {
    iter->second.op_host_cost_time += time_elapsed;
  }
}
// Mark the start of one op launch. With sync enabled the measurement uses a
// pair of CUDA events recorded on the current stream (device time);
// otherwise a host wall-clock timestamp is taken.
void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
  if (sync_enable_flag_) {
    CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_start_), "cudaEventCreate op event start failed");
    CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_stop_), "cudaEventCreate op event stop failed");
    // NOTE(review): records on stream_ (the previous op's stream) before
    // SetRunTimeData updates it below — confirm this ordering is intended.
    CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_start_, (CUstream)stream_),
                              "cudaEventRecord op event start failed");
  } else {
    op_host_time_start_ = GetHostTimeStamp();
  }
  SetRunTimeData(op_name, stream);
}
// Mark the end of the current op launch and charge the elapsed time to it.
// Sync path: cudaEventElapsedTime yields milliseconds, multiplied by
// kTimeUnit to get microseconds. Async path: host timestamps are in
// nanoseconds, divided by kTimeUnit to get microseconds.
void GPUProfiler::OpDataProducerEnd() {
  float op_time_elapsed = 0;
  if (sync_enable_flag_) {
    CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_stop_, (CUstream)stream_),
                              "cudaEventRecord op event stop failed");
    CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_start_), "cudaEventSynchronize op event start failed");
    CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_stop_), "cudaEventSynchronize op event stop failed");
    CHECK_CUDA_RET_WITH_ERROR(cudaEventElapsedTime(&op_time_elapsed, op_event_start_, op_event_stop_),
                              "cudaEventElapsedTime failed");
    // Events are created per-launch in OpDataProducerBegin, destroyed here.
    CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_start_), "cudaEventDestroy op event start failed");
    CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
    op_time_elapsed = op_time_elapsed * kTimeUnit;
  } else {
    op_host_time_stop_ = GetHostTimeStamp();
    op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
  }
  MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
  SetRunTimeData(op_name_, op_time_elapsed);
}
  414. void GPUProfiler::StopCUPTI() {
  415. if (subscriber_ != nullptr) {
  416. CHECK_CUPTI_RET_WITH_ERROR(CuptiUnsubscribe(subscriber_), "CuptiUnsubscribe");
  417. CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
  418. for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
  419. ++it) {
  420. CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityDisable(*it), "CuptiActivityDisable");
  421. }
  422. subscriber_ = nullptr;
  423. }
  424. }
// Stop profiling: shut down CUPTI, aggregate events into per-op statistics,
// write them to disk, then reset all state for a potential re-init.
void GPUProfiler::Stop() {
  MS_LOG(INFO) << "Stop GPU Profiling";
  StopCUPTI();
  OpsParser();
  SaveProfileData();
  ClearInst();
}
  432. void GPUProfiler::SaveProfileData() {
  433. if (profile_data_path_.empty()) {
  434. MS_LOG(WARNING) << "Profile data path is empty, skip save profile data.";
  435. } else {
  436. DataSaver dataSaver;
  437. dataSaver.ParseOpInfo(op_info_map_);
  438. dataSaver.ParseEvent(events_);
  439. dataSaver.WriteFile(profile_data_path_);
  440. }
  441. }
// Reset all accumulated profiling state back to post-construction defaults
// (note sync_enable_flag_ resets to true, enable_flag_ to false).
void GPUProfiler::ClearInst() {
  op_info_map_.clear();
  op_name_map_.clear();
  events_.clear();
  activities_enable_.clear();
  enable_flag_ = false;
  sync_enable_flag_ = true;
  cupti_callback_events_count_ = 0l;
  cupti_callback_events_drop_count_ = 0l;
  cupti_activity_events_count_ = 0l;
  cupti_activity_events_drop_count_ = 0l;
}
  454. void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
  455. auto gpu_profiler_inst = GPUProfiler::GetInstance();
  456. if (gpu_profiler_inst == nullptr) {
  457. MS_LOG(ERROR) << "GPU profiler instance is nullptr";
  458. return;
  459. }
  460. gpu_profiler_inst->AllocBuffer(buffer, size, maxNumRecords);
  461. }
  462. void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
  463. PROFILER_ERROR_IF_NULLPTR(buffer);
  464. GPUProfiler::GetInstance()->ProcessBuffer(ctx, streamId, buffer, size, validSize);
  465. }
// Convert a CUPTI memcpy activity record into an Event, mapping the CUPTI
// copy kind onto the profiler's ActivityType and a human-readable name.
void HandleActivityMemcpyRecord(Event *profillingData, CUpti_Activity *record) {
  CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record);
  switch (memcpy->copyKind) {
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
      profillingData->activity_type = ActivityType::kMemcpyH2D;
      profillingData->kernel_name = "MemcpyH2D";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
      profillingData->activity_type = ActivityType::kMemcpyD2H;
      profillingData->kernel_name = "MemcpyD2H";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
      profillingData->activity_type = ActivityType::kMemcpyH2A;
      profillingData->kernel_name = "MemcpyH2A";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
      profillingData->activity_type = ActivityType::kMemcpyA2H;
      profillingData->kernel_name = "MemcpyA2H";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
      profillingData->activity_type = ActivityType::kMemcpyA2D;
      profillingData->kernel_name = "MemcpyA2D";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
      profillingData->activity_type = ActivityType::kMemcpyD2A;
      profillingData->kernel_name = "MemcpyD2A";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
      profillingData->activity_type = ActivityType::kMemcpyD2D;
      profillingData->kernel_name = "MemcpyD2D";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
      profillingData->activity_type = ActivityType::kMemcpyH2H;
      profillingData->kernel_name = "MemcpyH2H";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
      profillingData->activity_type = ActivityType::kMemcpyP2P;
      profillingData->kernel_name = "MemcpyP2P";
      break;
    default:
      profillingData->activity_type = ActivityType::kMemcpyUnknown;
      profillingData->kernel_name = "MemcpyUnknown";
      break;
  }
  // Common fields shared by all copy kinds.
  profillingData->kernel_type = "cuMemcpy";
  profillingData->api_type = CUPTIApiType::kActivity;
  profillingData->start_time_stamp = memcpy->start;
  profillingData->end_time_stamp = memcpy->end;
  profillingData->device_id = memcpy->deviceId;
  profillingData->context_id = memcpy->contextId;
  profillingData->stream_id = memcpy->streamId;
  profillingData->correlation_id = memcpy->correlationId;
  profillingData->memcpy_info.bytes = memcpy->bytes;
  profillingData->memcpy_info.src_kind = memcpy->srcKind;
  profillingData->memcpy_info.dst_kind = memcpy->dstKind;
}
  522. void HandleActivityMemcpy2Record(Event *profillingData, CUpti_Activity *record) {
  523. CUpti_ActivityMemcpy2 *memcpyP2P = reinterpret_cast<CUpti_ActivityMemcpy2 *>(record);
  524. profillingData->activity_type = ActivityType::kMemcpyP2P;
  525. profillingData->kernel_name = "MemcpyP2P";
  526. profillingData->kernel_type = "cuMemcpy";
  527. profillingData->api_type = CUPTIApiType::kActivity;
  528. profillingData->start_time_stamp = memcpyP2P->start;
  529. profillingData->end_time_stamp = memcpyP2P->end;
  530. profillingData->device_id = memcpyP2P->deviceId;
  531. profillingData->context_id = memcpyP2P->contextId;
  532. profillingData->stream_id = memcpyP2P->streamId;
  533. profillingData->correlation_id = memcpyP2P->correlationId;
  534. profillingData->memcpy_info.bytes = memcpyP2P->bytes;
  535. profillingData->memcpy_info.src_kind = memcpyP2P->srcKind;
  536. profillingData->memcpy_info.dst_kind = memcpyP2P->dstKind;
  537. }
// Convert a CUPTI memset activity record into an Event.
// NOTE(review): kernel_type is left unset here, unlike the memcpy handlers
// which set "cuMemcpy" — confirm whether that is intentional or an omission.
void HandleActivityMemsetRecord(Event *profillingData, CUpti_Activity *record) {
  CUpti_ActivityMemset *memset = reinterpret_cast<CUpti_ActivityMemset *>(record);
  profillingData->activity_type = ActivityType::kMemset;
  profillingData->kernel_name = "MemorySet";
  profillingData->api_type = CUPTIApiType::kActivity;
  profillingData->start_time_stamp = memset->start;
  profillingData->end_time_stamp = memset->end;
  profillingData->device_id = memset->deviceId;
  profillingData->context_id = memset->contextId;
  profillingData->stream_id = memset->streamId;
  profillingData->correlation_id = memset->correlationId;
  profillingData->memcpy_info.bytes = memset->bytes;
}
  551. void HandleActivityKernelRecord(Event *profillingData, CUpti_Activity *record) {
  552. CUpti_ActivityKernel4 *kernel = reinterpret_cast<CUpti_ActivityKernel4 *>(record);
  553. profillingData->activity_type = ActivityType::kKernel;
  554. profillingData->api_type = CUPTIApiType::kActivity;
  555. profillingData->kernel_name = GetKernelFunc(kernel->name);
  556. profillingData->kernel_type = "cuLaunchKernel";
  557. profillingData->start_time_stamp = kernel->start;
  558. profillingData->end_time_stamp = kernel->end;
  559. profillingData->device_id = kernel->deviceId;
  560. profillingData->context_id = kernel->contextId;
  561. profillingData->stream_id = kernel->streamId;
  562. profillingData->correlation_id = kernel->correlationId;
  563. profillingData->kernel_info.registers_per_thread = kernel->registersPerThread;
  564. profillingData->kernel_info.static_shared_memory = kernel->staticSharedMemory;
  565. profillingData->kernel_info.dynamic_shared_memory = kernel->dynamicSharedMemory;
  566. profillingData->kernel_info.block_x = kernel->blockX;
  567. profillingData->kernel_info.block_y = kernel->blockY;
  568. profillingData->kernel_info.block_z = kernel->blockZ;
  569. profillingData->kernel_info.grid_x = kernel->gridX;
  570. profillingData->kernel_info.grid_y = kernel->gridY;
  571. profillingData->kernel_info.grid_z = kernel->gridZ;
  572. }
// Dispatch one CUPTI activity record to the matching converter and queue the
// resulting Event. Unknown record kinds are logged and dropped.
void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) {
  PROFILER_ERROR_IF_NULLPTR(record);
  Event profillingData;
  // Activity events have no callback id; 0 marks "not a driver callback".
  profillingData.cb_id = 0;
  switch (record->kind) {
    case CUPTI_ACTIVITY_KIND_MEMCPY: {
      HandleActivityMemcpyRecord(&profillingData, record);
      break;
    }
    case CUPTI_ACTIVITY_KIND_MEMCPY2: {
      HandleActivityMemcpy2Record(&profillingData, record);
      break;
    }
    case CUPTI_ACTIVITY_KIND_MEMSET: {
      HandleActivityMemsetRecord(&profillingData, record);
      break;
    }
    case CUPTI_ACTIVITY_KIND_KERNEL:
    case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
      HandleActivityKernelRecord(&profillingData, record);
      break;
    }
    default:
      MS_LOG(WARNING) << "Unknown activity type!";
      return;
  }
  AddEvent(std::move(profillingData));
}
  601. void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
  602. int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
  603. if (stat) {
  604. MS_LOG(ERROR) << "Out of memory, activity buffer alloc failed.";
  605. return;
  606. }
  607. MS_LOG(DEBUG) << "Alloc activity buffer, buffer size: " << BUF_SIZE;
  608. *size = BUF_SIZE;
  609. *maxNumRecords = 0;
  610. }
  611. void CUPTIAPI GPUProfiler::ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size,
  612. size_t validSize) {
  613. if (!enable_flag_) {
  614. MS_LOG(DEBUG) << "Profiler is not enable, skip to process activity record.";
  615. free(buffer);
  616. return;
  617. }
  618. CUptiResult status;
  619. CUpti_Activity *record = NULL;
  620. MS_LOG(DEBUG) << "Process activity buffer, valid size:" << validSize << ",Stream ID:" << streamId;
  621. if (validSize > 0) {
  622. do {
  623. status = CuptiActivityGetNextRecord(buffer, validSize, &record);
  624. if (status == CUPTI_SUCCESS) {
  625. HandleActivityRecord(record);
  626. } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
  627. break;
  628. } else {
  629. CHECK_CUPTI_RET_WITH_ERROR(status, "CuptiActivityGetNextRecord");
  630. }
  631. } while (1);
  632. // report any records dropped from the queue
  633. size_t dropped;
  634. CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped),
  635. "CuptiActivityGetNumDroppedRecords");
  636. if (dropped != 0) {
  637. MS_LOG(INFO) << "Dropped " << (unsigned int)dropped << " activity records\n";
  638. }
  639. }
  640. free(buffer);
  641. }
// Expose the profiler to Python as mindspore.GPUProfiler with the singleton
// accessor and the init/stop/enable entry points used by the Python-side
// Profiler wrapper.
REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) {
                         (void)py::class_<GPUProfiler, std::shared_ptr<GPUProfiler>>(*m, "GPUProfiler")
                           .def_static("get_instance", &GPUProfiler::GetInstance, "GPUProfiler get_instance.")
                           .def("init", &GPUProfiler::Init, py::arg("profile_data_path"), "init")
                           .def("stop", &GPUProfiler::Stop, "stop")
                           .def("step_profiling_enable", &GPUProfiler::StepProfilingEnable, py::arg("enable_flag"),
                                "enable or disable step profiling")
                           .def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"),
                                "enable or disable synchronization profiling");
                       }));
  652. } // namespace gpu
  653. } // namespace profiler
  654. } // namespace mindspore