|
|
|
@@ -60,8 +60,7 @@ namespace gpu { |
|
|
|
// Out-of-class definition of GPUProfiler's static instance pointer; it starts out null.
// NOTE(review): presumably populated lazily by GPUProfiler::GetInstance() — confirm against its definition.
std::shared_ptr<GPUProfiler> GPUProfiler::profiler_inst_ = nullptr; |
|
|
|
|
|
|
|
// Returns an identifier for the calling thread, derived from pthread_self().
//
// The pthread handle is first narrowed to 32 bits and then converted to the
// signed return type, so the result is only a compact tag for the thread and
// may be negative; it is not the full pthread_t value.
int32_t GetThreadID() {
  // Declare and initialize in one statement so `thread_id` exists exactly once in this scope.
  const uint32_t thread_id = static_cast<uint32_t>(pthread_self());
  // Make the unsigned -> signed conversion required by the return type explicit.
  return static_cast<int32_t>(thread_id);
}
|
|
|
|
|
|
|
@@ -95,6 +94,59 @@ std::string GetKernelFunc(const char *name) { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void CUPTIApiExit(const std::shared_ptr<GPUProfiler> &gpu_profiler_inst, CUpti_CallbackId cb_id, |
|
|
|
const CUpti_CallbackData *cb_data) { |
|
|
|
uint64_t start_timestamp = *cb_data->correlationData; |
|
|
|
uint64_t end_timestamp = GetCUPTITimeStamp(); |
|
|
|
switch (cb_id) { |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventCreate: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventRecord: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime: |
|
|
|
// In some cases, the callback of cuctxsetcurrent is only exist |
|
|
|
// without entry, so this callback is ignored |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent: |
|
|
|
break; |
|
|
|
default: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, |
|
|
|
const CUpti_CallbackData *cb_data) { |
|
|
|
if (domain != CUPTI_CB_DOMAIN_DRIVER_API) { |
|
|
|
@@ -113,63 +165,10 @@ void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_Callb |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
uint64_t start_timestamp; |
|
|
|
uint64_t end_timestamp; |
|
|
|
|
|
|
|
if (cb_data->callbackSite == CUPTI_API_ENTER) { |
|
|
|
*cb_data->correlationData = GetCUPTITimeStamp(); |
|
|
|
|
|
|
|
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { |
|
|
|
start_timestamp = *cb_data->correlationData; |
|
|
|
end_timestamp = GetCUPTITimeStamp(); |
|
|
|
|
|
|
|
switch (cb_id) { |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventCreate: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventRecord: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize: |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime: |
|
|
|
// In some cases, the callback of cuctxsetcurrent is only exist |
|
|
|
// without entry, so this callback is ignored |
|
|
|
case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent: |
|
|
|
break; |
|
|
|
default: |
|
|
|
gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp); |
|
|
|
break; |
|
|
|
} |
|
|
|
CUPTIApiExit(gpu_profiler_inst, cb_id, cb_data); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
@@ -240,21 +239,7 @@ void GPUProfiler::EventLog(const Event &event) { |
|
|
|
<< ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id; |
|
|
|
} |
|
|
|
|
|
|
|
void GPUProfiler::OpsParser() { |
|
|
|
MS_LOG(INFO) << "Count the number of events size:" << events_.size() |
|
|
|
<< " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_; |
|
|
|
|
|
|
|
if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) { |
|
|
|
MS_LOG(WARNING) |
|
|
|
<< "The total number of events exceeded the profiler's processing capacity, Some events were discarded." |
|
|
|
<< " callback api events:" << cupti_activity_events_drop_count_ |
|
|
|
<< " activity api events:" << cupti_callback_events_drop_count_; |
|
|
|
} |
|
|
|
|
|
|
|
if (events_.size() == 0) { |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
void GPUProfiler::ProcessEvents() { |
|
|
|
for (Event &event : events_) { |
|
|
|
if (event.op_name.empty()) { |
|
|
|
FixOpNameByCorrelationId(&event); |
|
|
|
@@ -286,7 +271,24 @@ void GPUProfiler::OpsParser() { |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void GPUProfiler::OpsParser() { |
|
|
|
MS_LOG(INFO) << "Count the number of events size:" << events_.size() |
|
|
|
<< " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_; |
|
|
|
|
|
|
|
if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) { |
|
|
|
MS_LOG(WARNING) |
|
|
|
<< "The total number of events exceeded the profiler's processing capacity, some events were discarded." |
|
|
|
<< " activity api events:" << cupti_activity_events_drop_count_ |
|
|
|
<< " callback api events:" << cupti_callback_events_drop_count_; |
|
|
|
} |
|
|
|
|
|
|
|
if (events_.size() == 0) { |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
ProcessEvents(); |
|
|
|
MS_LOG(DEBUG) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|" |
|
|
|
",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|" |
|
|
|
",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time" |
|
|
|
@@ -490,8 +492,7 @@ void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *b |
|
|
|
GPUProfiler::GetInstance()->ProcessBuffer(ctx, streamId, buffer, size, validSize); |
|
|
|
} |
|
|
|
|
|
|
|
void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) { |
|
|
|
CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record); |
|
|
|
void ProcessActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record, CUpti_ActivityMemcpy *memcpy) { |
|
|
|
switch (memcpy->copyKind) { |
|
|
|
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: |
|
|
|
profilingData->activity_type = ActivityType::kMemcpyH2D; |
|
|
|
@@ -534,6 +535,12 @@ void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) { |
|
|
|
profilingData->kernel_name = "MemcpyUnknown"; |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) { |
|
|
|
CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record); |
|
|
|
ProcessActivityMemcpyRecord(profilingData, record, memcpy); |
|
|
|
|
|
|
|
profilingData->kernel_type = "cuMemcpy"; |
|
|
|
profilingData->api_type = CUPTIApiType::kActivity; |
|
|
|
profilingData->start_time_stamp = memcpy->start; |
|
|
|
@@ -687,7 +694,6 @@ REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) { |
|
|
|
.def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"), |
|
|
|
"enable or disable synchronization profiling"); |
|
|
|
})); |
|
|
|
|
|
|
|
} // namespace gpu |
|
|
|
} // namespace profiler |
|
|
|
} // namespace mindspore |