/** * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "profiler/device/gpu/gpu_profiling.h" #include "profiler/device/gpu/cupti_interface.h" #include "profiler/device/gpu/data_saver.h" #include "utils/log_adapter.h" #include "pybind_api/api_register.h" namespace mindspore { namespace profiler { namespace gpu { #define BUF_SIZE (32 * 1024) #define ALIGN_SIZE (8) #define CHECK_CUPTI_RET_WITH_ERROR(expression, message) \ if (expression != CUPTI_SUCCESS) { \ const char *errstr; \ CuptiGetResultString(expression, &errstr); \ MS_LOG(ERROR) << "CUPTI Error:" << errstr << " function:" << message; \ } #define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message) \ if (expression != CUPTI_SUCCESS) { \ const char *errstr; \ CuptiGetResultString(expression, &errstr); \ MS_LOG(EXCEPTION) << "CUPTI Error:" << errstr << " function:" << message; \ } #define CHECK_CUDA_RET_WITH_ERROR(expression, message) \ { \ cudaError_t status = (expression); \ if (status != cudaSuccess) { \ MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \ << cudaGetErrorString(status); \ } \ } #define PROFILER_ERROR_IF_NULLPTR(ptr) \ do { \ if ((ptr) == nullptr) { \ MS_LOG(ERROR) << ": The pointer[" << #ptr << "] is null."; \ return; \ } \ } while (0) std::shared_ptr GPUProfiler::profiler_inst_ = nullptr; int32_t GetThreadID() { uint32_t thread_id = static_cast(pthread_self()); return thread_id; } uint32_t GetStreamID(const CUcontext context, const void *stream) { uint32_t stream_id = 0; if (stream != nullptr) { CHECK_CUPTI_RET_WITH_ERROR(CuptiGetStreamId(context, (CUstream)stream, &stream_id), "CuptiGetStreamId"); } return stream_id; } uint64_t GetCUPTITimeStamp() { uint64_t time_stamp = 0l; CHECK_CUPTI_RET_WITH_ERROR(CuptiGetTimestamp(&time_stamp), "CuptiGetTimestamp"); return time_stamp; } uint64_t GetHostTimeStamp() { auto cur_sys_clock = std::chrono::system_clock::now(); uint64_t cur_time_stamp = std::chrono::duration_cast(cur_sys_clock.time_since_epoch()).count(); return cur_time_stamp; } std::string GetKernelFunc(const char *name) { char *demangledName = abi::__cxa_demangle(name, nullptr, nullptr, nullptr); if (demangledName != nullptr) { return demangledName; } else { return name; } } void CUPTIApiExit(const std::shared_ptr &gpu_profiler_inst, CUpti_CallbackId cb_id, const CUpti_CallbackData *cb_data) { uint64_t start_timestamp = *cb_data->correlationData; uint64_t end_timestamp = GetCUPTITimeStamp(); switch (cb_id) { case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp); break; case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp); break; case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc: case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2: gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp); break; case CUPTI_DRIVER_TRACE_CBID_cuEventCreate: case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2: case CUPTI_DRIVER_TRACE_CBID_cuEventRecord: case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize: case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime: // In some cases, the callback of cuctxsetcurrent is only exist // without entry, so this callback is ignored case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent: break; default: gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp); break; } } void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, const CUpti_CallbackData *cb_data) { if (domain != CUPTI_CB_DOMAIN_DRIVER_API) { return; } auto gpu_profiler_inst = GPUProfiler::GetInstance(); PROFILER_ERROR_IF_NULLPTR(gpu_profiler_inst); if (!gpu_profiler_inst->GetEnableFlag()) { return; } PROFILER_ERROR_IF_NULLPTR(cb_data); if (cb_data->context == nullptr) { MS_LOG(DEBUG) << "Callback data context is null , correlation Id:" << cb_data->correlationId << " callback id:" << cb_id; return; } if (cb_data->callbackSite == CUPTI_API_ENTER) { *cb_data->correlationData = GetCUPTITimeStamp(); } else if (cb_data->callbackSite == CUPTI_API_EXIT) { CUPTIApiExit(gpu_profiler_inst, cb_id, cb_data); } } std::shared_ptr GPUProfiler::GetInstance() { if (profiler_inst_ == nullptr) { profiler_inst_ = std::shared_ptr(new (std::nothrow) GPUProfiler()); } return profiler_inst_; } void GPUProfiler::SyncEnable(const bool enable_flag) { MS_LOG(INFO) << "GPU Profiler synchronous enable flag:" << enable_flag; sync_enable_flag_ = enable_flag; } void GPUProfiler::StepProfilingEnable(const bool enable_flag) { MS_LOG(INFO) << "GPU Profiler enable flag:" << enable_flag; CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll"); enable_flag_ = enable_flag; } void GPUProfiler::FixOpNameByCorrelationId(Event *event) { PROFILER_ERROR_IF_NULLPTR(event); if (event->api_type != CUPTIApiType::kActivity) { return; } auto iter = op_name_map_.find(event->correlation_id); if (iter != op_name_map_.end()) { event->op_name = std::move(iter->second); } } void GPUProfiler::AddEvent(Event &&event) { // protect callback concurrency for driver api and activity std::unique_lock lock(event_mutex_); switch (event.api_type) { case CUPTIApiType::kCallback: { if (cupti_callback_events_count_ < max_cupti_callback_events_) { events_.emplace_back(std::move(event)); cupti_callback_events_count_++; } else { cupti_callback_events_drop_count_++; } break; } case CUPTIApiType::kActivity: { if (cupti_activity_events_count_ < max_cupti_activity_events_) { events_.emplace_back(std::move(event)); cupti_activity_events_count_++; } else { cupti_activity_events_drop_count_++; } break; } default: break; } } void GPUProfiler::EventLog(const Event &event) { MS_LOG(DEBUG) << "GPUProfiler" << ",\"kernel_name:" << event.kernel_name << "\",kernel_type:" << event.kernel_type << ",api_type:" << static_cast(event.api_type) << ",start_time_stamp:" << event.start_time_stamp << ",end_time_stamp:" << event.end_time_stamp << ",cost:," << (event.end_time_stamp - event.start_time_stamp) / kTimeUnit << ",op_name:" << event.op_name << ",device_id:" << event.device_id << ",correlation_id:" << event.correlation_id << ",thread_id:" << event.thread_id << ",context_id:" << event.context_id << ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id; } void GPUProfiler::ProcessEvents() { for (Event &event : events_) { if (event.op_name.empty()) { FixOpNameByCorrelationId(&event); } EventLog(event); if (event.op_name.empty() || event.cb_id == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) { continue; } auto iter = op_info_map_.find(event.op_name); if (iter != op_info_map_.end()) { switch (event.api_type) { case CUPTIApiType::kCallback: { iter->second.op_kernel_api_count += 1; // The time unit from ns to us iter->second.cupti_api_call_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit; break; } case CUPTIApiType::kActivity: { iter->second.op_kernel_count += 1; // The time unit from ns to us iter->second.cupti_activity_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit; break; } default: break; } } } } void GPUProfiler::OpsParser() { MS_LOG(INFO) << "Count the number of events size:" << events_.size() << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_; if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) { MS_LOG(WARNING) << "The total number of events exceeded the profiler's processing capacity, some events were discarded." << " activity api events:" << cupti_activity_events_drop_count_ << " callback api events:" << cupti_callback_events_drop_count_; } if (events_.size() == 0) { return; } ProcessEvents(); MS_LOG(DEBUG) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|" ",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|" ",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time" << std::endl; std::vector> order_vec(op_info_map_.begin(), op_info_map_.end()); auto cmp_func = [](const std::pair &a, const std::pair &b) { return a.second.cupti_activity_time > b.second.cupti_activity_time; }; std::sort(order_vec.begin(), order_vec.end(), cmp_func); for (auto iter = order_vec.begin(); iter != order_vec.end(); iter++) { MS_LOG(DEBUG) << "GPU_profiler" << "," << iter->first << "," << iter->second.op_count << "," << iter->second.op_kernel_count << "," << iter->second.op_kernel_api_count << "," << "|," << iter->second.cupti_activity_time << "," << iter->second.cupti_api_call_time << "," << iter->second.op_host_cost_time << "," << "|," << round(iter->second.cupti_activity_time / iter->second.op_count) << "," << round(iter->second.cupti_api_call_time / iter->second.op_count) << "," << round(iter->second.op_host_cost_time / iter->second.op_count) << std::endl; } } void GPUProfiler::EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring, uint64_t startTimestamp, uint64_t endTimestamp) { Event event; uint32_t device_id = -1; CuptiGetDeviceId(cbdata->context, &device_id); event.kernel_name = cbdata->symbolName ? GetKernelFunc(cbdata->symbolName) : cbdata->functionName; event.kernel_type = typestring; event.api_type = CUPTIApiType::kCallback; event.start_time_stamp = startTimestamp; event.end_time_stamp = endTimestamp; event.op_name = op_name_; event.device_id = device_id; event.correlation_id = cbdata->correlationId; event.thread_id = GetThreadID(); event.context_id = cbdata->contextUid; event.stream_id = GetStreamID(cbdata->context, stream_); event.cb_id = cbid; op_name_map_[event.correlation_id] = event.op_name; AddEvent(std::move(event)); } void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords); void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize); void GPUProfiler::Init(const std::string &profileDataPath = "") { MS_LOG(INFO) << "Initialize GPU Profiling"; if (subscriber_ != nullptr) { StopCUPTI(); MS_LOG(EXCEPTION) << "Repeated initialization, Please check whether you have created the Profiler object multiple times"; } CHECK_CUPTI_RET_WITH_EXCEPT(CuptiSubscribe(&subscriber_, (CUpti_CallbackFunc)CUPTICallBackFunc, this), "CuptiSubscribe"); CHECK_CUPTI_RET_WITH_EXCEPT(CuptiEnableDomain(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API), "CuptiEnableDomain"); activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY); activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY2); activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_KERNEL); for (std::vector::iterator it = activities_enable_.begin(); it != activities_enable_.end(); ++it) { CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityEnable(*it), "CuptiActivityEnable"); } CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityRegisterCallbacks(ActivityAllocBuffer, ActivityProcessBuffer), "CuptiActivityRegisterCallbacks"); base_time_.gpu_start_time = GetCUPTITimeStamp(); base_time_.host_start_time = GetHostTimeStamp(); profile_data_path_ = profileDataPath; MS_LOG(INFO) << "GPU start time(ns):" << base_time_.gpu_start_time << " Host start time(ns):" << base_time_.host_start_time << " profile data path: " << profile_data_path_; } void GPUProfiler::SetRunTimeData(const std::string &op_name, void *stream) { auto iter = op_info_map_.find(op_name); if (iter != op_info_map_.end()) { iter->second.op_count += 1; } else { OpInfo op_info; op_info.op_name = op_name; op_info.stream = stream; op_info.op_count = 1; op_info_map_[op_name] = op_info; } op_name_ = op_name; stream_ = stream; } void GPUProfiler::SetRunTimeData(const std::string &op_name, const float time_elapsed) { auto iter = op_info_map_.find(op_name); if (iter != op_info_map_.end()) { // The time unit is ms ,convert to us iter->second.op_host_cost_time += time_elapsed; } } void GPUProfiler::SetRunTimeData(const std::string &op_name, const uint64_t start, const float duration) { auto iter = op_info_map_.find(op_name); if (iter != op_info_map_.end()) { iter->second.start_duration.emplace_back(StartDuration({start, duration})); } } void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) { if (sync_enable_flag_) { CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_start_), "cudaEventCreate op event start failed"); CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_stop_), "cudaEventCreate op event stop failed"); CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_start_, (CUstream)stream_), "cudaEventRecord op event start failed"); op_host_time_start_ = GetHostTimeStamp(); op_cupti_time_start_ = GetCUPTITimeStamp(); } else { op_host_time_start_ = GetHostTimeStamp(); op_cupti_time_start_ = GetCUPTITimeStamp(); } SetRunTimeData(op_name, stream); } void GPUProfiler::OpDataProducerEnd() { float op_time_elapsed = 0; if (sync_enable_flag_) { CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_stop_, (CUstream)stream_), "cudaEventRecord op event stop failed"); CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_start_), "cudaEventSynchronize op event start failed"); CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_stop_), "cudaEventSynchronize op event stop failed"); CHECK_CUDA_RET_WITH_ERROR(cudaEventElapsedTime(&op_time_elapsed, op_event_start_, op_event_stop_), "cudaEventElapsedTime failed"); CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_start_), "cudaEventDestroy op event start failed"); CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed"); op_time_elapsed = op_time_elapsed * kTimeUnit; op_host_time_stop_ = GetHostTimeStamp(); } else { op_host_time_stop_ = GetHostTimeStamp(); op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit; } MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed; SetRunTimeData(op_name_, op_time_elapsed); SetRunTimeData(op_name_, op_cupti_time_start_, op_time_elapsed); } void GPUProfiler::StopCUPTI() { if (subscriber_ != nullptr) { CHECK_CUPTI_RET_WITH_ERROR(CuptiUnsubscribe(subscriber_), "CuptiUnsubscribe"); CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll"); for (std::vector::iterator it = activities_enable_.begin(); it != activities_enable_.end(); ++it) { CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityDisable(*it), "CuptiActivityDisable"); } subscriber_ = nullptr; } } void GPUProfiler::Stop() { MS_LOG(INFO) << "Stop GPU Profiling"; StopCUPTI(); OpsParser(); SaveProfileData(); ClearInst(); } void GPUProfiler::SaveProfileData() { if (profile_data_path_.empty()) { MS_LOG(WARNING) << "Profile data path is empty, skip save profile data."; } else { DataSaver dataSaver; dataSaver.ParseOpInfo(op_info_map_); dataSaver.ParseEvent(events_); dataSaver.WriteFile(profile_data_path_); } } void GPUProfiler::ClearInst() { op_info_map_.clear(); op_name_map_.clear(); events_.clear(); activities_enable_.clear(); enable_flag_ = false; sync_enable_flag_ = true; cupti_callback_events_count_ = 0l; cupti_callback_events_drop_count_ = 0l; cupti_activity_events_count_ = 0l; cupti_activity_events_drop_count_ = 0l; } void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { auto gpu_profiler_inst = GPUProfiler::GetInstance(); if (gpu_profiler_inst == nullptr) { MS_LOG(ERROR) << "GPU profiler instance is nullptr"; return; } gpu_profiler_inst->AllocBuffer(buffer, size, maxNumRecords); } void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) { PROFILER_ERROR_IF_NULLPTR(buffer); GPUProfiler::GetInstance()->ProcessBuffer(ctx, streamId, buffer, size, validSize); } void ProcessActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record, CUpti_ActivityMemcpy *memcpy) { switch (memcpy->copyKind) { case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: profilingData->activity_type = ActivityType::kMemcpyH2D; profilingData->kernel_name = "MemcpyH2D"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: profilingData->activity_type = ActivityType::kMemcpyD2H; profilingData->kernel_name = "MemcpyD2H"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: profilingData->activity_type = ActivityType::kMemcpyH2A; profilingData->kernel_name = "MemcpyH2A"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: profilingData->activity_type = ActivityType::kMemcpyA2H; profilingData->kernel_name = "MemcpyA2H"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: profilingData->activity_type = ActivityType::kMemcpyA2D; profilingData->kernel_name = "MemcpyA2D"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: profilingData->activity_type = ActivityType::kMemcpyD2A; profilingData->kernel_name = "MemcpyD2A"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: profilingData->activity_type = ActivityType::kMemcpyD2D; profilingData->kernel_name = "MemcpyD2D"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: profilingData->activity_type = ActivityType::kMemcpyH2H; profilingData->kernel_name = "MemcpyH2H"; break; case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP: profilingData->activity_type = ActivityType::kMemcpyP2P; profilingData->kernel_name = "MemcpyP2P"; break; default: profilingData->activity_type = ActivityType::kMemcpyUnknown; profilingData->kernel_name = "MemcpyUnknown"; break; } } void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) { CUpti_ActivityMemcpy *memcpy = reinterpret_cast(record); ProcessActivityMemcpyRecord(profilingData, record, memcpy); profilingData->kernel_type = "cuMemcpy"; profilingData->api_type = CUPTIApiType::kActivity; profilingData->start_time_stamp = memcpy->start; profilingData->end_time_stamp = memcpy->end; profilingData->device_id = memcpy->deviceId; profilingData->context_id = memcpy->contextId; profilingData->stream_id = memcpy->streamId; profilingData->correlation_id = memcpy->correlationId; profilingData->memcpy_info.bytes = memcpy->bytes; profilingData->memcpy_info.src_kind = memcpy->srcKind; profilingData->memcpy_info.dst_kind = memcpy->dstKind; } void HandleActivityMemcpy2Record(Event *profilingData, CUpti_Activity *record) { CUpti_ActivityMemcpy2 *memcpyP2P = reinterpret_cast(record); profilingData->activity_type = ActivityType::kMemcpyP2P; profilingData->kernel_name = "MemcpyP2P"; profilingData->kernel_type = "cuMemcpy"; profilingData->api_type = CUPTIApiType::kActivity; profilingData->start_time_stamp = memcpyP2P->start; profilingData->end_time_stamp = memcpyP2P->end; profilingData->device_id = memcpyP2P->deviceId; profilingData->context_id = memcpyP2P->contextId; profilingData->stream_id = memcpyP2P->streamId; profilingData->correlation_id = memcpyP2P->correlationId; profilingData->memcpy_info.bytes = memcpyP2P->bytes; profilingData->memcpy_info.src_kind = memcpyP2P->srcKind; profilingData->memcpy_info.dst_kind = memcpyP2P->dstKind; } void HandleActivityMemsetRecord(Event *profilingData, CUpti_Activity *record) { CUpti_ActivityMemset *memset = reinterpret_cast(record); profilingData->activity_type = ActivityType::kMemset; profilingData->kernel_name = "MemorySet"; profilingData->api_type = CUPTIApiType::kActivity; profilingData->start_time_stamp = memset->start; profilingData->end_time_stamp = memset->end; profilingData->device_id = memset->deviceId; profilingData->context_id = memset->contextId; profilingData->stream_id = memset->streamId; profilingData->correlation_id = memset->correlationId; profilingData->memcpy_info.bytes = memset->bytes; } void HandleActivityKernelRecord(Event *profilingData, CUpti_Activity *record) { CUpti_ActivityKernel4 *kernel = reinterpret_cast(record); profilingData->activity_type = ActivityType::kKernel; profilingData->api_type = CUPTIApiType::kActivity; profilingData->kernel_name = GetKernelFunc(kernel->name); profilingData->kernel_type = "cuLaunchKernel"; profilingData->start_time_stamp = kernel->start; profilingData->end_time_stamp = kernel->end; profilingData->device_id = kernel->deviceId; profilingData->context_id = kernel->contextId; profilingData->stream_id = kernel->streamId; profilingData->correlation_id = kernel->correlationId; profilingData->kernel_info.registers_per_thread = kernel->registersPerThread; profilingData->kernel_info.static_shared_memory = kernel->staticSharedMemory; profilingData->kernel_info.dynamic_shared_memory = kernel->dynamicSharedMemory; profilingData->kernel_info.block_x = kernel->blockX; profilingData->kernel_info.block_y = kernel->blockY; profilingData->kernel_info.block_z = kernel->blockZ; profilingData->kernel_info.grid_x = kernel->gridX; profilingData->kernel_info.grid_y = kernel->gridY; profilingData->kernel_info.grid_z = kernel->gridZ; } void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) { PROFILER_ERROR_IF_NULLPTR(record); Event profilingData; profilingData.cb_id = 0; switch (record->kind) { case CUPTI_ACTIVITY_KIND_MEMCPY: { HandleActivityMemcpyRecord(&profilingData, record); break; } case CUPTI_ACTIVITY_KIND_MEMCPY2: { HandleActivityMemcpy2Record(&profilingData, record); break; } case CUPTI_ACTIVITY_KIND_MEMSET: { HandleActivityMemsetRecord(&profilingData, record); break; } case CUPTI_ACTIVITY_KIND_KERNEL: case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { HandleActivityKernelRecord(&profilingData, record); break; } default: MS_LOG(WARNING) << "Unknown activity type!"; return; } AddEvent(std::move(profilingData)); } void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { int stat = posix_memalign(reinterpret_cast(buffer), ALIGN_SIZE, BUF_SIZE); if (stat) { MS_LOG(ERROR) << "Out of memory, activity buffer alloc failed."; return; } MS_LOG(DEBUG) << "Alloc activity buffer, buffer size: " << BUF_SIZE; *size = BUF_SIZE; *maxNumRecords = 0; } void CUPTIAPI GPUProfiler::ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) { if (!enable_flag_) { MS_LOG(DEBUG) << "Profiler is not enable, skip to process activity record."; free(buffer); return; } CUptiResult status; CUpti_Activity *record = NULL; MS_LOG(DEBUG) << "Process activity buffer, valid size:" << validSize << ",Stream ID:" << streamId; if (validSize > 0) { do { status = CuptiActivityGetNextRecord(buffer, validSize, &record); if (status == CUPTI_SUCCESS) { HandleActivityRecord(record); } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { break; } else { CHECK_CUPTI_RET_WITH_ERROR(status, "CuptiActivityGetNextRecord"); } } while (1); // report any records dropped from the queue size_t dropped; CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped), "CuptiActivityGetNumDroppedRecords"); if (dropped != 0) { MS_LOG(INFO) << "Dropped " << (unsigned int)dropped << " activity records\n"; } } free(buffer); } REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) { (void)py::class_>(*m, "GPUProfiler") .def_static("get_instance", &GPUProfiler::GetInstance, "GPUProfiler get_instance.") .def("init", &GPUProfiler::Init, py::arg("profile_data_path"), "init") .def("stop", &GPUProfiler::Stop, "stop") .def("step_profiling_enable", &GPUProfiler::StepProfilingEnable, py::arg("enable_flag"), "enable or disable step profiling") .def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"), "enable or disable synchronization profiling"); })); } // namespace gpu } // namespace profiler } // namespace mindspore