| @@ -29,6 +29,7 @@ endif() | |||||
| include(GNUInstallDirs) | include(GNUInstallDirs) | ||||
| include(CheckCXXCompilerFlag) | include(CheckCXXCompilerFlag) | ||||
| include(CheckIPOSupported) | include(CheckIPOSupported) | ||||
| include(CMakeDependentOption) | |||||
| check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) | check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) | ||||
| @@ -97,6 +98,12 @@ option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF) | |||||
| option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) | option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) | ||||
| option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF) | option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF) | ||||
| # TODO: add windows support | |||||
| cmake_dependent_option(MGE_WITH_CUPTI "Build with CUPTI" ON | |||||
| "MGE_WITH_CUDA;MGE_BUILD_IMPERATIVE_RT;NOT MSVC;NOT WIN32" OFF) | |||||
| set(MGB_CUPTI ${MGE_WITH_CUPTI}) | |||||
| if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
| # FIXME: static link Windows vc runtime with some version from Visual Studio have some | # FIXME: static link Windows vc runtime with some version from Visual Studio have some | ||||
| # runtime issue at some call PATH, for example: _imperative_rt.pyd --> | # runtime issue at some call PATH, for example: _imperative_rt.pyd --> | ||||
| @@ -686,6 +693,10 @@ if(MGB_WITH_FLATBUFFERS) | |||||
| include(cmake/flatbuffers.cmake) | include(cmake/flatbuffers.cmake) | ||||
| endif() | endif() | ||||
| if(MGE_WITH_CUPTI) | |||||
| include(cmake/cupti.cmake) | |||||
| endif() | |||||
| if(MGE_WITH_CUDA) | if(MGE_WITH_CUDA) | ||||
| include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | ||||
| foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) | foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) | ||||
| @@ -6,7 +6,7 @@ endif() | |||||
| if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | ||||
| set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | ||||
| endif() | endif() | ||||
| message("CUDNN ROOT: " ${CUDNN_ROOT_DIR}) | |||||
| message(STATUS "CUDNN ROOT: ${CUDNN_ROOT_DIR}") | |||||
| if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | ||||
| find_library( | find_library( | ||||
| CUDNN_LIBRARY | CUDNN_LIBRARY | ||||
| @@ -0,0 +1,85 @@ | |||||
# Locate the CuPTI library and headers inside a CUDA SDK and expose them as the
# imported target `libcupti`.
#
# Inputs:
#   CUDA_ROOT_DIR        CMake variable; when empty, the environment variables
#                        CUDA_ROOT_DIR, CUDA_PATH and CUDA_BIN_PATH are tried
#                        in that order.
#   MGE_CUDA_USE_STATIC  when true, libcupti_static.a is preferred.
#   CXX_SUPPORT_GOLD     when true, static CuPTI is rejected (see below).
# Outputs:
#   CUPTI_LIBRARY, CUPTI_INCLUDE_DIR, CUPTI_LIBRARY_TYPE, CUPTI_API_VERSION and
#   the imported target `libcupti`.

# Resolve the CUDA SDK root: an explicit CMake variable wins, then the first
# non-empty environment variable.
if("${CUDA_ROOT_DIR}" STREQUAL "")
  foreach(cuda_env_name CUDA_ROOT_DIR CUDA_PATH CUDA_BIN_PATH)
    if(NOT "$ENV{${cuda_env_name}}" STREQUAL "")
      set(CUDA_ROOT_DIR "$ENV{${cuda_env_name}}")
      break()
    endif()
  endforeach()
endif()

if("${CUDA_ROOT_DIR}" STREQUAL "")
  message(
    FATAL_ERROR
      "Can not find CUDA, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH"
  )
endif()

# TODO: find_library(CUDA_ROOT_DIR) in cmake/cuda.cmake

set(MGE_CUPTI_USE_STATIC "${MGE_CUDA_USE_STATIC}")

# relates https://stackoverflow.com/questions/67485114
# Variables are named, not expanded, inside if(): an unset MGE_CUDA_USE_STATIC
# or CXX_SUPPORT_GOLD then evaluates to false instead of producing a malformed
# condition such as `if(ON AND )`.
if(MGE_CUPTI_USE_STATIC AND CXX_SUPPORT_GOLD)
  message(
    WARNING
      "static linking CuPTI with gold may break exception handling, use shared one instead"
  )
  set(MGE_CUPTI_USE_STATIC OFF)
endif()

if(MGE_CUPTI_USE_STATIC)
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti_static.a
    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
  # if(NOT <var>) is true for both an empty value and "<var>-NOTFOUND".
  if(NOT CUPTI_LIBRARY)
    message(WARNING "Can not find static CuPTI Library, use shared one instead")
    set(MGE_CUPTI_USE_STATIC OFF)
  endif()
endif()

# Shared fallback. find_library() searches again here because the cached value
# is still "-NOTFOUND" when the static lookup failed.
if(NOT MGE_CUPTI_USE_STATIC)
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti.so
    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
  set(CUPTI_LIBRARY_TYPE SHARED)
else()
  set(CUPTI_LIBRARY_TYPE STATIC)
endif()

if(NOT CUPTI_LIBRARY)
  message(FATAL_ERROR "Can not find CuPTI Library")
endif()

find_path(
  CUPTI_INCLUDE_DIR
  NAMES cupti.h
  HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
  PATH_SUFFIXES include
  DOC "Path to CuPTI include directory.")

if(NOT CUPTI_INCLUDE_DIR)
  message(FATAL_ERROR "Can not find CuPTI INCLUDE")
endif()

# The API version macro is read from cupti_version.h when that header exists,
# otherwise from cupti.h itself.
if(EXISTS "${CUPTI_INCLUDE_DIR}/cupti_version.h")
  file(READ "${CUPTI_INCLUDE_DIR}/cupti_version.h" CUPTI_VERSION_FILE_CONTENTS)
else()
  file(READ "${CUPTI_INCLUDE_DIR}/cupti.h" CUPTI_VERSION_FILE_CONTENTS)
endif()

string(REGEX MATCH "define CUPTI_API_VERSION * +([0-9]+)" CUPTI_API_VERSION
             "${CUPTI_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUPTI_API_VERSION * +([0-9]+)" "\\1" CUPTI_API_VERSION
                     "${CUPTI_API_VERSION}")
if("${CUPTI_API_VERSION}" STREQUAL "")
  message(WARNING "Can not parse CUPTI_API_VERSION from CuPTI headers")
endif()

# Guard against this file being included twice: add_library() on an existing
# target name is a hard error.
if(NOT TARGET libcupti)
  add_library(libcupti ${CUPTI_LIBRARY_TYPE} IMPORTED)
  set_target_properties(
    libcupti PROPERTIES IMPORTED_LOCATION "${CUPTI_LIBRARY}"
                        INTERFACE_INCLUDE_DIRECTORIES "${CUPTI_INCLUDE_DIR}")
endif()

message(STATUS "Found CuPTI: ${CUPTI_LIBRARY} (found version: ${CUPTI_API_VERSION})")
| @@ -36,7 +36,7 @@ else() | |||||
| PATH_SUFFIXES lib lib64 | PATH_SUFFIXES lib lib64 | ||||
| DOC "TRT plugin library.") | DOC "TRT plugin library.") | ||||
| endif() | endif() | ||||
| message("TRT_LIBRARY" ${TRT_LIBRARY}) | |||||
| message(STATUS "TRT_LIBRARY: ${TRT_LIBRARY}") | |||||
| if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") | if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") | ||||
| message( | message( | ||||
| FATAL_ERROR | FATAL_ERROR | ||||
| @@ -51,6 +51,10 @@ if(ANDROID) | |||||
| target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES}) | target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES}) | ||||
| endif() | endif() | ||||
| if(MGE_WITH_CUPTI) | |||||
| target_link_libraries(${MODULE_NAME} PRIVATE libcupti) | |||||
| endif() | |||||
| add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 | ||||
| ${PROJECT_BINARY_DIR}/third_party/range-v3) | ${PROJECT_BINARY_DIR}/third_party/range-v3) | ||||
| target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | ||||
| @@ -16,6 +16,10 @@ from weakref import WeakSet | |||||
| from .. import _atexit | from .. import _atexit | ||||
| from ..core._imperative_rt.core2 import ( | from ..core._imperative_rt.core2 import ( | ||||
| cupti_available, | |||||
| disable_cupti, | |||||
| enable_cupti, | |||||
| full_sync, | |||||
| pop_scope, | pop_scope, | ||||
| push_scope, | push_scope, | ||||
| start_profile, | start_profile, | ||||
| @@ -50,13 +54,18 @@ class Profiler(ContextDecorator): | |||||
| with profiler: | with profiler: | ||||
| # your code here | # your code here | ||||
| # Then open the profile file in chrome timeline window | # Then open the profile file in chrome timeline window | ||||
| """ | """ | ||||
| CHROME_TIMELINE = "chrome_timeline.json" | CHROME_TIMELINE = "chrome_timeline.json" | ||||
| valid_options = {"sample_rate": 0, "profile_device": 1, "num_tensor_watch": 10} | |||||
| valid_options = { | |||||
| "sample_rate": 0, | |||||
| "profile_device": 1, | |||||
| "num_tensor_watch": 10, | |||||
| "enable_cupti": 0, | |||||
| } | |||||
| valid_formats = {"chrome_timeline.json", "memory_flow.svg"} | valid_formats = {"chrome_timeline.json", "memory_flow.svg"} | ||||
| def __init__( | def __init__( | ||||
| @@ -83,6 +92,11 @@ class Profiler(ContextDecorator): | |||||
| self._options[opt] = int(kwargs.pop(opt, optval)) | self._options[opt] = int(kwargs.pop(opt, optval)) | ||||
| self._pid = "<PID>" | self._pid = "<PID>" | ||||
| self._dump_callback = None | self._dump_callback = None | ||||
| if self._options.get("enable_cupti", 0): | |||||
| if cupti_available(): | |||||
| enable_cupti() | |||||
| else: | |||||
| get_logger().warning("CuPTI unavailable") | |||||
| @property | @property | ||||
| def path(self): | def path(self): | ||||
| @@ -116,7 +130,7 @@ class Profiler(ContextDecorator): | |||||
| assert _running_profiler is self | assert _running_profiler is self | ||||
| _running_profiler = None | _running_profiler = None | ||||
| sync() | |||||
| full_sync() | |||||
| self._dump_callback = stop_profile() | self._dump_callback = stop_profile() | ||||
| self._pid = os.getpid() | self._pid = os.getpid() | ||||
| _living_profilers.add(self) | _living_profilers.add(self) | ||||
| @@ -160,6 +174,9 @@ class Profiler(ContextDecorator): | |||||
| return func | return func | ||||
| def __del__(self): | def __del__(self): | ||||
| if self._options.get("enable_cupti", 0): | |||||
| if cupti_available(): | |||||
| disable_cupti() | |||||
| self.dump() | self.dump() | ||||
| @@ -11,6 +11,7 @@ | |||||
| #include "megbrain/common.h" | #include "megbrain/common.h" | ||||
| #include "megbrain/dtype.h" | #include "megbrain/dtype.h" | ||||
| #include "megbrain/imperative/cpp_cupti.h" | |||||
| #include "megbrain/imperative/ops/autogen.h" | #include "megbrain/imperative/ops/autogen.h" | ||||
| #include "megbrain/imperative/ops/backward_graph.h" | #include "megbrain/imperative/ops/backward_graph.h" | ||||
| #include "megbrain/imperative/ops/utility.h" | #include "megbrain/imperative/ops/utility.h" | ||||
| @@ -982,6 +983,7 @@ void init_tensor(py::module m) { | |||||
| m.def("stop_profile", [channel]() -> std::function<void(std::string, std::string)> { | m.def("stop_profile", [channel]() -> std::function<void(std::string, std::string)> { | ||||
| channel->stop_profile(); | channel->stop_profile(); | ||||
| channel->sync(); | channel->sync(); | ||||
| CompNode::sync_all(); | |||||
| imperative::Profiler::stop_profile(); | imperative::Profiler::stop_profile(); | ||||
| auto results = std::make_shared<imperative::Profiler::bundle_t>( | auto results = std::make_shared<imperative::Profiler::bundle_t>( | ||||
| imperative::Profiler::collect()); | imperative::Profiler::collect()); | ||||
| @@ -990,6 +992,9 @@ void init_tensor(py::module m) { | |||||
| results = nullptr; | results = nullptr; | ||||
| }; | }; | ||||
| }); | }); | ||||
| m.def("enable_cupti", &cupti::enable); | |||||
| m.def("disable_cupti", &cupti::disable); | |||||
| m.def("cupti_available", &cupti::available); | |||||
| m.def("sync", [channel]() { | m.def("sync", [channel]() { | ||||
| if (channel->check_available()) { | if (channel->check_available()) { | ||||
| channel->sync(); | channel->sync(); | ||||
| @@ -0,0 +1,273 @@ | |||||
| #include "megbrain/imperative/cpp_cupti.h" | |||||
| #include <cinttypes> | |||||
| #include <cstddef> | |||||
| #include <cstdlib> | |||||
| #include "megbrain/exception.h" | |||||
| #include "megbrain/imperative/profiler.h" | |||||
| #include "megbrain/imperative/utils/platform.h" | |||||
| #include "./profiler/events.h" | |||||
| #if MGB_CUPTI | |||||
| #include "cupti.h" | |||||
// Evaluate a CUPTI API expression once and raise through mgb_assert when it
// does not return CUPTI_SUCCESS. The error text comes from
// cuptiGetResultString(); a placeholder is used when that lookup itself fails
// so that "%s" never receives an uninitialised or null pointer.
#define CUPTI_CALL(call)                                                     \
    do {                                                                     \
        CUptiResult _status = (call);                                        \
        if (_status != CUPTI_SUCCESS) {                                      \
            const char* errstr = nullptr;                                    \
            if (cuptiGetResultString(_status, &errstr) != CUPTI_SUCCESS ||   \
                errstr == nullptr) {                                         \
                errstr = "<unknown cupti error>";                            \
            }                                                                \
            mgb_assert(_status == CUPTI_SUCCESS, "cupti error: %s", errstr); \
        }                                                                    \
    } while (0)
| #endif | |||||
| namespace mgb::imperative::cupti { | |||||
| #if MGB_CUPTI | |||||
| namespace { | |||||
| CUpti_SubscriberHandle cuptiSubscriber; | |||||
| void cuptiSubscriberCallback( | |||||
| void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, | |||||
| const void* cb_info) { | |||||
| using namespace profiler; | |||||
| switch (domain) { | |||||
| case CUPTI_CB_DOMAIN_DRIVER_API: { | |||||
| auto cb_data = (const CUpti_CallbackData*)cb_info; | |||||
| switch (cb_id) { | |||||
| case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: { | |||||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIKernelLaunchEvent, cb_data->correlationId, | |||||
| cb_data->symbolName); | |||||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIKernelLaunchFinishEvent, cb_data->correlationId, | |||||
| cb_data->symbolName); | |||||
| } | |||||
| break; | |||||
| } | |||||
| case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA: { | |||||
| } | |||||
| case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: { | |||||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIMemcpyLaunchEvent, cb_data->correlationId); | |||||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIMemcpyLaunchFinishEvent, cb_data->correlationId); | |||||
| } | |||||
| break; | |||||
| } | |||||
| default: { | |||||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIDriverEvent, cb_data->correlationId, | |||||
| cb_data->functionName); | |||||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIDriverFinishEvent, cb_data->correlationId, | |||||
| cb_data->functionName); | |||||
| } | |||||
| } | |||||
| } | |||||
| break; | |||||
| } | |||||
| case CUPTI_CB_DOMAIN_RUNTIME_API: { | |||||
| auto cb_data = (const CUpti_CallbackData*)cb_info; | |||||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIRuntimeEvent, cb_data->correlationId, | |||||
| cb_data->functionName); | |||||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
| MGB_RECORD_EVENT( | |||||
| CUPTIRuntimeFinishEvent, cb_data->correlationId, | |||||
| cb_data->functionName); | |||||
| } | |||||
| break; | |||||
| } | |||||
| } | |||||
| } | |||||
| void handleActivity(CUpti_Activity* record) { | |||||
| using namespace std::chrono_literals; | |||||
| auto delta = 16ns; | |||||
| switch (record->kind) { | |||||
| case CUPTI_ACTIVITY_KIND_KERNEL: | |||||
| case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { | |||||
| auto kernel = cupti::activity<CUpti_ActivityKernel4>(record); | |||||
| MGB_RECORD_EVENT( | |||||
| profiler::CUPTIKernelExecuteEvent, kernel->correlationId, | |||||
| kernel->name, kernel.stream(), kernel.start(), | |||||
| kernel.end() - delta); | |||||
| break; | |||||
| } | |||||
| case CUPTI_ACTIVITY_KIND_MEMCPY: { | |||||
| auto memcpy = cupti::activity<CUpti_ActivityMemcpy>(record); | |||||
| MGB_RECORD_EVENT( | |||||
| profiler::CUPTIMemcpyEvent, memcpy->correlationId, memcpy->srcKind, | |||||
| memcpy->dstKind, memcpy->bytes, memcpy.stream(), memcpy.start(), | |||||
| memcpy.end()); | |||||
| break; | |||||
| } | |||||
| case CUPTI_ACTIVITY_KIND_MEMSET: { | |||||
| auto memset = cupti::activity<CUpti_ActivityMemset>(record); | |||||
| MGB_RECORD_EVENT( | |||||
| profiler::CUPTIMemsetEvent, memset->correlationId, memset->value, | |||||
| memset->bytes, memset.stream(), memset.start(), | |||||
| memset.end() - delta); | |||||
| break; | |||||
| } | |||||
| default: | |||||
| break; | |||||
| } | |||||
| } | |||||
| using activity_buffer_t = | |||||
| std::aligned_storage_t<8 * 1024 * 1024, ACTIVITY_RECORD_ALIGNMENT>; | |||||
| void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { | |||||
| *buffer = reinterpret_cast<uint8_t*>(new activity_buffer_t()); | |||||
| *size = sizeof(activity_buffer_t); | |||||
| *maxNumRecords = 0; | |||||
| } | |||||
| void bufferCompleted( | |||||
| CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size, | |||||
| size_t validSize) { | |||||
| CUptiResult status; | |||||
| CUpti_Activity* record = NULL; | |||||
| if (validSize > 0) { | |||||
| do { | |||||
| status = cuptiActivityGetNextRecord(buffer, validSize, &record); | |||||
| if (status == CUPTI_SUCCESS) { | |||||
| handleActivity(record); | |||||
| } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) | |||||
| break; | |||||
| else { | |||||
| CUPTI_CALL(status); | |||||
| } | |||||
| } while (1); | |||||
| size_t dropped; | |||||
| CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); | |||||
| mgb_assert(dropped == 0, "%zu records dropped", dropped); | |||||
| } | |||||
| delete reinterpret_cast<activity_buffer_t*>(buffer); | |||||
| } | |||||
| static bool initialized = false; | |||||
| } // namespace | |||||
| bool available() { | |||||
| uint32_t compiletime_version = (CUPTI_API_VERSION); | |||||
| uint32_t runtime_version; | |||||
| CUPTI_CALL(cuptiGetVersion(&runtime_version)); | |||||
| if (compiletime_version != runtime_version) { | |||||
| static std::once_flag once; | |||||
| std::call_once(once, [&] { | |||||
| mgb_log_warn( | |||||
| "CuPTI version %d mismatch against compiletime version %d. " | |||||
| "This may caused by user config LD_LIBRARY_PATH" | |||||
| "at unix-like env or config PATH at Windows env", | |||||
| (int)compiletime_version, (int)runtime_version); | |||||
| }); | |||||
| return false; | |||||
| } | |||||
| return true; | |||||
| } | |||||
| void enable() { | |||||
| // not thread safe | |||||
| mgb_assert(!initialized, "cupti already initialized"); | |||||
| // callback | |||||
| CUPTI_CALL(cuptiSubscribe( | |||||
| &cuptiSubscriber, (CUpti_CallbackFunc)cuptiSubscriberCallback, | |||||
| (void*)nullptr)); | |||||
| CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_DRIVER_API)); | |||||
| CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_RUNTIME_API)); | |||||
| // activity | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); | |||||
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); | |||||
| CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); | |||||
| initialized = true; | |||||
| } | |||||
| void disable() { | |||||
| mgb_assert(initialized, "cupti not initialized yet"); | |||||
| flush(); | |||||
| CUPTI_CALL(cuptiFinalize()); | |||||
| initialized = false; | |||||
| } | |||||
| void flush() { | |||||
| if (initialized) { | |||||
| CUPTI_CALL(cuptiActivityFlushAll(1)); | |||||
| } | |||||
| } | |||||
| bool enabled() { | |||||
| return initialized; | |||||
| } | |||||
| time_point clock::now() { | |||||
| uint64_t timestamp; | |||||
| CUPTI_CALL(cuptiGetTimestamp(×tamp)); | |||||
| using namespace std::chrono; | |||||
| // overflow? | |||||
| return time_point(duration((int64_t)timestamp)); | |||||
| } | |||||
| #else | |||||
| class CuPTIUnavailableError : public MegBrainError { | |||||
| public: | |||||
| CuPTIUnavailableError() | |||||
| : MegBrainError( | |||||
| #if MGB_CUDA | |||||
| "CuPTI disabled at compile time" | |||||
| #else | |||||
| "CuPTI unsupported on non cuda platform" | |||||
| #endif | |||||
| ) { | |||||
| } | |||||
| }; | |||||
| bool available() { | |||||
| return false; | |||||
| } | |||||
| void enable() { | |||||
| throw CuPTIUnavailableError(); | |||||
| } | |||||
| void disable() { | |||||
| throw CuPTIUnavailableError(); | |||||
| } | |||||
| void flush() {} | |||||
| bool enabled() { | |||||
| return false; | |||||
| } | |||||
| time_point clock::now() { | |||||
| throw CuPTIUnavailableError(); | |||||
| } | |||||
| #endif | |||||
| } // namespace mgb::imperative::cupti | |||||
| @@ -12,7 +12,9 @@ | |||||
| #include "megbrain/imperative/profiler.h" | #include "megbrain/imperative/profiler.h" | ||||
| #include <chrono> | #include <chrono> | ||||
| #include <unordered_map> | |||||
| #include "megbrain/imperative/cpp_cupti.h" | |||||
| #include "megbrain/imperative/ops/opr_attr.h" | #include "megbrain/imperative/ops/opr_attr.h" | ||||
| #include "megbrain/imperative/physical_tensor.h" | #include "megbrain/imperative/physical_tensor.h" | ||||
| @@ -48,6 +50,21 @@ bool Profiler::sm_profiling = false; | |||||
| thread_local Profiler* Profiler::tm_profiler = nullptr; | thread_local Profiler* Profiler::tm_profiler = nullptr; | ||||
| std::atomic_size_t Profiler::sm_preferred_capacity; | std::atomic_size_t Profiler::sm_preferred_capacity; | ||||
| void Profiler::start_profile() { | |||||
| mgb_assert(!sm_profiling); | |||||
| sm_start_at = Timer::record_host(); | |||||
| sm_profiling = true; | |||||
| if (cupti::enabled()) { | |||||
| MGB_RECORD_EVENT(profiler::CUPTITimestampEvent, cupti::clock::now()); | |||||
| } | |||||
| } | |||||
| void Profiler::stop_profile() { | |||||
| mgb_assert(sm_profiling); | |||||
| cupti::flush(); | |||||
| sm_profiling = false; | |||||
| } | |||||
| auto Profiler::get_thread_dict() -> thread_dict_t { | auto Profiler::get_thread_dict() -> thread_dict_t { | ||||
| thread_dict_t thread_dict; | thread_dict_t thread_dict; | ||||
| for (auto&& [tid, profiler] : sm_profilers) { | for (auto&& [tid, profiler] : sm_profilers) { | ||||
| @@ -19,6 +19,7 @@ | |||||
| #include "nlohmann/json.hpp" | #include "nlohmann/json.hpp" | ||||
| #include "megbrain/imperative/utils/platform.h" | |||||
| #include "megbrain/utils/debug.h" | #include "megbrain/utils/debug.h" | ||||
| #include "./formats.h" | #include "./formats.h" | ||||
| @@ -198,6 +199,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
| decltype(getpid()) pid = getpid(); | decltype(getpid()) pid = getpid(); | ||||
| std::string pid_str = std::to_string(pid); | std::string pid_str = std::to_string(pid); | ||||
| ChromeTimelineEventVisitor() {} | |||||
| ChromeTraceEvent& new_event( | ChromeTraceEvent& new_event( | ||||
| std::string name, char ph, size_t tid, profiler::HostTime time) { | std::string name, char ph, size_t tid, profiler::HostTime time) { | ||||
| return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts( | return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts( | ||||
| @@ -213,8 +216,13 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
| .ts(since_start(current->time)); | .ts(since_start(current->time)); | ||||
| } | } | ||||
| ChromeTraceEvent& new_cupti_event( | |||||
| std::string name, char ph, cupti::stream_t stream, | |||||
| cupti::time_point timestamp) { | |||||
| return new_event(name, ph, to_tid(stream), time_from_cupti(timestamp)); | |||||
| } | |||||
| ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | ||||
| using namespace std::literals::chrono_literals; | |||||
| auto time = since_start(to_device_time(current->time, device)); | auto time = since_start(to_device_time(current->time, device)); | ||||
| return trace_events.new_event() | return trace_events.new_event() | ||||
| .name(name) | .name(name) | ||||
| @@ -391,6 +399,80 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
| auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>( | auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>( | ||||
| current_device_time - current_host_time); | current_device_time - current_host_time); | ||||
| new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count()); | new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count()); | ||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchEvent>) { | |||||
| new_host_event(demangle(event.name), 'B'); | |||||
| new_host_event(pid_str, 's') | |||||
| .id(event.correlation_id) | |||||
| .cat("KernelLink") | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchFinishEvent>) { | |||||
| new_host_event(demangle(event.name), 'E'); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIKernelExecuteEvent>) { | |||||
| new_cupti_event(demangle(event.name), 'B', event.stream, event.start) | |||||
| .arg("execution_time", (event.end - event.start).count()); | |||||
| new_cupti_event(pid_str, 'f', event.stream, event.end) | |||||
| .id(event.correlation_id) | |||||
| .bp('e') | |||||
| .cat("KernelLink") | |||||
| .scope(pid_str); | |||||
| new_cupti_event(demangle(event.name), 'E', event.stream, event.end) | |||||
| .arg("execution_time", (event.end - event.start).count()); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchEvent>) { | |||||
| new_host_event("Memcpy", 'B'); | |||||
| new_host_event(pid_str, 's') | |||||
| .id(event.correlation_id) | |||||
| .cat("CUPTILink") | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchFinishEvent>) { | |||||
| new_host_event("Memcpy", 'E'); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyEvent>) { | |||||
// Map a memory-kind value taken from a CuPTI memcpy record to the name of the
// corresponding CUPTI_ACTIVITY_MEMORY_KIND_* constant. Values outside the
// table yield "invalid".
auto memkind2str = [](uint8_t kind) -> const char* {
    static const char* const valid_kinds[] = {
            "CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN",
            "CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE",
            "CUPTI_ACTIVITY_MEMORY_KIND_PINNED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE",
            "CUPTI_ACTIVITY_MEMORY_KIND_ARRAY",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC"};
    constexpr size_t kind_count = sizeof(valid_kinds) / sizeof(valid_kinds[0]);
    // `>=`, not `>`: an index equal to the element count is already one past
    // the end of the table.
    if (kind >= kind_count) {
        return "invalid";
    }
    return valid_kinds[kind];
};
| new_cupti_event("Memcpy", 'B', event.stream, event.start) | |||||
| .arg("bytes", imperative::to_string(event.bytes)) | |||||
| .arg("src_kind", memkind2str(event.src_kind)) | |||||
| .arg("dst_kind", memkind2str(event.dst_kind)); | |||||
| new_cupti_event(pid_str, 'f', event.stream, event.start) | |||||
| .id(event.correlation_id) | |||||
| .bp('e') | |||||
| .cat("CUPTILink") | |||||
| .scope(pid_str); | |||||
| new_cupti_event("Memcpy", 'E', event.stream, event.end) | |||||
| .arg("bytes", imperative::to_string(event.bytes)) | |||||
| .arg("src_kind", memkind2str(event.src_kind)) | |||||
| .arg("dst_kind", memkind2str(event.dst_kind)); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemsetEvent>) { | |||||
| new_cupti_event("Memset", 'B', event.stream, event.start) | |||||
| .arg("value", imperative::to_string(event.value)) | |||||
| .arg("bytes", imperative::to_string(event.bytes)); | |||||
| new_cupti_event("Memset", 'E', event.stream, event.start) | |||||
| .arg("value", imperative::to_string(event.value)) | |||||
| .arg("bytes", imperative::to_string(event.bytes)); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeEvent>) { | |||||
| new_host_event(event.name, 'B'); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeFinishEvent>) { | |||||
| new_host_event(event.name, 'E'); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIDriverEvent>) { | |||||
| new_host_event(event.name, 'B'); | |||||
| new_host_event(pid_str, 's') | |||||
| .id(event.correlation_id) | |||||
| .cat("CUPTILink") | |||||
| .scope(pid_str); | |||||
| } else if constexpr (std::is_same_v<TEvent, CUPTIDriverFinishEvent>) { | |||||
| new_host_event(event.name, 'E'); | |||||
| } | } | ||||
| } | } | ||||
| @@ -403,7 +485,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
| if (thread_dict.count(host)) { | if (thread_dict.count(host)) { | ||||
| trace_events.new_event() | trace_events.new_event() | ||||
| .name("thread_name") | .name("thread_name") | ||||
| .pid('M') | |||||
| .ph('M') | |||||
| .pid(pid) | |||||
| .tid(to_tid(host)) | .tid(to_tid(host)) | ||||
| .arg("name", thread_dict.at(host)); | .arg("name", thread_dict.at(host)); | ||||
| } | } | ||||
| @@ -411,7 +494,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
| for (auto&& device : devices()) { | for (auto&& device : devices()) { | ||||
| trace_events.new_event() | trace_events.new_event() | ||||
| .name("thread_name") | .name("thread_name") | ||||
| .pid('M') | |||||
| .ph('M') | |||||
| .pid(pid) | |||||
| .tid(to_tid(device)) | .tid(to_tid(device)) | ||||
| .arg("name", device.to_string_logical()); | .arg("name", device.to_string_logical()); | ||||
| } | } | ||||
| @@ -419,7 +503,7 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
| }; | }; | ||||
| void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { | void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { | ||||
| ChromeTimelineEventVisitor visitor; | |||||
| ChromeTimelineEventVisitor visitor{}; | |||||
| visitor.process_events(result); | visitor.process_events(result); | ||||
| visitor.name_threads(result.thread_dict); | visitor.name_threads(result.thread_dict); | ||||
| auto trace_events = std::move(visitor.trace_events); | auto trace_events = std::move(visitor.trace_events); | ||||
| @@ -16,6 +16,7 @@ | |||||
| #include "../interpreter/stack_manager.h" | #include "../interpreter/stack_manager.h" | ||||
| #include "../op_trait.h" | #include "../op_trait.h" | ||||
| #include "megbrain/imperative/cpp_cupti.h" | |||||
| namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
| @@ -181,6 +182,60 @@ DEF_DUR_EVENT(HostToDevice, { | |||||
| void* device_ptr; | void* device_ptr; | ||||
| }); | }); | ||||
| // cupti events | |||||
| DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; }); | |||||
| DEF_DUR_EVENT(CUPTIKernelLaunch, { | |||||
| uint32_t correlation_id; | |||||
| const char* name; | |||||
| }); | |||||
| DEF_EVENT(CUPTIKernelExecute, { | |||||
| uint32_t correlation_id; | |||||
| const char* name; | |||||
| cupti::stream_t stream; | |||||
| cupti::time_point start; | |||||
| cupti::time_point end; | |||||
| }); | |||||
| DEF_DUR_EVENT(CUPTIMemcpyLaunch, { uint32_t correlation_id; }); | |||||
| DEF_EVENT(CUPTIMemcpy, { | |||||
| uint32_t correlation_id; | |||||
| uint8_t src_kind; | |||||
| uint8_t dst_kind; | |||||
| uint64_t bytes; | |||||
| cupti::stream_t stream; | |||||
| cupti::time_point start; | |||||
| cupti::time_point end; | |||||
| }); | |||||
| DEF_EVENT(CUPTIMemset, { | |||||
| uint32_t correlation_id; | |||||
| uint32_t value; | |||||
| uint64_t bytes; | |||||
| cupti::stream_t stream; | |||||
| cupti::time_point start; | |||||
| cupti::time_point end; | |||||
| }); | |||||
| DEF_EVENT(CUPTIUnknownDevice, {}); | |||||
| DEF_DUR_EVENT(CUPTIRuntime, { | |||||
| uint32_t correlation_id; | |||||
| const char* name; | |||||
| }); | |||||
| DEF_DUR_EVENT(CUPTIDriver, { | |||||
| uint32_t correlation_id; | |||||
| const char* name; | |||||
| }); | |||||
| DEF_EVENT(CUPTIIdentifyStream, { | |||||
| cupti::stream_t stream; | |||||
| CompNode device; | |||||
| }); | |||||
| #undef DEF_EVENT | #undef DEF_EVENT | ||||
| #undef DEF_DUR_EVENT | #undef DEF_DUR_EVENT | ||||
| @@ -180,10 +180,13 @@ private: | |||||
| HostTime m_start_time; | HostTime m_start_time; | ||||
| CompNode::UnorderedMap<size_t> m_device_tid_table; | CompNode::UnorderedMap<size_t> m_device_tid_table; | ||||
| std::unordered_map<std::thread::id, size_t> m_host_tid_table; | std::unordered_map<std::thread::id, size_t> m_host_tid_table; | ||||
| std::unordered_map<cupti::stream_t, size_t> m_cupti_tid_table; | |||||
| CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> | CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> | ||||
| m_device_timeline; | m_device_timeline; | ||||
| std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | ||||
| std::unordered_map<std::string, int64_t> m_counter_table; | std::unordered_map<std::string, int64_t> m_counter_table; | ||||
| std::optional<std::pair<profiler::HostTime, cupti::time_point>> m_cupti_timestamp = | |||||
| {}; | |||||
| protected: | protected: | ||||
| Profiler::Record* current; | Profiler::Record* current; | ||||
| @@ -191,6 +194,11 @@ protected: | |||||
| ProfileTensorState* current_tensor; | ProfileTensorState* current_tensor; | ||||
| protected: | protected: | ||||
| size_t next_tid() { | |||||
| return m_host_tid_table.size() + m_device_tid_table.size() + | |||||
| m_cupti_tid_table.size(); | |||||
| } | |||||
| profiler::Duration since_start(profiler::HostTime time) { | profiler::Duration since_start(profiler::HostTime time) { | ||||
| return time - m_start_time; | return time - m_start_time; | ||||
| } | } | ||||
| @@ -229,6 +237,10 @@ protected: | |||||
| size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } | size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } | ||||
| size_t to_tid(cupti::stream_t cupti_stream) { | |||||
| return m_cupti_tid_table.at(cupti_stream); | |||||
| } | |||||
| SmallVector<std::thread::id> host_threads() { | SmallVector<std::thread::id> host_threads() { | ||||
| SmallVector<std::thread::id> host_threads; | SmallVector<std::thread::id> host_threads; | ||||
| for (auto&& [host, _] : m_host_tid_table) { | for (auto&& [host, _] : m_host_tid_table) { | ||||
| @@ -254,6 +266,13 @@ protected: | |||||
| value += delta; | value += delta; | ||||
| } | } | ||||
| profiler::HostTime time_from_cupti(cupti::time_point timestamp) { | |||||
| mgb_assert(m_cupti_timestamp.has_value()); | |||||
| return m_cupti_timestamp->first + | |||||
| std::chrono::duration_cast<profiler::HostTime::duration>( | |||||
| timestamp - m_cupti_timestamp->second); | |||||
| } | |||||
| public: | public: | ||||
| void process_events(Profiler::bundle_t& bundle) { | void process_events(Profiler::bundle_t& bundle) { | ||||
| m_start_time = bundle.start_at; | m_start_time = bundle.start_at; | ||||
| @@ -272,7 +291,11 @@ public: | |||||
| TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, | TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, | ||||
| AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, | AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, | ||||
| ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, | ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, | ||||
| HostToDeviceFinishEvent> | |||||
| HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent, | |||||
| CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent, | |||||
| CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent, | |||||
| CUPTIRuntimeEvent, CUPTIRuntimeFinishEvent, CUPTIDriverEvent, | |||||
| CUPTIDriverFinishEvent, CUPTIMemsetEvent> | |||||
| converter; | converter; | ||||
| auto for_each_entry = [&](auto&& handler) { | auto for_each_entry = [&](auto&& handler) { | ||||
| @@ -289,7 +312,9 @@ public: | |||||
| std::shared_ptr<CompNode::Event> device; | std::shared_ptr<CompNode::Event> device; | ||||
| }; | }; | ||||
| CompNode::UnorderedMap<DeviceStartPair> device_start_table; | CompNode::UnorderedMap<DeviceStartPair> device_start_table; | ||||
| std::unordered_map<cupti::stream_t, CompNode> cupti_stream_table; | |||||
| // record device time | |||||
| for_each_entry([&](auto&& event) { | for_each_entry([&](auto&& event) { | ||||
| using T = std::decay_t<decltype(event)>; | using T = std::decay_t<decltype(event)>; | ||||
| if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | ||||
| @@ -313,8 +338,7 @@ public: | |||||
| // register host threads | // register host threads | ||||
| for_each_entry([&](auto&& event) { | for_each_entry([&](auto&& event) { | ||||
| if (!m_host_tid_table.count(current->tid)) { | if (!m_host_tid_table.count(current->tid)) { | ||||
| m_host_tid_table[current->tid] = { | |||||
| m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
| m_host_tid_table[current->tid] = next_tid(); | |||||
| } | } | ||||
| }); | }); | ||||
| @@ -340,14 +364,39 @@ public: | |||||
| } else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | } else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | ||||
| auto& tensor = m_tensors[event.tensor_id]; | auto& tensor = m_tensors[event.tensor_id]; | ||||
| if (!m_device_tid_table.count(event.device)) { | if (!m_device_tid_table.count(event.device)) { | ||||
| m_device_tid_table[event.device] = { | |||||
| m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
| m_device_tid_table[event.device] = next_tid(); | |||||
| } | } | ||||
| tensor.device = event.device; | tensor.device = event.device; | ||||
| tensor.layout = event.layout; | tensor.layout = event.layout; | ||||
| } | } | ||||
| }); | }); | ||||
| for_each_entry([&](auto&& event) { | |||||
| using T = std::decay_t<decltype(event)>; | |||||
| if constexpr (std::is_same_v<T, CUPTIIdentifyStreamEvent>) { | |||||
| if (!m_cupti_tid_table.count(event.stream)) { | |||||
| m_cupti_tid_table[event.stream] = | |||||
| m_device_tid_table.at(event.device); | |||||
| } | |||||
| } | |||||
| }); | |||||
| // record cupti streams | |||||
| for_each_entry([&](auto&& event) { | |||||
| using T = std::decay_t<decltype(event)>; | |||||
| if constexpr ( | |||||
| std::is_same_v<T, CUPTIKernelExecuteEvent> || | |||||
| std::is_same_v<T, CUPTIMemcpyEvent> || | |||||
| std::is_same_v<T, CUPTIMemsetEvent>) { | |||||
| if (!m_cupti_tid_table.count(event.stream)) { | |||||
| m_cupti_tid_table[event.stream] = next_tid(); | |||||
| } | |||||
| } else if constexpr (std::is_same_v<T, CUPTITimestampEvent>) { | |||||
| mgb_assert(!m_cupti_timestamp.has_value()); | |||||
| m_cupti_timestamp.emplace(current->time, event.timestamp); | |||||
| } | |||||
| }); | |||||
| // replay execution | // replay execution | ||||
| using namespace std::placeholders; | using namespace std::placeholders; | ||||
| for_each_entry([&](auto&& event) { | for_each_entry([&](auto&& event) { | ||||
| @@ -0,0 +1,25 @@ | |||||
| #include "megbrain/imperative/utils/platform.h" | |||||
| #ifdef __GNUG__ | |||||
| #include <cxxabi.h> | |||||
| #include <cstdlib> | |||||
| #include <memory> | |||||
| #endif | |||||
| using namespace mgb; | |||||
| using namespace imperative; | |||||
| /* | |||||
| * demangle typeid, see | |||||
| * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname | |||||
| */ | |||||
| std::string mgb::imperative::demangle(std::string mangled) { | |||||
| #ifdef __GNUG__ | |||||
| int status = -1; | |||||
| std::unique_ptr<char, void (*)(void*)> res{ | |||||
| abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status), std::free}; | |||||
| return (status == 0) ? res.get() : mangled; | |||||
| #else | |||||
| return mangled; | |||||
| #endif | |||||
| } | |||||
| @@ -0,0 +1,86 @@ | |||||
| #pragma once | |||||
| #include <chrono> | |||||
| #include <ctime> | |||||
| #include "megbrain/common.h" | |||||
| #include "megbrain/imperative/utils/to_string.h" | |||||
namespace mgb::imperative::cupti {

/*!
 * \brief Clock type for CUPTI time points; one tick is one nanosecond.
 *
 * Only the declaration of now() lives here, its definition is elsewhere.
 */
struct clock {
    using duration = std::chrono::nanoseconds;
    using rep = duration::rep;
    using period = duration::period;
    using time_point = std::chrono::time_point<clock>;
    // constexpr (implicitly inline since C++17) so that odr-using the flag does
    // not need a separate out-of-class definition
    static constexpr bool is_steady = false;
    static time_point now() /* noexcept */;
};

using time_point = clock::time_point;
using duration = clock::duration;

//! identifies a device by its id
struct device_t {
    uint32_t device_id;
    bool operator==(const device_t& rhs) const { return device_id == rhs.device_id; }
};

//! identifies a context: the owning device plus a context id
struct context_t : device_t {
    uint32_t context_id;
    bool operator==(const context_t& rhs) const {
        return device_t::operator==(rhs) && context_id == rhs.context_id;
    }
};

//! identifies a stream: the owning context (and device) plus a stream id
struct stream_t : context_t {
    uint32_t stream_id;
    bool operator==(const stream_t& rhs) const {
        return context_t::operator==(rhs) && stream_id == rhs.stream_id;
    }
};

bool available();

void enable();

void disable();

void flush();

bool enabled();

/*!
 * \brief Non-owning typed view over an activity record.
 *
 * \tparam TActivity record type; the accessors below read its `start`, `end`,
 *      `deviceId`, `contextId` and `streamId` members, so only call an accessor
 *      whose members the record type actually has.
 */
template <typename TActivity>
struct activity {
private:
    TActivity* m_ptr;

public:
    //! wrap an untyped record pointer; the record must outlive this view
    activity(void* ptr) : m_ptr(static_cast<TActivity*>(ptr)) {}

    //! `start` field interpreted as a tick count of cupti::duration
    time_point start() const { return time_point(duration(m_ptr->start)); }

    //! `end` field interpreted as a tick count of cupti::duration
    time_point end() const { return time_point(duration(m_ptr->end)); }

    device_t device() const { return {m_ptr->deviceId}; }

    context_t context() const { return {device(), m_ptr->contextId}; }

    stream_t stream() const { return {context(), m_ptr->streamId}; }

    //! direct access to the underlying record
    TActivity* operator->() const { return m_ptr; }
};

}  // namespace mgb::imperative::cupti
| template <> | |||||
| class std::hash<mgb::imperative::cupti::stream_t> { | |||||
| public: | |||||
| size_t operator()(const mgb::imperative::cupti::stream_t& value) const { | |||||
| return value.stream_id; | |||||
| } | |||||
| }; | |||||
| @@ -194,16 +194,9 @@ public: | |||||
| static bool is_profiling() { return sm_profiling; } | static bool is_profiling() { return sm_profiling; } | ||||
| static void start_profile() { | |||||
| mgb_assert(!sm_profiling); | |||||
| sm_start_at = Timer::record_host(); | |||||
| sm_profiling = true; | |||||
| } | |||||
| static void start_profile(); | |||||
| static void stop_profile() { | |||||
| mgb_assert(sm_profiling); | |||||
| sm_profiling = false; | |||||
| } | |||||
| static void stop_profile(); | |||||
| static thread_dict_t get_thread_dict(); | static thread_dict_t get_thread_dict(); | ||||
| @@ -0,0 +1,9 @@ | |||||
| #pragma once | |||||
| #include <string> | |||||
| namespace mgb::imperative { | |||||
| std::string demangle(std::string mangled); | |||||
| } | |||||
| @@ -37,6 +37,10 @@ if(MGE_WITH_CUDA) | |||||
| list(APPEND LINK_LIBS cudart) | list(APPEND LINK_LIBS cudart) | ||||
| endif() | endif() | ||||
| if(MGE_WITH_CUPTI) | |||||
| list(APPEND LINK_LIBS libcupti) | |||||
| endif() | |||||
| if(MGE_WITH_DISTRIBUTED) | if(MGE_WITH_DISTRIBUTED) | ||||
| list(APPEND LINK_LIBS megray) | list(APPEND LINK_LIBS megray) | ||||
| endif() | endif() | ||||
| @@ -61,11 +61,11 @@ echo "Build with ${SDK_NAME}" | |||||
| if [ $SDK_NAME == "cu101" ];then | if [ $SDK_NAME == "cu101" ];then | ||||
| CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | ||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||||
| BUILD_GCC8="ON" | |||||
| REQUIR_CUDA_VERSION="10010" | |||||
| REQUIR_CUDNN_VERSION="7.6.3" | |||||
| REQUIR_TENSORRT_VERSION="6.0.1.5" | |||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||||
| BUILD_GCC8="ON" | |||||
| REQUIR_CUDA_VERSION="10010" | |||||
| REQUIR_CUDNN_VERSION="7.6.3" | |||||
| REQUIR_TENSORRT_VERSION="6.0.1.5" | |||||
| REQUIR_CUBLAS_VERSION="10.2.1.243" | REQUIR_CUBLAS_VERSION="10.2.1.243" | ||||
| elif [ $SDK_NAME == "cu102_JetsonNano" ];then | elif [ $SDK_NAME == "cu102_JetsonNano" ];then | ||||
| @@ -87,6 +87,12 @@ elif [ $SDK_NAME == "cu102_JetsonNano" ];then | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ||||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | ${CUDNN_LIB_DIR}/libcudnn.so.8" | ||||
| if [ ${machine} == "aarch64" ];then | |||||
| CUDA_COPY_LIB_LIST="\ | |||||
| ${CUDA_LIB_DIR}/libcupti.so.10.2:\ | |||||
| ${CUDA_COPY_LIB_LIST}" | |||||
| fi | |||||
| EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" " | EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" " | ||||
| elif [ $SDK_NAME == "cu111" ];then | elif [ $SDK_NAME == "cu111" ];then | ||||
| @@ -118,6 +124,12 @@ elif [ $SDK_NAME == "cu111" ];then | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ||||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | ${CUDNN_LIB_DIR}/libcudnn.so.8" | ||||
| if [ ${machine} == "aarch64" ];then | |||||
| CUDA_COPY_LIB_LIST="\ | |||||
| ${CUDA_LIB_DIR}/libcupti.so.11.1:\ | |||||
| ${CUDA_COPY_LIB_LIST}" | |||||
| fi | |||||
| if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then | if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then | ||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" " | EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" " | ||||
| else | else | ||||
| @@ -152,9 +164,9 @@ elif [ $SDK_NAME == "cu112" ];then | |||||
| -gencode arch=compute_86,code=sm_86 \ | -gencode arch=compute_86,code=sm_86 \ | ||||
| -gencode arch=compute_86,code=compute_86\" " | -gencode arch=compute_86,code=compute_86\" " | ||||
| REQUIR_CUDA_VERSION="11020" | |||||
| REQUIR_CUDNN_VERSION="8.0.4" | |||||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
| REQUIR_CUDA_VERSION="11020" | |||||
| REQUIR_CUDNN_VERSION="8.0.4" | |||||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
| REQUIR_CUBLAS_VERSION="11.3.1.68" | REQUIR_CUBLAS_VERSION="11.3.1.68" | ||||
| elif [ $SDK_NAME == "cpu" ];then | elif [ $SDK_NAME == "cpu" ];then | ||||
| @@ -35,6 +35,7 @@ | |||||
| #cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION | #cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION | ||||
| #cmakedefine01 MGB_IS_DEV | #cmakedefine01 MGB_IS_DEV | ||||
| #cmakedefine01 MGB_CUSTOM_OP | #cmakedefine01 MGB_CUSTOM_OP | ||||
| #cmakedefine01 MGB_CUPTI | |||||
| // DNN related flags | // DNN related flags | ||||
| // Platform macro's | // Platform macro's | ||||
| #cmakedefine01 MEGDNN_WITH_CUDA | #cmakedefine01 MEGDNN_WITH_CUDA | ||||